From d96ffc18320a58d164c1809871bb3aea7f549ac6 Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Fri, 1 May 2026 10:52:06 -0700 Subject: [PATCH 01/11] gh-83938, gh-122476: Stop incorrectly RFC 2047 encoding non-ASCII email addresses (#122540) The email generators had been incorrectly flattening non-ASCII email addresses to RFC 2047 encoded-word format, leaving them undeliverable. (RFC 2047 prohibits use of encoded-word in an addr-spec.) This change raises a HeaderWriteError when attempting to flatten an EmailMessage with a non-ASCII addr-spec and a policy with utf8=False. (Exception: If the non-ASCII address originated from parsing a message, it will be flattened as originally parsed, without error.) This also applies to other contexts in which RFC2047 words are not allowed by the RFCs. Non-ASCII email addresses are supported when using a policy with utf8=True (such as email.policy.SMTPUTF8) under RFCs 6531 and 6532. Non-ASCII email address domains (but not localparts) can also be used with non-SMTPUTF8 policies by encoding the domain as an IDNA A-label. (The email package does not perform this encoding, because it cannot know whether the caller wants IDNA 2003, IDNA 2008, or some other variant such as UTS #46.) Co-authored-by: R. 
David Murray --- Doc/library/email.policy.rst | 19 +++- Doc/whatsnew/3.15.rst | 10 ++ Lib/email/_header_value_parser.py | 91 +++++++++++++---- .../test_email/test__header_value_parser.py | 8 +- Lib/test/test_email/test_generator.py | 99 ++++++++++++++++++- ...4-07-31-17-22-10.gh-issue-83938.TtUa-c.rst | 8 ++ ...-07-31-17-23-06.gh-issue-122476.TtUa-c.rst | 7 ++ 7 files changed, 216 insertions(+), 26 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst create mode 100644 Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index 8f6e4218c97b38..816d02d86f4fc4 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -403,11 +403,26 @@ added matters. To illustrate:: .. attribute:: utf8 If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in - headers by encoding them as "encoded words". If ``True``, follow - :rfc:`6532` and use ``utf-8`` encoding for headers. Messages + headers by encoding them as :rfc:`2047` "encoded words". If ``True``, + follow :rfc:`6532` and use ``utf-8`` encoding for headers. Messages formatted in this way may be passed to SMTP servers that support the ``SMTPUTF8`` extension (:rfc:`6531`). + When ``False``, the generator will raise + :exc:`~email.errors.HeaderWriteError` if any header includes non-ASCII + characters in a context where :rfc:`2047` does not permit encoded words. + This particularly applies to mailboxes ("addr-spec") with non-ASCII + characters, which can be created via + :class:`~email.headerregistry.Address`. To use a mailbox with a non-ASCII + domain name with ``utf8=False``, first encode the domain using the + third-party :pypi:`idna` or :pypi:`uts46` module or with + :mod:`encodings.idna`. It is not possible to use a non-ASCII username + ("local-part") in a mailbox when ``utf8=False``. + + .. 
versionchanged:: 3.15 + Can trigger the raising of :exc:`~email.errors.HeaderWriteError`. + (Earlier versions incorrectly applied :rfc:`2047` in certain contexts, + most notably in addr-specs.) .. attribute:: refold_source diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index a687ee5115be05..782b2fe002442c 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -914,6 +914,16 @@ faulthandler (Contributed by Eric Froemling in :gh:`149085`.) +email +----- + +* Email generators now raise an error when an :class:`.EmailMessage` cannot be + accurately flattened due to a non-ASCII email address (mailbox) in an address + header. Options for supporting Email Address Internationalization (EAI) are + discussed in :attr:`.EmailPolicy.utf8`. + (Contributed by R David Murray and Mike Edmunds in :gh:`122540`.) + + functools --------- diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index a53903a197f39e..26b6e26ae652fa 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -157,10 +157,7 @@ def all_defects(self): def startswith_fws(self): return self[0].startswith_fws() - @property - def as_ew_allowed(self): - """True if all top level tokens of this part may be RFC2047 encoded.""" - return all(part.as_ew_allowed for part in self) + as_ew_allowed = True @property def comments(self): @@ -429,6 +426,7 @@ def addr_spec(self): class AngleAddr(TokenList): token_type = 'angle-addr' + as_ew_allowed = False @property def local_part(self): @@ -847,26 +845,22 @@ def params(self): class ContentType(ParameterizedHeaderValue): token_type = 'content-type' - as_ew_allowed = False maintype = 'text' subtype = 'plain' class ContentDisposition(ParameterizedHeaderValue): token_type = 'content-disposition' - as_ew_allowed = False content_disposition = None class ContentTransferEncoding(TokenList): token_type = 'content-transfer-encoding' - as_ew_allowed = False cte = '7bit' class HeaderLabel(TokenList):
token_type = 'header-label' - as_ew_allowed = False class MsgID(TokenList): @@ -2835,13 +2829,68 @@ def _steal_trailing_WSP_if_exists(lines): def _refold_parse_tree(parse_tree, *, policy): - """Return string of contents of parse_tree folded according to RFC rules. - - """ # max_line_length 0/None means no limit, ie: infinitely long. maxlen = policy.max_line_length or sys.maxsize encoding = 'utf-8' if policy.utf8 else 'us-ascii' lines = [''] # Folded lines to be output + if parse_tree.as_ew_allowed: + _refold_with_ew(parse_tree, lines, maxlen, encoding, policy=policy) + else: + _refold_without_ew(parse_tree, lines, maxlen, encoding, policy=policy) + return policy.linesep.join(lines) + policy.linesep + +def _refold_without_ew(parse_tree, lines, maxlen, encoding, *, policy): + parts = list(parse_tree) + while parts: + part = parts.pop(0) + tstr = str(part) + try: + tstr.encode(encoding) + except UnicodeEncodeError: + if any(isinstance(x, errors.UndecodableBytesDefect) + for x in part.all_defects): + # There is garbage data from parsing a message in binary mode, + # just pass it through. Not good, but the best we can do. + pass + elif policy.utf8: + # If this happens, it's a programmer error. + raise + else: + raise errors.HeaderWriteError( + f"Non-ASCII {part.token_type} '{part}' is invalid" + " under current policy setting (utf8=False)" + ) + if len(tstr) <= maxlen - len(lines[-1]): + lines[-1] += tstr + continue + # This part is too long to fit. The RFC wants us to break at + # "major syntactic breaks", so unless we don't consider this + # to be one, check if it will fit on the next line by itself. + if (part.syntactic_break and + len(tstr) + 1 <= maxlen): + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + continue + if not hasattr(part, 'encode'): + # It's not a terminal, try folding the subparts. 
+ newparts = list(part) + parts = newparts + parts + continue + # We can't figure out how to wrap it, so give up. + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + else: + # We can't fold it onto the next line either... + lines[-1] += tstr + return + + +def _refold_with_ew(parse_tree, lines, maxlen, encoding, *, policy): + """Fold parse_tree into lines (in place), using RFC 2047 encoded words as needed. + + """ last_word_is_ew = False last_ew = None # if there is an encoded word in the last line of lines, # points to the encoded word's first character @@ -2855,6 +2904,11 @@ def _refold_parse_tree(parse_tree, *, policy): if part is end_ew_not_allowed: wrap_as_ew_blocked -= 1 continue + if part.token_type == 'mime-parameters': + # Mime parameter folding (using RFC2231) is extra special. + _fold_mime_parameters(part, lines, maxlen, encoding) + last_word_is_ew = False + continue tstr = str(part) if not want_encoding: if part.token_type in ('ptext', 'vtext'): @@ -2876,14 +2930,11 @@ def _refold_parse_tree(parse_tree, *, policy): charset = 'utf-8' want_encoding = True - if part.token_type == 'mime-parameters': - # Mime parameter folding (using RFC2231) is extra special.
- _fold_mime_parameters(part, lines, maxlen, encoding) - last_word_is_ew = False - continue - if want_encoding and not wrap_as_ew_blocked: - if not part.as_ew_allowed: + if any( + not x.as_ew_allowed for x in part + if hasattr(x, 'as_ew_allowed') + ): want_encoding = False last_ew = None if part.syntactic_break: @@ -2964,6 +3015,8 @@ def _refold_parse_tree(parse_tree, *, policy): [ValueTerminal(make_quoted_pairs(p), 'ptext') for p in newparts] + [ValueTerminal('"', 'ptext')]) + _refold_without_ew(newparts, lines, maxlen, encoding, policy=policy) + continue if part.token_type == 'comment': newparts = ( [ValueTerminal('(', 'ptext')] + @@ -2991,7 +3044,7 @@ def _refold_parse_tree(parse_tree, *, policy): lines[-1] += tstr last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP)) - return policy.linesep.join(lines) + policy.linesep + return def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew): """Fold string to_encode into lines as encoded word, combining if allowed. 
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index f3c03062572ba5..bc698759614c36 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3374,10 +3374,12 @@ def test_fold_unfoldable_element_stealing_whitespace(self): self._test(token, expected, policy=policy) def test_encoded_word_with_undecodable_bytes(self): - self._test(parser.get_address_list( - ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?=' + self._test( + parser.get_address_list( + ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?=' + ' ' )[0], - ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n', + ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?= \n', ) diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index 3c9a86f3e8cf29..8d912738029f78 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -1,4 +1,5 @@ import io +import re import textwrap import unittest import random @@ -295,6 +296,69 @@ def test_keep_long_encoded_newlines(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) + def test_non_ascii_addr_spec_raises(self): + # non-ascii is not permitted in any part of an addr-spec. If the + # programmer generated it, it's an error. (See also + # test_non_ascii_addr_spec_preserved below.) + p = self.policy.clone(utf8=False, max_line_length=20) + g = self.genclass(self.ioclass(), policy=p) + # XXX The particular part detected here isn't part of a behavioral + # spec and may change in the future. 
+ cases = [ + ('wők@example.com', 'wők', 'local-part'), + ('wok@exàmple.com', 'exàmple.com', 'domain'), + ('wők@exàmple.com', 'wők', 'local-part'), + ( + '"Name, for display" <wők@example.com>', + 'wők@example.com', + 'addr-spec', + ), + ( + 'Näyttönimi <wők@example.com>', + 'wők@example.com', + 'addr-spec', + ), + ( + '"a lőng quoted string as the local part"@example.com', + 'a lőng quoted string as the local part', + 'local-part', + ), + + ] + for address, badtoken, partname in cases: + with self.subTest(address=address): + msg = EmailMessage() + msg['To'] = address + expected_error = ( + fr"(?i)(?=.*non-ascii)" + fr"(?=.*{re.escape(badtoken)})" + fr"(?=.*{partname})" + fr"(?=.*policy.*utf8)" + ) + with self.assertRaisesRegex( + email.errors.HeaderWriteError, expected_error + ): + g.flatten(msg) + + def test_local_part_quoted_string_wrapped_correctly(self): + msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: <"a long local part in a quoted string"@example.com> + Subject: test + + None + """)), policy=self.policy.clone(max_line_length=20)) + expected = textwrap.dedent("""\ + To: <"a long local part in a + quoted string"@example.com> + Subject: test + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=30)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + def _test_boundary_detection(self, linesep): # Generate a boundary token in the same way as _make_boundary token = random.randrange(sys.maxsize) @@ -515,12 +579,12 @@ def test_cte_type_7bit_transforms_8bit_cte(self): def test_smtputf8_policy(self): msg = EmailMessage() - msg['From'] = "Páolo " + msg['From'] = "Páolo " msg['To'] = 'Dinsdale' msg['Subject'] = 'Nudge nudge, wink, wink \u1F609' msg.set_content("oh là là, know what I mean, know what I mean?") expected = textwrap.dedent("""\ - From: Páolo + From: Páolo To: Dinsdale Subject: Nudge nudge, wink, wink \u1F609 Content-Type: text/plain; charset="utf-8" @@ -555,6 +619,37 @@ def test_smtp_policy(self): g.flatten(msg)
self.assertEqual(s.getvalue(), expected) + def test_non_ascii_addr_spec_preserved(self): + # A defective non-ASCII addr-spec parsed from the original + # message is left unchanged when flattening. + # (See also test_non_ascii_addr_spec_raises above.) + source = ( + 'To: jörg@example.com, "But a long name still works with refold_source" ' + ).encode() + expected = ( + b'To: j\xc3\xb6rg@example.com,\n' + b' "But a long name still works with refold_source" \n' + b'\n' + ) + msg = message_from_bytes(source, policy=policy.default) + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_idna_encoding_preserved(self): + # Nothing tries to decode a pre-encoded IDNA domain. + msg = EmailMessage() + msg["To"] = Address( + username='jörg', + domain='☕.example'.encode('idna').decode() # IDNA 2003 + ) + expected = 'To: jörg@xn--53h.example\n\n'.encode() + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default.clone(utf8=True)) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + if __name__ == '__main__': unittest.main() diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst new file mode 100644 index 00000000000000..7082c72f685b05 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst @@ -0,0 +1,8 @@ +The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for +a mailbox with non-ASCII characters in its domain. Under a policy with +:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize +such a message will now raise an :exc:`~email.errors.HeaderWriteError`. 
+Either apply an appropriate IDNA encoding to convert the domain to ASCII before +serialization, or use :data:`email.policy.SMTPUTF8` (or another policy with +``utf8=True``) to correctly pass through the internationalized domain name +as Unicode characters. diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst new file mode 100644 index 00000000000000..29c076d3a746c6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst @@ -0,0 +1,7 @@ +The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for +a mailbox with non-ASCII characters in its local-part. Under a policy with +:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize +such a message will now raise an :exc:`~email.errors.HeaderWriteError`. +There is no valid 7-bit encoding for an internationalized local-part. Use +:data:`email.policy.SMTPUTF8` (or another policy with ``utf8=True``) to +correctly pass through the local-part as Unicode characters. From b60557382828a094f9da525cbd71c154bb97378b Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Fri, 1 May 2026 11:00:46 -0700 Subject: [PATCH 02/11] gh-81074: Allow non-ASCII addr_spec in email.headerregistry.Address (#122477) The email.headerregistry.Address constructor raised an error if addr_spec contained a non-ASCII character. (But it fully supports non-ASCII in the separate username and domain args.) This change removes the error for a non-ASCII addr_spec, as well as the Defect that triggered it. In the unicode era non-ascii is not a defect, though it is an error when an attempt is made to serialize it to ascii. The serialization issue was handled in #122540. 
--- Lib/email/_header_value_parser.py | 5 ----- Lib/email/errors.py | 6 ++--- .../test_email/test__header_value_parser.py | 11 ---------- Lib/test/test_email/test_headerregistry.py | 22 +++++++++---------- ...4-07-30-19-19-33.gh-issue-81074.YAeWNf.rst | 8 +++++++ 5 files changed, 21 insertions(+), 31 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-07-30-19-19-33.gh-issue-81074.YAeWNf.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 26b6e26ae652fa..9873958f5c2790 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1503,11 +1503,6 @@ def get_local_part(value): local_part.defects.append(errors.ObsoleteHeaderDefect( "local-part is not a dot-atom (contains CFWS)")) local_part[0] = obs_local_part - try: - local_part.value.encode('ascii') - except UnicodeEncodeError: - local_part.defects.append(errors.NonASCIILocalPartDefect( - "local-part contains non-ASCII characters)")) return local_part, value def get_obs_local_part(value): diff --git a/Lib/email/errors.py b/Lib/email/errors.py index 6bc744bd59c5bb..859307dd85be11 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -109,9 +109,9 @@ class ObsoleteHeaderDefect(HeaderDefect): """Header uses syntax declared obsolete by RFC 5322""" class NonASCIILocalPartDefect(HeaderDefect): - """local_part contains non-ASCII characters""" - # This defect only occurs during unicode parsing, not when - # parsing messages decoded from binary. + """Unused. Note: this error is deprecated and may be removed in the future.""" + # RFC 6532 permits a non-ASCII local-part. _header_value_parser previously + # treated this as a parse-time defect (when parsing Unicode, but not bytes). 
class InvalidDateDefect(HeaderDefect): """Header has unparsable or invalid date""" diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index bc698759614c36..aded44e85ee336 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1235,17 +1235,6 @@ def test_get_local_part_valid_and_invalid_qp_in_atom_list(self): '@example.com') self.assertEqual(local_part.local_part, r'\example\\ example') - def test_get_local_part_unicode_defect(self): - # Currently this only happens when parsing unicode, not when parsing - # stuff that was originally binary. - local_part = self._test_get_x(parser.get_local_part, - 'exámple@example.com', - 'exámple', - 'exámple', - [errors.NonASCIILocalPartDefect], - '@example.com') - self.assertEqual(local_part.local_part, 'exámple') - # get_dtext def test_get_dtext_only(self): diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index 2aaa7d68ca3fe1..aa918255d15c37 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -1543,17 +1543,19 @@ def test_quoting(self): self.assertEqual(str(a), '"Sara J." <"bad name"@example.com>') def test_il8n(self): - a = Address('Éric', 'wok', 'exàmple.com') + a = Address('Éric', 'wők', 'exàmple.com') self.assertEqual(a.display_name, 'Éric') - self.assertEqual(a.username, 'wok') + self.assertEqual(a.username, 'wők') self.assertEqual(a.domain, 'exàmple.com') - self.assertEqual(a.addr_spec, 'wok@exàmple.com') - self.assertEqual(str(a), 'Éric ') + self.assertEqual(a.addr_spec, 'wők@exàmple.com') + self.assertEqual(str(a), 'Éric ') - # XXX: there is an API design issue that needs to be solved here. 
- #def test_non_ascii_username_raises(self): - # with self.assertRaises(ValueError): - # Address('foo', 'wők', 'example.com') + def test_i18n_in_addr_spec(self): + a = Address(addr_spec='wők@exàmple.com') + self.assertEqual(a.username, 'wők') + self.assertEqual(a.domain, 'exàmple.com') + self.assertEqual(a.addr_spec, 'wők@exàmple.com') + self.assertEqual(str(a), 'wők@exàmple.com') def test_crlf_in_constructor_args_raises(self): cases = ( @@ -1574,10 +1576,6 @@ def test_crlf_in_constructor_args_raises(self): with self.subTest(kwargs=kwargs), self.assertRaisesRegex(ValueError, "invalid arguments"): Address(**kwargs) - def test_non_ascii_username_in_addr_spec_raises(self): - with self.assertRaises(ValueError): - Address('foo', addr_spec='wők@example.com') - def test_address_addr_spec_and_username_raises(self): with self.assertRaises(TypeError): Address('foo', username='bing', addr_spec='bar@baz') diff --git a/Misc/NEWS.d/next/Library/2024-07-30-19-19-33.gh-issue-81074.YAeWNf.rst b/Misc/NEWS.d/next/Library/2024-07-30-19-19-33.gh-issue-81074.YAeWNf.rst new file mode 100644 index 00000000000000..87de4fade14dfb --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-30-19-19-33.gh-issue-81074.YAeWNf.rst @@ -0,0 +1,8 @@ +The :mod:`email` module no longer treats email addresses with non-ASCII +characters as defects when parsing a Unicode string or in the ``addr_spec`` +parameter to :class:`email.headerregistry.Address`. :rfc:`5322` permits such +addresses, and they were already supported when parsing bytes and in the Address +``username`` parameter. + +The (undocumented) :exc:`!email.errors.NonASCIILocalPartDefect` is no longer +used and should be considered deprecated. 
From 60b751c0181ec4f646666fb02bf4b44e3969e44c Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com> Date: Fri, 1 May 2026 21:39:36 +0300 Subject: [PATCH 03/11] Build mpdecimal from source to workaround unreliable launchpad.net (#149248) --- .github/workflows/posix-deps-apt.sh | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/workflows/posix-deps-apt.sh b/.github/workflows/posix-deps-apt.sh index 7994a01ee4624e..6201e719ca87de 100755 --- a/.github/workflows/posix-deps-apt.sh +++ b/.github/workflows/posix-deps-apt.sh @@ -26,9 +26,16 @@ apt-get -yq --no-install-recommends install \ xvfb \ zlib1g-dev -# Workaround missing libmpdec-dev on ubuntu 24.04: -# https://launchpad.net/~ondrej/+archive/ubuntu/php -# https://deb.sury.org/ -sudo add-apt-repository ppa:ondrej/php -apt-get update -apt-get -yq --no-install-recommends install libmpdec-dev +# Workaround missing libmpdec-dev on ubuntu 24.04 by building mpdecimal +# from source. ppa:ondrej/php (launchpad.net) are unreliable +# (https://status.canonical.com) so fetch the tarball directly +# from the upstream host. 
+# https://www.bytereef.org/mpdecimal/ +MPDECIMAL_VERSION=4.0.1 +curl -fsSL "https://www.bytereef.org/software/mpdecimal/releases/mpdecimal-${MPDECIMAL_VERSION}.tar.gz" \ + | tar -xz -C /tmp +(cd "/tmp/mpdecimal-${MPDECIMAL_VERSION}" \ + && ./configure --prefix=/usr/local \ + && make -j"$(nproc)" \ + && make install) +ldconfig From fa542449bb01e536e7db702d26918b36e3aebe69 Mon Sep 17 00:00:00 2001 From: Ken Jin Date: Fri, 1 May 2026 19:55:38 +0100 Subject: [PATCH 04/11] gh-139038: Update final JIT figures for 3.15rc1 (#149210) Co-authored-by: Stan Ulbrych --- Doc/whatsnew/3.15.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 782b2fe002442c..83d3cb82195caa 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1589,11 +1589,11 @@ Upgraded JIT compiler Results from the `pyperformance `__ benchmark suite report -`6-7% `__ +`8-9% `__ geometric mean performance improvement for the JIT over the standard CPython interpreter built with all optimizations enabled on x86-64 Linux. On AArch64 macOS, the JIT has a -`12-13% `__ +`12-13% `__ speedup over the :ref:`tail calling interpreter ` with all optimizations enabled. The speedups for JIT builds versus no JIT builds range from roughly 15% slowdown to over From 4e3811f05300eb436be3d670c9c62631e85844ea Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com> Date: Fri, 1 May 2026 21:57:09 +0300 Subject: [PATCH 05/11] gh-148726: Document the GC change in What's New in Python 3.14 (#149209) --- Doc/whatsnew/3.14.rst | 45 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index dfdfe66be7e6cc..0bb8858aea16fe 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -953,10 +953,24 @@ when a module is imported) will still emit the syntax warning. (Contributed by Irit Katriel in :gh:`130080`.) +.. 
_incremental-garbage-collection: .. _whatsnew314-incremental-gc: -Incremental garbage collection ------------------------------- +Garbage collection +------------------ + +**From Python 3.14.5 onwards:** + +The garbage collector (GC) has changed in Python 3.14.5. + +Python 3.14.0-3.14.4 shipped with a new incremental GC. +However, due to a number of `reports +`__ +of significant memory pressure in production environments, +it has been reverted back to the generational GC from 3.13. +This is the GC now used in Python 3.14.5 and later. + +**Previously in Python 3.14.0-3.14.4:** The cycle garbage collector is now incremental. This means that maximum pause times are reduced @@ -2203,7 +2217,18 @@ difflib gc -- -* The new :ref:`incremental garbage collector ` +* **From Python 3.14.5 onwards:** + + Python 3.14.0-3.14.4 shipped with a new incremental garbage collector. + However, due to a number of `reports + `__ + of significant memory pressure in production environments, + it has been reverted back to the generational GC from 3.13. + This is the GC now used in Python 3.14.5 and later. + +* **Previously in Python 3.14.0-3.14.4:** + + The new :ref:`incremental garbage collector ` means that maximum pause times are reduced by an order of magnitude or more for larger heaps. @@ -3447,3 +3472,17 @@ Changes in the C API functions on Python 3.13 and older. .. _pythoncapi-compat project: https://github.com/python/pythoncapi-compat/ + + +Notable changes in 3.14.5 +========================= + +gc +-- + +* The incremental garbage collector shipped in Python 3.14.0-3.14.4 has been + reverted back to the generational garbage collector from 3.13, + due to a number of `reports + `__ + of significant memory pressure in production environments. + See :ref:`whatsnew314-incremental-gc` for details. 
From 6040d65843198ded82c479eff790d66910cd8435 Mon Sep 17 00:00:00 2001 From: Sam Bull Date: Fri, 1 May 2026 20:32:50 +0100 Subject: [PATCH 06/11] Clarify `max_length` in zstd & zlib decompressor documentation (#143805) Also provide examples of how to decompress data using max_length for zstd and zlib. Co-authored-by: Emma Smith --- Doc/library/compression.zstd.rst | 10 +++++++--- Doc/library/zlib.rst | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Doc/library/compression.zstd.rst b/Doc/library/compression.zstd.rst index 7ca843f27f5e9a..6d99e36e1e5bb6 100644 --- a/Doc/library/compression.zstd.rst +++ b/Doc/library/compression.zstd.rst @@ -331,10 +331,14 @@ Compressing and decompressing data in memory If *max_length* is non-negative, the method returns at most *max_length* bytes of decompressed data. If this limit is reached and further - output can be produced, the :attr:`~.needs_input` attribute will - be set to ``False``. In this case, the next call to + output can be produced (or EOF is reached), the :attr:`~.needs_input` + attribute will be set to ``False``. In this case, the next call to :meth:`~.decompress` may provide *data* as ``b''`` to obtain - more of the output. + more of the output. The full content can thus be read like:: + + process_output(d.decompress(data, max_length)) + while not d.eof and not d.needs_input: + process_output(d.decompress(b"", max_length)) If all of the input data was decompressed and returned (either because this was less than *max_length* bytes, or because diff --git a/Doc/library/zlib.rst b/Doc/library/zlib.rst index ce0a22b9456d0b..f043915c0f4b94 100644 --- a/Doc/library/zlib.rst +++ b/Doc/library/zlib.rst @@ -308,6 +308,11 @@ Decompression objects support the following methods and attributes: :attr:`unconsumed_tail`. This bytestring must be passed to a subsequent call to :meth:`decompress` if decompression is to continue. 
If *max_length* is zero then the whole input is decompressed, and :attr:`unconsumed_tail` is empty. + For example, the full content could be read like:: + + process_output(d.decompress(data, max_length)) + while chunk := d.decompress(d.unconsumed_tail, max_length): + process_output(chunk) .. versionchanged:: 3.6 *max_length* can be used as a keyword argument. From 8c611e12bc735f863d58cf416f293ddd9811021d Mon Sep 17 00:00:00 2001 From: sobolevn Date: Fri, 1 May 2026 22:37:14 +0300 Subject: [PATCH 07/11] Fix source link in `Doc/howto/descriptor.rst` (#149215) --- Doc/howto/descriptor.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/howto/descriptor.rst b/Doc/howto/descriptor.rst index 9d5a9ac8b718cb..07a405837d9229 100644 --- a/Doc/howto/descriptor.rst +++ b/Doc/howto/descriptor.rst @@ -1640,7 +1640,7 @@ by member descriptors: class Member: def __init__(self, name, clsname, offset): - 'Emulate PyMemberDef in Include/structmember.h' + 'Emulate PyMemberDef in Include/descrobject.h' # Also see descr_new() in Objects/descrobject.c self.name = name self.clsname = clsname From 690e0de70671a07eba705156508a49c609f44bb1 Mon Sep 17 00:00:00 2001 From: sobolevn Date: Fri, 1 May 2026 22:53:28 +0300 Subject: [PATCH 08/11] gh-149083: Change several other docs examples to use `sentinel()` (#149213) --- Doc/faq/programming.rst | 2 +- Doc/howto/descriptor.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/faq/programming.rst b/Doc/faq/programming.rst index ff34bb5d71c22b..591565cbc01357 100644 --- a/Doc/faq/programming.rst +++ b/Doc/faq/programming.rst @@ -1924,7 +1924,7 @@ correctly using identity tests: .. 
code-block:: python - _sentinel = object() + _sentinel = sentinel('_sentinel') def pop(self, key, default=_sentinel): if key in self: diff --git a/Doc/howto/descriptor.rst b/Doc/howto/descriptor.rst index 07a405837d9229..a7a68281860cb5 100644 --- a/Doc/howto/descriptor.rst +++ b/Doc/howto/descriptor.rst @@ -594,7 +594,7 @@ a pure Python equivalent: def object_getattribute(obj, name): "Emulate PyObject_GenericGetAttr() in Objects/object.c" - null = object() + null = sentinel('null') objtype = type(obj) cls_var = find_name_in_mro(objtype, name, null) descr_get = getattr(type(cls_var), '__get__', null) @@ -1635,7 +1635,7 @@ by member descriptors: .. testcode:: - null = object() + null = sentinel('null') class Member: From 323677325735373a06506e5156b7d8e0e96c9660 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Fri, 1 May 2026 21:16:11 +0100 Subject: [PATCH 09/11] =?UTF-8?q?gh-149202:=20Implement=20PEP=20831=20?= =?UTF-8?q?=E2=80=93=20Frame=20Pointers=20Everywhere:=20Enabling=20System-?= =?UTF-8?q?Level=20Observability=20for=20Python=20(#149201)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Savannah Ostrowski Co-authored-by: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com> Co-authored-by: Emma Smith --- Doc/howto/perf_profiling.rst | 5 +- Doc/using/configure.rst | 18 +++ Doc/whatsnew/3.15.rst | 11 ++ Lib/test/test_frame_pointer_unwind.py | 5 +- ...-04-05-16-10-00.gh-issue-149202.W8sQeR.rst | 4 + configure | 116 ++++++++++++++++-- configure.ac | 28 ++++- 7 files changed, 170 insertions(+), 17 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-04-05-16-10-00.gh-issue-149202.W8sQeR.rst diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst index fc4772bbccab57..653f28ddbabfa4 100644 --- a/Doc/howto/perf_profiling.rst +++ b/Doc/howto/perf_profiling.rst @@ -217,8 +217,9 @@ Example, using the :mod:`sys` APIs in file :file:`example.py`: How to obtain the 
best results ------------------------------ -For best results, Python should be compiled with -``CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"`` as this allows +For best results, keep frame pointers enabled. On supported GCC-compatible +toolchains, CPython builds itself with ``-fno-omit-frame-pointer`` and, when +available, ``-mno-omit-leaf-frame-pointer`` by default. These flags allow profilers to unwind using only the frame pointer and not on DWARF debug information. This is because as the code that is interposed to allow ``perf`` support is dynamically generated it doesn't have any DWARF debugging information diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index d5c17560b6658a..086f6bfa22ad4a 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -780,6 +780,24 @@ also be used to improve performance. .. versionadded:: 3.14 +.. option:: --without-frame-pointers + + Disable frame pointers, which are enabled by default (see :pep:`831`). + + By default, the build appends ``-fno-omit-frame-pointer`` (and + ``-mno-omit-leaf-frame-pointer`` when the compiler supports it) to + ``BASECFLAGS`` so profilers, debuggers, and system tracing tools + (``perf``, ``eBPF``, ``dtrace``, ``gdb``) can walk the C call stack + without DWARF metadata. The flags propagate to third-party C + extensions through :mod:`sysconfig`. On compilers that do not + understand them, the build silently skips them. + + Downstream packagers and authors of native libraries built with + custom build systems should set the same flags so the unwind chain + stays unbroken across all native frames. + + .. versionadded:: 3.15 + .. 
option:: --without-mimalloc Disable the fast :ref:`mimalloc ` allocator diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 83d3cb82195caa..b075441fdeaa3a 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -86,6 +86,7 @@ Summary -- Release highlights * :pep:`782`: :ref:`A new PyBytesWriter C API to create a Python bytes object ` * :pep:`803`: :ref:`Stable ABI for Free-Threaded Builds ` +* :pep:`831`: :ref:`Frame pointers everywhere ` * :ref:`The JIT compiler has been significantly upgraded ` * :ref:`Improved error messages ` * :ref:`The official Windows 64-bit binaries now use the tail-calling interpreter @@ -2262,6 +2263,16 @@ Build changes and :option:`-X dev <-X>` is passed to the Python or Python is built in :ref:`debug mode `. (Contributed by Donghee Na in :gh:`141770`.) +.. _whatsnew315-frame-pointers: + +* CPython is now built with frame pointers enabled by default + (:pep:`831`). Pass :option:`--without-frame-pointers` to opt out. + Authors of C extensions and native libraries built with custom build + systems should add ``-fno-omit-frame-pointer`` and + ``-mno-omit-leaf-frame-pointer`` to their own ``CFLAGS`` to keep the + unwind chain intact. + (Contributed by Pablo Galindo Salgado and Savannah Ostrowski in :gh:`149201`.) + .. _whatsnew315-windows-tail-calling-interpreter: * 64-bit builds using Visual Studio 2026 (MSVC 18) may now use the new diff --git a/Lib/test/test_frame_pointer_unwind.py b/Lib/test/test_frame_pointer_unwind.py index c70ec281686715..2f9ce2bf049f58 100644 --- a/Lib/test/test_frame_pointer_unwind.py +++ b/Lib/test/test_frame_pointer_unwind.py @@ -27,9 +27,8 @@ def _frame_pointers_expected(machine): ) if "no-omit-frame-pointer" in cflags: - # For example, configure adds -fno-omit-frame-pointer if Python - # has perf trampoline (PY_HAVE_PERF_TRAMPOLINE) and Python is built - # in debug mode. + # For example, configure adds -fno-omit-frame-pointer by default on + # supported GCC-compatible builds. 
return True if "omit-frame-pointer" in cflags: return False diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-05-16-10-00.gh-issue-149202.W8sQeR.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-05-16-10-00.gh-issue-149202.W8sQeR.rst new file mode 100644 index 00000000000000..f82ca91f5ba000 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-05-16-10-00.gh-issue-149202.W8sQeR.rst @@ -0,0 +1,4 @@ +Enable frame pointers by default for GCC-compatible CPython builds, including +``-mno-omit-leaf-frame-pointer`` when the compiler supports it, so profilers +and debuggers can unwind native interpreter frames more reliably. Users can pass +``--without-frame-pointers`` to opt out. diff --git a/configure b/configure index 6cd7a1900463ee..734aa3a6a721d1 100755 --- a/configure +++ b/configure @@ -1115,6 +1115,7 @@ enable_bolt with_strict_overflow enable_safety enable_slower_safety +with_frame_pointers enable_experimental_jit with_dsymutil with_address_sanitizer @@ -1912,6 +1913,8 @@ Optional Packages: is no) --with-strict-overflow if 'yes', add -fstrict-overflow to CFLAGS, else add -fno-strict-overflow (default is no) + --without-frame-pointers + build without frame pointers (default is no) --with-dsymutil link debug information into final executable with dsymutil in macOS (default is no) --with-address-sanitizer @@ -10241,9 +10244,115 @@ fi fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether to build with frame pointers" >&5 +printf %s "checking whether to build with frame pointers... " >&6; } + +# Check whether --with-frame-pointers was given. 
+if test ${with_frame_pointers+y} +then : + withval=$with_frame_pointers; +else case e in #( + e) with_frame_pointers=yes ;; +esac +fi + +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $with_frame_pointers" >&5 +printf "%s\n" "$with_frame_pointers" >&6; } + if test "x$ac_cv_gcc_compat" = xyes then : + frame_pointer_cflags= + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -fno-omit-frame-pointer" >&5 +printf %s "checking whether C compiler accepts -fno-omit-frame-pointer... " >&6; } +if test ${ax_cv_check_cflags__Werror__fno_omit_frame_pointer+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -Werror -fno-omit-frame-pointer" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + ax_cv_check_cflags__Werror__fno_omit_frame_pointer=yes +else case e in #( + e) ax_cv_check_cflags__Werror__fno_omit_frame_pointer=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ax_check_save_flags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags__Werror__fno_omit_frame_pointer" >&5 +printf "%s\n" "$ax_cv_check_cflags__Werror__fno_omit_frame_pointer" >&6; } +if test "x$ax_cv_check_cflags__Werror__fno_omit_frame_pointer" = xyes +then : + + frame_pointer_cflags="-fno-omit-frame-pointer" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mno-omit-leaf-frame-pointer" >&5 +printf %s "checking whether C compiler accepts -mno-omit-leaf-frame-pointer... " >&6; } +if test ${ax_cv_check_cflags__Werror__mno_omit_leaf_frame_pointer+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -Werror -mno-omit-leaf-frame-pointer" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + ax_cv_check_cflags__Werror__mno_omit_leaf_frame_pointer=yes +else case e in #( + e) ax_cv_check_cflags__Werror__mno_omit_leaf_frame_pointer=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ax_check_save_flags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags__Werror__mno_omit_leaf_frame_pointer" >&5 +printf "%s\n" "$ax_cv_check_cflags__Werror__mno_omit_leaf_frame_pointer" >&6; } +if test "x$ax_cv_check_cflags__Werror__mno_omit_leaf_frame_pointer" = xyes +then : + + frame_pointer_cflags="$frame_pointer_cflags -mno-omit-leaf-frame-pointer" + +else case e in #( + e) : ;; +esac +fi + + +else case e in #( + e) : ;; +esac +fi + + if test -n "$frame_pointer_cflags" && test "x$with_frame_pointers" != xno; then + BASECFLAGS="$frame_pointer_cflags $BASECFLAGS" + fi + CFLAGS_NODIST="$CFLAGS_NODIST -std=c11" @@ -14124,13 +14233,6 @@ printf "%s\n" "#define PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h PERF_TRAMPOLINE_OBJ=Python/asm_trampoline.o - if test "x$Py_DEBUG" = xtrue -then : - - as_fn_append BASECFLAGS " -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" - -fi - fi diff --git a/configure.ac b/configure.ac index 60511db39fad1e..c8cb1686d55c07 100644 --- a/configure.ac +++ b/configure.ac @@ -2529,7 +2529,30 @@ then AX_CHECK_COMPILE_FLAG([-D_FORTIFY_SOURCE=3], [CFLAGS_NODIST="$CFLAGS_NODIST -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3"], [AC_MSG_WARN([-D_FORTIFY_SOURCE=3 not supported])], [-Werror]) fi +AC_MSG_CHECKING([whether to build with frame pointers]) +AC_ARG_WITH([frame-pointers], + [AS_HELP_STRING([--without-frame-pointers], + [build without frame pointers (default is no)])], + [], + [with_frame_pointers=yes]) +AC_MSG_RESULT([$with_frame_pointers]) + AS_VAR_IF([ac_cv_gcc_compat], [yes], [ + dnl Keep frame pointers in CPython, stdlib objects, and third-party + dnl extensions built against 
this Python (BASECFLAGS propagates via + dnl sysconfig) so native profilers can unwind interpreter frames and + dnl generated trampolines without DWARF. + frame_pointer_cflags= + AX_CHECK_COMPILE_FLAG([-fno-omit-frame-pointer], [ + frame_pointer_cflags="-fno-omit-frame-pointer" + AX_CHECK_COMPILE_FLAG([-mno-omit-leaf-frame-pointer], [ + frame_pointer_cflags="$frame_pointer_cflags -mno-omit-leaf-frame-pointer" + ], [], [-Werror]) + ], [], [-Werror]) + if test -n "$frame_pointer_cflags" && test "x$with_frame_pointers" != xno; then + BASECFLAGS="$frame_pointer_cflags $BASECFLAGS" + fi + CFLAGS_NODIST="$CFLAGS_NODIST -std=c11" PY_CHECK_CC_WARNING([enable], [extra], [if we can add -Wextra]) @@ -3788,11 +3811,6 @@ AC_MSG_RESULT([$perf_trampoline]) AS_VAR_IF([perf_trampoline], [yes], [ AC_DEFINE([PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.]) PERF_TRAMPOLINE_OBJ=Python/asm_trampoline.o - - dnl perf needs frame pointers for unwinding, include compiler option in debug builds - AS_VAR_IF([Py_DEBUG], [true], [ - AS_VAR_APPEND([BASECFLAGS], [" -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"]) - ]) ]) AC_SUBST([PERF_TRAMPOLINE_OBJ]) From 91e871a3077ca44cd65e966a4fead5410074506b Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Fri, 1 May 2026 16:31:00 -0500 Subject: [PATCH 10/11] gh-124397: Add free-threading support for iterators. 
(gh-148894) --- Doc/library/threading.rst | 156 ++++++++++++ Doc/whatsnew/3.15.rst | 10 + Lib/test/test_threading.py | 225 ++++++++++++++++++ Lib/threading.py | 143 +++++++++++ ...-04-22-20-49-49.gh-issue-124397.plMglV.rst | 3 + 5 files changed, 537 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-04-22-20-49-49.gh-issue-124397.plMglV.rst diff --git a/Doc/library/threading.rst b/Doc/library/threading.rst index 19cc4f191dff8d..fbe3951e034d07 100644 --- a/Doc/library/threading.rst +++ b/Doc/library/threading.rst @@ -1436,3 +1436,159 @@ is equivalent to:: Currently, :class:`Lock`, :class:`RLock`, :class:`Condition`, :class:`Semaphore`, and :class:`BoundedSemaphore` objects may be used as :keyword:`with` statement context managers. + + +Iterator synchronization +------------------------ + +By default, Python iterators do not support concurrent access. Most iterators make +no guarantees when accessed simultaneously from multiple threads. Generator +iterators, for example, raise :exc:`ValueError` if one of their iterator methods +is called while the generator is already executing. The tools in this section +allow reliable concurrency support to be added to ordinary iterators and +iterator-producing callables. + +The :class:`serialize_iterator` wrapper lets multiple threads share a single iterator and +take turns consuming from it. While one thread is running ``__next__()``, the +others block until the iterator becomes available. Each value produced by the +underlying iterator is delivered to exactly one caller. + +The :func:`concurrent_tee` function lets multiple threads each receive the full +stream of values from one underlying iterator. It creates independent iterators +that all draw from the same source. Values are buffered until consumed by all +of the derived iterators. + +.. class:: serialize_iterator(iterable) + + Return an iterator wrapper that serializes concurrent calls to + :meth:`~iterator.__next__` using a lock. 
+ + If the wrapped iterator also defines :meth:`~generator.send`, + :meth:`~generator.throw`, or :meth:`~generator.close`, those calls + are serialized as well. + + This makes it possible to share a single iterator, including a generator + iterator, between multiple threads. A lock ensures that calls are handled + one at a time. No values are duplicated or skipped by the wrapper itself. + Each item from the underlying iterator is given to exactly one caller. + + This wrapper does not copy or buffer values. Threads that call + :func:`next` while another thread is already advancing the iterator will + block until the active call completes. + + Example: + + .. code-block:: python + + import threading + + def squares(n): + for x in range(n): + yield x * x + + def consume(name, iterable): + for item in iterable: + print(name, item) + + source = threading.serialize_iterator(squares(5)) + + t1 = threading.Thread(target=consume, args=("left", source)) + t2 = threading.Thread(target=consume, args=("right", source)) + t1.start() + t2.start() + t1.join() + t2.join() + + In this example, each number is printed exactly once, but the work is shared + between the two threads. + + .. versionadded:: next + + +.. function:: synchronized_iterator(func) + + Wrap an iterator-producing callable so that each iterator it returns is + automatically passed through :class:`serialize_iterator`. + + This is especially useful as a :term:`decorator` for generator functions, + allowing their generator-iterators to be consumed from multiple threads. + + Example: + + .. 
code-block:: python + + import threading + + @threading.synchronized_iterator + def squares(n): + for x in range(n): + yield x * x + + def consume(name, iterable): + for item in iterable: + print(name, item) + + source = squares(5) + + t1 = threading.Thread(target=consume, args=("left", source)) + t2 = threading.Thread(target=consume, args=("right", source)) + t1.start() + t2.start() + t1.join() + t2.join() + + The returned wrapper preserves the metadata of *func*, such as its name and + wrapped function reference. + + .. versionadded:: next + + +.. function:: concurrent_tee(iterable, n=2) + + Return *n* independent iterators from a single input *iterable*, with + guaranteed behavior when the derived iterators are consumed concurrently. + + This function is similar to :func:`itertools.tee`, but is intended for cases + where the source iterator may feed consumers running in different threads. + Each returned iterator yields every value from the underlying iterable, in + the same order. + + Internally, values are buffered until every derived iterator has consumed + them. + + The returned iterators share the same underlying synchronization lock. Each + individual derived iterator is intended to be consumed by one thread at a + time. If a single derived iterator must itself be shared by multiple + threads, wrap it with :class:`serialize_iterator`. + + If *n* is ``0``, return an empty tuple. If *n* is negative, raise + :exc:`ValueError`. + + Example: + + .. 
code-block:: python + + import threading + + def squares(n): + for x in range(n): + yield x * x + + def consume(name, iterable): + for item in iterable: + print(name, item) + + source = squares(5) + left, right = threading.concurrent_tee(source) + + t1 = threading.Thread(target=consume, args=("left", left)) + t2 = threading.Thread(target=consume, args=("right", right)) + t1.start() + t2.start() + t1.join() + t2.join() + + In this example, both consumer threads see the full sequence of squares + from a single generator function. + + .. versionadded:: next diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index b075441fdeaa3a..b63e7a4790e9af 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1279,6 +1279,16 @@ tarfile (Contributed by Christoph Walcher in :gh:`57911`.) +threading +--------- + +* Added :class:`~threading.serialize_iterator`, + :func:`~threading.synchronized_iterator`, + and :func:`~threading.concurrent_tee` to support concurrent access to + generators and iterators. + (Contributed by Raymond Hettinger in :gh:`124397`.) 
+ + timeit ------ diff --git a/Lib/test/test_threading.py b/Lib/test/test_threading.py index 0ca91ce0d7899d..3d01804513bde9 100644 --- a/Lib/test/test_threading.py +++ b/Lib/test/test_threading.py @@ -2368,6 +2368,231 @@ class BarrierTests(lock_tests.BarrierTests): barriertype = staticmethod(threading.Barrier) +## Test Synchronization tools for iterators ################ + +class ThreadingIteratorToolsTests(BaseTestCase): + def test_serialize_serializes_concurrent_iteration(self): + limit = 10_000 + workers_count = 10 + result = 0 + result_lock = threading.Lock() + start = threading.Event() + + def producer(limit): + for x in range(limit): + yield x + + def consumer(iterator): + nonlocal result + start.wait() + total = 0 + for x in iterator: + total += x + with result_lock: + result += total + + iterator = threading.serialize_iterator(producer(limit)) + workers = [ + threading.Thread(target=consumer, args=(iterator,)) + for _ in range(workers_count) + ] + with threading_helper.wait_threads_exit(): + for worker in workers: + worker.start() + for worker in workers: + # Wait for the worker thread to actually start. 
+ while worker.ident is None: + time.sleep(0.1) + start.set() + for worker in workers: + worker.join() + + self.assertEqual(result, limit * (limit - 1) // 2) + + def test_serialize_generator_methods(self): + # A generator that yields and receives + def echo(): + try: + while True: + val = yield "ready" + yield f"received {val}" + except ValueError: + yield "caught" + + it = threading.serialize_iterator(echo()) + + # Test __next__ + self.assertEqual(next(it), "ready") + + # Test send() + self.assertEqual(it.send("hello"), "received hello") + self.assertEqual(next(it), "ready") + + # Test throw() + self.assertEqual(it.throw(ValueError), "caught") + + # Test close() + it.close() + with self.assertRaises(StopIteration): + next(it) + + def test_serialize_methods_attribute_error(self): + # A standard iterator that does not have send/throw/close + # should raise AttributeError when called. + standard_it = threading.serialize_iterator([1, 2, 3]) + + with self.assertRaises(AttributeError): + standard_it.send("foo") + + with self.assertRaises(AttributeError): + standard_it.throw(ValueError) + + with self.assertRaises(AttributeError): + standard_it.close() + + def test_serialize_generator_methods_locking(self): + # Verifies that generator methods also acquire the lock. + # We can test this by checking if the lock is held during the call. 
+ + class LockCheckingGenerator: + def __init__(self, lock): + self.lock = lock + def __iter__(self): + return self + def send(self, value): + if not self.lock.locked(): + raise RuntimeError("Lock not held during send()") + return value + def throw(self, *args): + if not self.lock.locked(): + raise RuntimeError("Lock not held during throw()") + def close(self): + if not self.lock.locked(): + raise RuntimeError("Lock not held during close()") + + # Manually create the serialize object to inspect the lock + it = threading.serialize_iterator([]) + mock_gen = LockCheckingGenerator(it._lock) + it._iterator = mock_gen + + # These should not raise RuntimeError + it.send(1) + it.throw(ValueError) + it.close() + + def test_serialize_next_exception(self): + # Verify exception pass through for calls to next() + + def f(): + raise RuntimeError + yield None + + g = threading.serialize_iterator(f()) + with self.assertRaises(RuntimeError): + next(g) + + def test_synchronized_serializes_generator_instances(self): + unique = 10 + repetitions = 5 + limit = 100 + start = threading.Event() + + @threading.synchronized_iterator + def atomic_counter(): + # The sleep widens the race window that would exist without + # synchronization between yielding a value and advancing state. 
+ i = 0 + while True: + yield i + time.sleep(0.0005) + i += 1 + + def consumer(counter): + start.wait() + for _ in range(limit): + next(counter) + + unique_counters = [atomic_counter() for _ in range(unique)] + counters = unique_counters * repetitions + workers = [ + threading.Thread(target=consumer, args=(counter,)) + for counter in counters + ] + with threading_helper.wait_threads_exit(): + for worker in workers: + worker.start() + start.set() + for worker in workers: + worker.join() + + self.assertEqual( + {next(counter) for counter in unique_counters}, + {limit * repetitions}, + ) + + def test_synchronized_preserves_wrapped_metadata(self): + def gen(): + yield 1 + + wrapped = threading.synchronized_iterator(gen) + + self.assertEqual(wrapped.__name__, gen.__name__) + self.assertIs(wrapped.__wrapped__, gen) + self.assertEqual(list(wrapped()), [1]) + + def test_concurrent_tee_supports_concurrent_consumers(self): + limit = 5_000 + num_threads = 25 + successes = 0 + failures = [] + result_lock = threading.Lock() + start = threading.Event() + expected = list(range(limit)) + + def producer(limit): + for x in range(limit): + yield x + + def consumer(iterator): + nonlocal successes + start.wait() + items = list(iterator) + with result_lock: + if items == expected: + successes += 1 + else: + failures.append(items[:20]) + + tees = threading.concurrent_tee(producer(limit), n=num_threads) + workers = [ + threading.Thread(target=consumer, args=(iterator,)) + for iterator in tees + ] + with threading_helper.wait_threads_exit(): + for worker in workers: + worker.start() + start.set() + for worker in workers: + worker.join() + + self.assertEqual(failures, []) + self.assertEqual(successes, len(tees)) + + # Verify that locks are shared + self.assertEqual(len({id(t_obj.lock) for t_obj in tees}), 1) + + def test_concurrent_tee_zero_iterators(self): + self.assertEqual(threading.concurrent_tee(range(10), n=0), ()) + + def test_concurrent_tee_negative_n(self): + with 
self.assertRaises(ValueError): + threading.concurrent_tee(range(10), n=-1) + + +################# + + + class MiscTestCase(unittest.TestCase): def test__all__(self): restore_default_excepthook(self) diff --git a/Lib/threading.py b/Lib/threading.py index 4ebceae7029870..abac31e25886fa 100644 --- a/Lib/threading.py +++ b/Lib/threading.py @@ -29,6 +29,7 @@ 'Barrier', 'BrokenBarrierError', 'Timer', 'ThreadError', 'setprofile', 'settrace', 'local', 'stack_size', 'excepthook', 'ExceptHookArgs', 'gettrace', 'getprofile', + 'serialize_iterator', 'synchronized_iterator', 'concurrent_tee', 'setprofile_all_threads','settrace_all_threads'] # Rename some stuff so "from threading import *" is safe @@ -842,6 +843,148 @@ class BrokenBarrierError(RuntimeError): pass +## Synchronization tools for iterators ##################### + +class serialize_iterator: + """Wrap a non-concurrent iterator with a lock to enforce sequential access. + + Applies a non-reentrant lock around calls to __next__. If the + wrapped iterator also defines send(), throw(), or close(), those + calls are serialized as well. + + Allows iterator and generator instances to be shared by multiple consumer + threads. + + For example, itertools.count does not make thread-safe instances, + but that is easily fixed with: + + atomic_counter = serialize_iterator(itertools.count()) + + """ + + __slots__ = ('_iterator', '_lock') + + def __init__(self, iterable): + self._iterator = iter(iterable) + self._lock = Lock() + + def __iter__(self): + return self + + def __next__(self): + with self._lock: + return next(self._iterator) + + def send(self, value, /): + """Send a value to a generator. + + Raises AttributeError if not a generator. + """ + with self._lock: + return self._iterator.send(value) + + def throw(self, *args): + """Call throw() on a generator. + + Raises AttributeError if not a generator. + """ + with self._lock: + return self._iterator.throw(*args) + + def close(self): + """Call close() on a generator. 
+ + Raises AttributeError if not a generator. + """ + with self._lock: + return self._iterator.close() + + +def synchronized_iterator(func): + """Wrap an iterator-returning callable to make its iterators thread-safe. + + Existing itertools and more-itertools can be wrapped so that their + iterator instances are serialized. + + For example, itertools.count does not make thread-safe instances, + but that is easily fixed with: + + atomic_counter = synchronized_iterator(itertools.count) + + Can also be used as a decorator for generator function definitions + so that the generator instances are serialized:: + + import time + + @synchronized_iterator + def enumerate_and_timestamp(iterable): + for count, value in enumerate(iterable): + yield count, time.time_ns(), value + + """ + + from functools import wraps + + @wraps(func) + def inner(*args, **kwargs): + iterator = func(*args, **kwargs) + return serialize_iterator(iterator) + + return inner + + +def concurrent_tee(iterable, n=2): + """Variant of itertools.tee() but with guaranteed threading semantics. + + Takes a non-threadsafe iterator as an input and creates concurrent + tee objects for other threads to have reliable independent copies of + the data stream. + + The new iterators are only thread-safe if consumed within a single thread. + To share just one of the new iterators across multiple threads, wrap it + with threading.serialize_iterator(). 
+ """ + + if n < 0: + raise ValueError("n must be a non-negative integer") + if n == 0: + return () + iterator = _concurrent_tee(iterable) + result = [iterator] + for _ in range(n - 1): + result.append(_concurrent_tee(iterator)) + return tuple(result) + + +class _concurrent_tee: + __slots__ = ('iterator', 'link', 'lock') + + def __init__(self, iterable): + if isinstance(iterable, _concurrent_tee): + self.iterator = iterable.iterator + self.link = iterable.link + self.lock = iterable.lock + else: + self.iterator = iter(iterable) + self.link = [None, None] + self.lock = Lock() + + def __iter__(self): + return self + + def __next__(self): + link = self.link + if link[1] is None: + with self.lock: + if link[1] is None: + link[0] = next(self.iterator) + link[1] = [None, None] + value, self.link = link + return value + +############################################################ + + # Helper to generate new thread names _counter = _count(1).__next__ def _newname(name_template): diff --git a/Misc/NEWS.d/next/Library/2026-04-22-20-49-49.gh-issue-124397.plMglV.rst b/Misc/NEWS.d/next/Library/2026-04-22-20-49-49.gh-issue-124397.plMglV.rst new file mode 100644 index 00000000000000..431448a484b45f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-22-20-49-49.gh-issue-124397.plMglV.rst @@ -0,0 +1,3 @@ +The threading module added tooling to support concurrent iterator access: +:class:`threading.serialize_iterator`, :func:`threading.synchronized_iterator`, +and :func:`threading.concurrent_tee`. From bb911a2319365a4155e7398b4b7978589d8bed49 Mon Sep 17 00:00:00 2001 From: Amp Tell Date: Sat, 2 May 2026 00:39:58 +0200 Subject: [PATCH 11/11] gh-75707: tarfile: Add optional open() argument "mtime" (GH-138117) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes it possible to set the gzip header mtime field without overriding time.time(), making it useful when creating reproducible archives. * 📜🤖 Added by blurb_it. 
--------- Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> Co-authored-by: Ethan Furman --- Doc/library/tarfile.rst | 4 +++ Lib/tarfile.py | 18 ++++++++----- Lib/test/test_tarfile.py | 27 +++++++++++++++++++ ...5-08-24-15-09-30.gh-issue-75707.GOWZrC.rst | 1 + 4 files changed, 43 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-08-24-15-09-30.gh-issue-75707.GOWZrC.rst diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index a86469bb9ad704..6f1e01cf5aa6ee 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -142,6 +142,10 @@ Some facts and figures: a Zstandard dictionary used to improve compression of smaller amounts of data. + For modes ``'w:gz'`` and ``'w|gz'``, :func:`tarfile.open` accepts the + keyword argument *mtime* to create a gzip archive header with that mtime. By + default, the mtime is set to the time of creation of the archive. + For special purposes, there is a second format for *mode*: ``'filemode|[compression]'``. :func:`tarfile.open` will return a :class:`TarFile` object that processes its data as a stream of blocks. No random seeking will diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 7f0b0b3c632573..4f47aaab9028d0 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -337,7 +337,7 @@ class _Stream: """ def __init__(self, name, mode, comptype, fileobj, bufsize, - compresslevel, preset): + compresslevel, preset, mtime): """Construct a _Stream object. """ self._extfileobj = True @@ -372,7 +372,7 @@ def __init__(self, name, mode, comptype, fileobj, bufsize, self.exception = zlib.error self._init_read_gz() else: - self._init_write_gz(compresslevel) + self._init_write_gz(compresslevel, mtime) elif comptype == "bz2": try: @@ -421,7 +421,7 @@ def __del__(self): if hasattr(self, "closed") and not self.closed: self.close() - def _init_write_gz(self, compresslevel): + def _init_write_gz(self, compresslevel, mtime): """Initialize for writing with gzip compression. 
""" self.cmp = self.zlib.compressobj(compresslevel, @@ -429,7 +429,9 @@ def _init_write_gz(self, compresslevel): -self.zlib.MAX_WBITS, self.zlib.DEF_MEM_LEVEL, 0) - timestamp = struct.pack("