From 0abca3e10d13eb925e1a2f92627cf7f15ac67c01 Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Wed, 31 Jul 2024 17:12:14 -0700 Subject: [PATCH 01/10] Stop incorrectly RFC 2047 encoding non-ASCII email addresses Email generators had been incorrectly flattening non-ASCII email addresses to RFC 2047 encoded-word format, leaving them undeliverable. (RFC 2047 prohibits use of encoded-word in an addr-spec.) This change raises a ValueError when attempting to flatten an EmailMessage with a non-ASCII addr-spec and a policy with utf8=False. (Exception: If the non-ASCII address originated from parsing a message, it will be flattened as originally parsed, without error.) Non-ASCII email addresses are supported when using a policy with utf8=True (such as email.policy.SMTPUTF8) under RFCs 6531 and 6532. Non-ASCII email address domains (but not localparts) can also be used with non-SMTPUTF8 policies by encoding the domain as an IDNA A-label. (The email package does not perform this encoding, because it cannot know whether the caller wants IDNA 2003, IDNA 2008, or some other variant such as UTS #46.) --- Doc/library/email.policy.rst | 10 ++++- Lib/email/_header_value_parser.py | 11 +++++ Lib/test/test_email/test_generator.py | 58 ++++++++++++++++++++++++++- 3 files changed, 75 insertions(+), 4 deletions(-) diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index 314767d0802a08..d5be4d6625176a 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -406,11 +406,17 @@ added matters. To illustrate:: .. attribute:: utf8 If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in - headers by encoding them as "encoded words". If ``True``, follow - :rfc:`6532` and use ``utf-8`` encoding for headers. Messages + headers by encoding them as :rfc:`2047` "encoded words". If ``True``, + follow :rfc:`6532` and use ``utf-8`` encoding for headers. 
Messages formatted in this way may be passed to SMTP servers that support the ``SMTPUTF8`` extension (:rfc:`6531`). + .. versionchanged:: 3.13 + If ``False``, the generator will raise a ``ValueError`` if any email + address contains non-ASCII characters. To send to a non-ASCII domain + with ``utf8=False``, encode the domain using the third-party + :pypi:`idna` module or :mod:`encodings.idna`. No RFC allows a non-ASCII + username ("localpart") in an email address with ``utf8=False``. .. attribute:: refold_source diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ec2215a5e5f33c..ff75b9acd81fd8 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2829,6 +2829,17 @@ def _refold_parse_tree(parse_tree, *, policy): _fold_mime_parameters(part, lines, maxlen, encoding) continue + if want_encoding and part.token_type == 'addr-spec': + # RFC2047 forbids encoded-word in any part of an addr-spec. + if charset == 'unknown-8bit': + # Non-ASCII addr-spec came from parsed message; leave unchanged. + want_encoding = False + else: + raise ValueError( + "Non-ASCII address requires policy with utf8=True:" + " '{}'".format(part) + ) + if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index c75a842c33578e..f6621e7bd96078 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -1,4 +1,5 @@ import io +import re import textwrap import unittest from email import message_from_string, message_from_bytes @@ -288,6 +289,28 @@ def test_keep_long_encoded_newlines(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) + def test_non_ascii_addr_spec_raises(self): + # RFC2047 encoded-word is not permitted in any part of an addr-spec. + # (See also test_non_ascii_addr_spec_preserved below.) 
+ g = self.genclass(self.ioclass(), policy=self.policy.clone(utf8=False)) + cases = [ + 'wők@example.com', + 'wok@exàmple.com', + 'wők@exàmple.com', + '"Name, for display" ', + 'Näyttönimi ', + ] + for address in cases: + with self.subTest(address=address): + msg = EmailMessage() + msg['To'] = address + expected_error = re.escape( + "Non-ASCII address requires policy with utf8=True:" + " '{}'".format(msg['To'].addresses[0].addr_spec) + ) + with self.assertRaisesRegex(ValueError, expected_error): + g.flatten(msg) + class TestGenerator(TestGeneratorBase, TestEmailBase): @@ -432,12 +455,12 @@ def test_cte_type_7bit_transforms_8bit_cte(self): def test_smtputf8_policy(self): msg = EmailMessage() - msg['From'] = "Páolo " + msg['From'] = "Páolo " msg['To'] = 'Dinsdale' msg['Subject'] = 'Nudge nudge, wink, wink \u1F609' msg.set_content("oh là là, know what I mean, know what I mean?") expected = textwrap.dedent("""\ - From: Páolo + From: Páolo To: Dinsdale Subject: Nudge nudge, wink, wink \u1F609 Content-Type: text/plain; charset="utf-8" @@ -472,6 +495,37 @@ def test_smtp_policy(self): g.flatten(msg) self.assertEqual(s.getvalue(), expected) + def test_non_ascii_addr_spec_preserved(self): + # A defective non-ASCII addr-spec parsed from the original + # message is left unchanged when flattening. + # (See also test_non_ascii_addr_spec_raises above.) + source = ( + 'To: jörg@example.com, "But a long name still works with refold_source" ' + ).encode() + expected = ( + b'To: j\xc3\xb6rg@example.com,\n' + b' "But a long name still works with refold_source" \n' + b'\n' + ) + msg = message_from_bytes(source, policy=policy.default) + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_idna_encoding_preserved(self): + # Nothing tries to decode a pre-encoded IDNA domain. 
+ msg = EmailMessage() + msg["To"] = Address( + username='jörg', + domain='☕.example'.encode('idna').decode() # IDNA 2003 + ) + expected = 'To: jörg@xn--53h.example\n\n'.encode() + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default.clone(utf8=True)) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + if __name__ == '__main__': unittest.main() From faa40063315616479fdcdc6b095160719477d687 Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Wed, 31 Jul 2024 17:23:42 -0700 Subject: [PATCH 02/10] Blurbs --- .../next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst | 3 +++ .../Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst create mode 100644 Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst new file mode 100644 index 00000000000000..673bdd2309dcc6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst @@ -0,0 +1,3 @@ +Stop incorrectly using RFC 2047 "encoded words" for email addresses with +non-ASCII characters when email.generator is called using a policy with +``utf8=False``. diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst new file mode 100644 index 00000000000000..673bdd2309dcc6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst @@ -0,0 +1,3 @@ +Stop incorrectly using RFC 2047 "encoded words" for email addresses with +non-ASCII characters when email.generator is called using a policy with +``utf8=False``. 
From bd6845dc6341704b0ea861f67170c620e47fa56a Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Tue, 1 Apr 2025 13:02:16 -0700 Subject: [PATCH 03/10] fixup! Stop incorrectly RFC 2047 encoding non-ASCII email addresses - Incorporate PR review feedback - Improve docs --- Doc/library/email.errors.rst | 9 +++++++++ Doc/library/email.policy.rst | 17 +++++++++++------ Lib/email/_header_value_parser.py | 2 +- Lib/email/errors.py | 4 ++++ Lib/test/test_email/test_generator.py | 10 ++++++---- ...024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst | 8 +++++--- ...24-07-31-17-23-06.gh-issue-122476.TtUa-c.rst | 8 +++++--- 7 files changed, 41 insertions(+), 17 deletions(-) diff --git a/Doc/library/email.errors.rst b/Doc/library/email.errors.rst index 689e7397cbcf1f..d9254039d882a1 100644 --- a/Doc/library/email.errors.rst +++ b/Doc/library/email.errors.rst @@ -59,6 +59,15 @@ The following exception classes are defined in the :mod:`email.errors` module: headers. +.. exception:: InvalidMailboxError() + + Raised when serializing a message with an address header that contains + a mailbox incompatible with the policy in use. + (See :attr:`email.policy.EmailPolicy.utf8`.) + + .. versionadded:: 3.14 + + .. exception:: MessageDefect() This is the base class for all defects found when parsing email messages. diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index 3d5f29e21de151..a3e0065cfe469d 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -411,12 +411,17 @@ added matters. To illustrate:: formatted in this way may be passed to SMTP servers that support the ``SMTPUTF8`` extension (:rfc:`6531`). - .. versionchanged:: 3.13 - If ``False``, the generator will raise a ``ValueError`` if any email - address contains non-ASCII characters. To send to a non-ASCII domain - with ``utf8=False``, encode the domain using the third-party - :pypi:`idna` module or :mod:`encodings.idna`. 
No RFC allows a non-ASCII - username ("localpart") in an email address with ``utf8=False``. + When ``False``, the generator will raise an + :exc:`~email.errors.InvalidMailboxError` if any address header includes + a mailbox ("addr-spec") with non-ASCII characters. To use a mailbox with + an internationalized domain name, first encode the domain using the + third-party :pypi:`idna` or :pypi:`uts46` module or with + :mod:`encodings.idna`. It is not possible to use a non-ASCII username + ("local-part") in a mailbox when ``utf8=False``. + + .. versionchanged:: 3.14 + Raises :exc:`~email.errors.InvalidMailboxError`. (Earlier versions + incorrectly applied :rfc:`2047` to non-ASCII addr-specs.) .. attribute:: refold_source diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index c0318f6c988d5a..bff9beb32aab6b 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2843,7 +2843,7 @@ def _refold_parse_tree(parse_tree, *, policy): # Non-ASCII addr-spec came from parsed message; leave unchanged. want_encoding = False else: - raise ValueError( + raise errors.InvalidMailboxError( "Non-ASCII address requires policy with utf8=True:" " '{}'".format(part) ) diff --git a/Lib/email/errors.py b/Lib/email/errors.py index 6bc744bd59c5bb..e5601132d024fe 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -33,6 +33,10 @@ class HeaderWriteError(MessageError): """Error while writing headers.""" +class InvalidMailboxError(MessageError, ValueError): + """A mailbox was not compatible with the policy in use.""" + + # These are parsing defects which the parser was able to work around. 
class MessageDefect(ValueError): """Base class for a message defect.""" diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index f6621e7bd96078..f28cbf1ebdcb3b 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -304,11 +304,13 @@ def test_non_ascii_addr_spec_raises(self): with self.subTest(address=address): msg = EmailMessage() msg['To'] = address - expected_error = re.escape( - "Non-ASCII address requires policy with utf8=True:" - " '{}'".format(msg['To'].addresses[0].addr_spec) + addr_spec = msg['To'].addresses[0].addr_spec + expected_error = ( + fr"(?i)(?=.*non-ascii)(?=.*utf8.*True)(?=.*{re.escape(addr_spec)})" ) - with self.assertRaisesRegex(ValueError, expected_error): + with self.assertRaisesRegex( + email.errors.InvalidMailboxError, expected_error + ): g.flatten(msg) diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst index 673bdd2309dcc6..fb1574fb4ef709 100644 --- a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst @@ -1,3 +1,5 @@ -Stop incorrectly using RFC 2047 "encoded words" for email addresses with -non-ASCII characters when email.generator is called using a policy with -``utf8=False``. +The :mod:`email` module no longer incorrectly encodes non-ASCII characters +in email addresses using :rfc:`2047` encoding. Under a policy with ``utf8=True`` +this means the addresses will be correctly passed through. Under a policy with +``utf8=False``, attempting to serialize a message with non-ASCII email addresses +will now result in an :exc:`~email.errors.InvalidMailboxError`. 
diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst index 673bdd2309dcc6..fb1574fb4ef709 100644 --- a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst @@ -1,3 +1,5 @@ -Stop incorrectly using RFC 2047 "encoded words" for email addresses with -non-ASCII characters when email.generator is called using a policy with -``utf8=False``. +The :mod:`email` module no longer incorrectly encodes non-ASCII characters +in email addresses using :rfc:`2047` encoding. Under a policy with ``utf8=True`` +this means the addresses will be correctly passed through. Under a policy with +``utf8=False``, attempting to serialize a message with non-ASCII email addresses +will now result in an :exc:`~email.errors.InvalidMailboxError`. From 43eaea1fb8aa279c9d1056048e7c2d49b8ed19ad Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Mon, 26 May 2025 16:19:02 -0700 Subject: [PATCH 04/10] fixup! 
Stop incorrectly RFC 2047 encoding non-ASCII email addresses - Incorporate PR feedback - Tailor blurbs to individual issues --- Lib/email/_header_value_parser.py | 4 ++-- Lib/test/test_email/test_generator.py | 2 +- .../2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst | 13 ++++++++----- .../2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst | 12 +++++++----- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index f51b311d2b8f93..eb86b22860b2c2 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2846,8 +2846,8 @@ def _refold_parse_tree(parse_tree, *, policy): want_encoding = False else: raise errors.InvalidMailboxError( - "Non-ASCII address requires policy with utf8=True:" - " '{}'".format(part) + f"Non-ASCII mailbox '{part}' is invalid" + " under current policy setting (utf8=False)" ) if want_encoding and not wrap_as_ew_blocked: diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index f28cbf1ebdcb3b..5c36a5a54323fa 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -306,7 +306,7 @@ def test_non_ascii_addr_spec_raises(self): msg['To'] = address addr_spec = msg['To'].addresses[0].addr_spec expected_error = ( - fr"(?i)(?=.*non-ascii)(?=.*utf8.*True)(?=.*{re.escape(addr_spec)})" + fr"(?i)(?=.*non-ascii)(?=.*{re.escape(addr_spec)})(?=.*policy.*utf8)" ) with self.assertRaisesRegex( email.errors.InvalidMailboxError, expected_error diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst index fb1574fb4ef709..d4b07bf06b9236 100644 --- a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst @@ -1,5 +1,8 @@ -The :mod:`email` module no longer incorrectly encodes non-ASCII characters -in 
email addresses using :rfc:`2047` encoding. Under a policy with ``utf8=True`` -this means the addresses will be correctly passed through. Under a policy with -``utf8=False``, attempting to serialize a message with non-ASCII email addresses -will now result in an :exc:`~email.errors.InvalidMailboxError`. +The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for +a mailbox with non-ASCII characters in its domain. Under a policy with +:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize +such a message will now raise an :exc:`~email.errors.InvalidMailboxError`. +Either apply an appropriate IDNA encoding to convert the domain to ASCII before +serialization, or use :data:`email.policy.SMTPUTF8` (or another policy with +``utf8=True``) to correctly pass through the internationalized domain name +as Unicode characters. diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst index fb1574fb4ef709..6fca53c1b2a409 100644 --- a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst @@ -1,5 +1,7 @@ -The :mod:`email` module no longer incorrectly encodes non-ASCII characters -in email addresses using :rfc:`2047` encoding. Under a policy with ``utf8=True`` -this means the addresses will be correctly passed through. Under a policy with -``utf8=False``, attempting to serialize a message with non-ASCII email addresses -will now result in an :exc:`~email.errors.InvalidMailboxError`. +The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for +a mailbox with non-ASCII characters in its local-part. Under a policy with +:attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize +such a message will now raise an :exc:`~email.errors.InvalidMailboxError`. 
+There is no valid 7-bit encoding for an internationalized local-part. Use +:data:`email.policy.SMTPUTF8` (or another policy with ``utf8=True``) to +correctly pass through the local-part as Unicode characters. From 5aafc33dff828cb8c81fb43946fe7fb4f18474ec Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 22 Apr 2026 15:30:50 -0400 Subject: [PATCH 05/10] Bump versionadded to 3.15 --- Doc/library/email.errors.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/email.errors.rst b/Doc/library/email.errors.rst index d9254039d882a1..0d9270c85165f4 100644 --- a/Doc/library/email.errors.rst +++ b/Doc/library/email.errors.rst @@ -1,4 +1,4 @@ -:mod:`!email.errors`: Exception and Defect classes +/:mod:`!email.errors`: Exception and Defect classes -------------------------------------------------- .. module:: email.errors @@ -65,7 +65,7 @@ The following exception classes are defined in the :mod:`email.errors` module: a mailbox incompatible with the policy in use. (See :attr:`email.policy.EmailPolicy.utf8`.) - .. versionadded:: 3.14 + .. versionadded:: 3.15 .. exception:: MessageDefect() From 3df70b636f501bbb7b413ad6970f8153a7daaaf4 Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Wed, 22 Apr 2026 16:15:56 -0400 Subject: [PATCH 06/10] fix inadvertent typo --- Doc/library/email.errors.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/email.errors.rst b/Doc/library/email.errors.rst index 460a9e58fc1174..c65e7e9f96cc23 100644 --- a/Doc/library/email.errors.rst +++ b/Doc/library/email.errors.rst @@ -1,4 +1,4 @@ -/:mod:`!email.errors`: Exception and Defect classes +:mod:`!email.errors`: Exception and Defect classes -------------------------------------------------- .. module:: email.errors From 8f3f6fd53512c89043ddf6d169ab42ef5670825f Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 26 Apr 2026 13:44:12 -0400 Subject: [PATCH 07/10] Remove incomplete fix and temporarily disable new tests. 
--- Lib/email/_header_value_parser.py | 11 ----------- Lib/test/test_email/test_generator.py | 6 ++++-- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ee70c0b1119259..4c5394ab6353ac 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2885,17 +2885,6 @@ def _refold_parse_tree(parse_tree, *, policy): last_word_is_ew = False continue - if want_encoding and part.token_type == 'addr-spec': - # RFC2047 forbids encoded-word in any part of an addr-spec. - if charset == 'unknown-8bit': - # Non-ASCII addr-spec came from parsed message; leave unchanged. - want_encoding = False - else: - raise errors.InvalidMailboxError( - f"Non-ASCII mailbox '{part}' is invalid" - " under current policy setting (utf8=False)" - ) - if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index bb45f1c9a4e6ef..fb9d9be3e20df0 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -296,7 +296,8 @@ def test_keep_long_encoded_newlines(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) - def test_non_ascii_addr_spec_raises(self): + # XXX renable after fix. + def xest_non_ascii_addr_spec_raises(self): # RFC2047 encoded-word is not permitted in any part of an addr-spec. # (See also test_non_ascii_addr_spec_preserved below.) g = self.genclass(self.ioclass(), policy=self.policy.clone(utf8=False)) @@ -579,7 +580,8 @@ def test_smtp_policy(self): g.flatten(msg) self.assertEqual(s.getvalue(), expected) - def test_non_ascii_addr_spec_preserved(self): + # XXX renable after fix. + def xest_non_ascii_addr_spec_preserved(self): # A defective non-ASCII addr-spec parsed from the original # message is left unchanged when flattening. # (See also test_non_ascii_addr_spec_raises above.) 
From f47e029a134be21e0e576873ad9702974b3dbc2e Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 26 Apr 2026 13:45:00 -0400 Subject: [PATCH 08/10] Move mime-parameter folding to top of loop. The mime parameter folder doesn't make use of the encoding check done by the code that is now below it; it does its own. So it makes more sense to take that branch first. This will simplify subsequent changes. --- Lib/email/_header_value_parser.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 4c5394ab6353ac..2d234cdfffea7f 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2858,6 +2858,11 @@ def _refold_parse_tree(parse_tree, *, policy): if part is end_ew_not_allowed: wrap_as_ew_blocked -= 1 continue + if part.token_type == 'mime-parameters': + # Mime parameter folding (using RFC2231) is extra special. + _fold_mime_parameters(part, lines, maxlen, encoding) + last_word_is_ew = False + continue tstr = str(part) if not want_encoding: if part.token_type in ('ptext', 'vtext'): @@ -2879,12 +2884,6 @@ def _refold_parse_tree(parse_tree, *, policy): charset = 'utf-8' want_encoding = True - if part.token_type == 'mime-parameters': - # Mime parameter folding (using RFC2231) is extra special. - _fold_mime_parameters(part, lines, maxlen, encoding) - last_word_is_ew = False - continue - if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False From 73c7b6b60036ee28a7195c1bcccf07c621b11f9c Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 26 Apr 2026 14:49:51 -0400 Subject: [PATCH 09/10] Fix the bug, update the doc changes. This is a more complete fix, covering any syntax part where encoded words are not permitted, and the doc changes are adjusted accordingly. There is also no need for a new exception, since HeaderWriteError already exists.
The fix itself is to use a separate code loop to fold parts that may not have encoded words, guaranteeing that we do not do incorrect encoding. This opens a door to simplifying the main folding loop, but that is a much bigger refactoring job better left for another time. --- Doc/library/email.errors.rst | 9 --- Doc/library/email.policy.rst | 19 +++-- Lib/email/_header_value_parser.py | 80 ++++++++++++++++--- Lib/email/errors.py | 4 - .../test_email/test__header_value_parser.py | 8 +- Lib/test/test_email/test_generator.py | 42 ++++++---- ...4-07-31-17-22-10.gh-issue-83938.TtUa-c.rst | 2 +- ...-07-31-17-23-06.gh-issue-122476.TtUa-c.rst | 2 +- 8 files changed, 112 insertions(+), 54 deletions(-) diff --git a/Doc/library/email.errors.rst b/Doc/library/email.errors.rst index c65e7e9f96cc23..2f7c9140cfcbe5 100644 --- a/Doc/library/email.errors.rst +++ b/Doc/library/email.errors.rst @@ -59,15 +59,6 @@ The following exception classes are defined in the :mod:`!email.errors` module: headers. -.. exception:: InvalidMailboxError() - - Raised when serializing a message with an address header that contains - a mailbox incompatible with the policy in use. - (See :attr:`email.policy.EmailPolicy.utf8`.) - - .. versionadded:: 3.15 - - .. exception:: MessageDefect() This is the base class for all defects found when parsing email messages. diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index b6ebfbd782c30a..8983b406edecb5 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -409,16 +409,19 @@ added matters. To illustrate:: the ``SMTPUTF8`` extension (:rfc:`6531`). When ``False``, the generator will raise an - :exc:`~email.errors.InvalidMailboxError` if any address header includes - a mailbox ("addr-spec") with non-ASCII characters. To use a mailbox with - an internationalized domain name, first encode the domain using the - third-party :pypi:`idna` or :pypi:`uts46` module or with - :mod:`encodings.idna`. 
It is not possible to use a non-ASCII username - ("local-part") in a mailbox when ``utf8=False``. + :exc:`~email.errors.HeaderWriteError` if any header includes non-ASCII + characters in a context where :rfc:`2047` does not permit encoded words. + This particularly applies to mailboxes ("addr-spec") with non-ASCII + characters, which can be created via :class:`~email.headerregistry.Address`. + To use a mailbox with a non-ASCII domain name with ``utf8=False``, first + encode the domain using the third-party :pypi:`idna` or :pypi:`uts46` + module or with :mod:`encodings.idna`. It is not possible to use a + non-ASCII username ("local-part") in a mailbox when ``utf8=False``. .. versionchanged:: 3.14 - Raises :exc:`~email.errors.InvalidMailboxError`. (Earlier versions - incorrectly applied :rfc:`2047` to non-ASCII addr-specs.) + Can trigger the raising of :exc:`~email.errors.HeaderWriteError`. + (Earlier versions incorrectly applied :rfc:`2047` in certain contexts, + most notably in addr-specs.) ..
attribute:: refold_source diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 2d234cdfffea7f..43216b0af84326 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -157,10 +157,7 @@ def all_defects(self): def startswith_fws(self): return self[0].startswith_fws() - @property - def as_ew_allowed(self): - """True if all top level tokens of this part may be RFC2047 encoded.""" - return all(part.as_ew_allowed for part in self) + as_ew_allowed = True @property def comments(self): @@ -429,6 +426,7 @@ def addr_spec(self): class AngleAddr(TokenList): token_type = 'angle-addr' + as_ew_allowed = False @property def local_part(self): @@ -847,26 +845,22 @@ def params(self): class ContentType(ParameterizedHeaderValue): token_type = 'content-type' - as_ew_allowed = False maintype = 'text' subtype = 'plain' class ContentDisposition(ParameterizedHeaderValue): token_type = 'content-disposition' - as_ew_allowed = False content_disposition = None class ContentTransferEncoding(TokenList): token_type = 'content-transfer-encoding' - as_ew_allowed = False cte = '7bit' class HeaderLabel(TokenList): token_type = 'header-label' - as_ew_allowed = False class MsgID(TokenList): @@ -2838,13 +2832,68 @@ def _steal_trailing_WSP_if_exists(lines): def _refold_parse_tree(parse_tree, *, policy): - """Return string of contents of parse_tree folded according to RFC rules. - - """ # max_line_length 0/None means no limit, ie: infinitely long. 
maxlen = policy.max_line_length or sys.maxsize encoding = 'utf-8' if policy.utf8 else 'us-ascii' lines = [''] # Folded lines to be output + if parse_tree.as_ew_allowed: + _refold_with_ew(parse_tree, lines, maxlen, encoding, policy=policy) + else: + _refold_without_ew(parse_tree, lines, maxlen, encoding, policy=policy) + return policy.linesep.join(lines) + policy.linesep + +def _refold_without_ew(parse_tree, lines, maxlen, encoding, *, policy): + parts = list(parse_tree) + while parts: + part = parts.pop(0) + tstr = str(part) + try: + tstr.encode(encoding) + except UnicodeEncodeError: + if any(isinstance(x, errors.UndecodableBytesDefect) + for x in part.all_defects): + # There is garbage data from parsing a message in binary mode, + # just pass it through. Not good, but the best we can do. + pass + elif policy.utf8: + # If this happens, it's a programmer error. + raise + else: + raise errors.HeaderWriteError( + f"Non-ASCII {part.token_type} '{part}' is invalid" + " under current policy setting (utf8=False)" + ) + if len(tstr) <= maxlen - len(lines[-1]): + lines[-1] += tstr + continue + # This part is too long to fit. The RFC wants us to break at + # "major syntactic breaks", so unless we don't consider this + # to be one, check if it will fit on the next line by itself. + if (part.syntactic_break and + len(tstr) + 1 <= maxlen): + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + continue + if not hasattr(part, 'encode'): + # It's not a terminal, try folding the subparts. + newparts = list(part) + parts = newparts + parts + continue + # We can't figure out how to wrap it, so give up. + newline = _steal_trailing_WSP_if_exists(lines) + if newline or part.startswith_fws(): + lines.append(newline + tstr) + else: + # We can't fold it onto the next line either...
+ lines[-1] += tstr + return + + +def _refold_with_ew(parse_tree, lines, maxlen, encoding, *, policy): + """Return string of contents of parse_tree folded according to RFC rules. + + """ last_word_is_ew = False last_ew = None # if there is an encoded word in the last line of lines, # points to the encoded word's first character @@ -2885,7 +2934,10 @@ def _refold_parse_tree(parse_tree, *, policy): want_encoding = True if want_encoding and not wrap_as_ew_blocked: - if not part.as_ew_allowed: + if any( + not x.as_ew_allowed for x in part + if hasattr(x, 'as_ew_allowed') + ): want_encoding = False last_ew = None if part.syntactic_break: @@ -2966,6 +3018,8 @@ def _refold_parse_tree(parse_tree, *, policy): [ValueTerminal(make_quoted_pairs(p), 'ptext') for p in newparts] + [ValueTerminal('"', 'ptext')]) + _refold_without_ew(newparts, lines, maxlen, encoding, policy=policy) + continue if part.token_type == 'comment': newparts = ( [ValueTerminal('(', 'ptext')] + @@ -2993,7 +3047,7 @@ def _refold_parse_tree(parse_tree, *, policy): lines[-1] += tstr last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP)) - return policy.linesep.join(lines) + policy.linesep + return def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew): """Fold string to_encode into lines as encoded word, combining if allowed. diff --git a/Lib/email/errors.py b/Lib/email/errors.py index e5601132d024fe..6bc744bd59c5bb 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -33,10 +33,6 @@ class HeaderWriteError(MessageError): """Error while writing headers.""" -class InvalidMailboxError(MessageError, ValueError): - """A mailbox was not compatible with the policy in use.""" - - # These are parsing defects which the parser was able to work around. 
class MessageDefect(ValueError): """Base class for a message defect.""" diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index e28fe3892015b9..f8f5c41b4474c8 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3364,10 +3364,12 @@ def test_fold_unfoldable_element_stealing_whitespace(self): self._test(token, expected, policy=policy) def test_encoded_word_with_undecodable_bytes(self): - self._test(parser.get_address_list( - ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?=' + self._test( + parser.get_address_list( + ' =?utf-8?Q?=E5=AE=A2=E6=88=B6=E6=AD=A3=E8=A6=8F=E4=BA=A4=E7?=' + ' ' )[0], - ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?=\n', + ' =?unknown-8bit?b?5a6i5oi25q2j6KaP5Lqk5w==?= \n', ) diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index fb9d9be3e20df0..f34e1f214a362b 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -296,30 +296,43 @@ def test_keep_long_encoded_newlines(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) - # XXX renable after fix. - def xest_non_ascii_addr_spec_raises(self): - # RFC2047 encoded-word is not permitted in any part of an addr-spec. - # (See also test_non_ascii_addr_spec_preserved below.) + def test_non_ascii_addr_spec_raises(self): + # non-ascii is not permitted in any part of an addr-spec. If the + # programmer generated it, it's an error. (See also + # test_non_ascii_addr_spec_preserved below.) g = self.genclass(self.ioclass(), policy=self.policy.clone(utf8=False)) + # XXX The particular part detected here isn't part of a behavioral + # spec and may change in the future. 
cases = [ - 'wők@example.com', - 'wok@exàmple.com', - 'wők@exàmple.com', - '"Name, for display" ', - 'Näyttönimi ', + ('wők@example.com', 'wők', 'local-part'), + ('wok@exàmple.com', 'exàmple.com', 'domain'), + ('wők@exàmple.com', 'wők', 'local-part'), + ( + '"Name, for display" ', + 'wők@example.com', + 'addr-spec', + ), + ( + 'Näyttönimi ', + 'wők@example.com', + 'addr-spec', + ), ] - for address in cases: + for address, badtoken, partname in cases: with self.subTest(address=address): msg = EmailMessage() msg['To'] = address - addr_spec = msg['To'].addresses[0].addr_spec expected_error = ( - fr"(?i)(?=.*non-ascii)(?=.*{re.escape(addr_spec)})(?=.*policy.*utf8)" + fr"(?i)(?=.*non-ascii)" + fr"(?=.*{re.escape(badtoken)})" + fr"(?=.*{partname})" + fr"(?=.*policy.*utf8)" ) with self.assertRaisesRegex( - email.errors.InvalidMailboxError, expected_error + email.errors.HeaderWriteError, expected_error ): g.flatten(msg) + def _test_boundary_detection(self, linesep): # Generate a boundary token in the same way as _make_boundary token = random.randrange(sys.maxsize) @@ -580,8 +593,7 @@ def test_smtp_policy(self): g.flatten(msg) self.assertEqual(s.getvalue(), expected) - # XXX renable after fix. - def xest_non_ascii_addr_spec_preserved(self): + def test_non_ascii_addr_spec_preserved(self): # A defective non-ASCII addr-spec parsed from the original # message is left unchanged when flattening. # (See also test_non_ascii_addr_spec_raises above.) diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst index d4b07bf06b9236..7082c72f685b05 100644 --- a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst @@ -1,7 +1,7 @@ The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for a mailbox with non-ASCII characters in its domain. 
Under a policy with :attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize -such a message will now raise an :exc:`~email.errors.InvalidMailboxError`. +such a message will now raise an :exc:`~email.errors.HeaderWriteError`. Either apply an appropriate IDNA encoding to convert the domain to ASCII before serialization, or use :data:`email.policy.SMTPUTF8` (or another policy with ``utf8=True``) to correctly pass through the internationalized domain name diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst index 6fca53c1b2a409..29c076d3a746c6 100644 --- a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst @@ -1,7 +1,7 @@ The :mod:`email` module no longer incorrectly uses :rfc:`2047` encoding for a mailbox with non-ASCII characters in its local-part. Under a policy with :attr:`~email.policy.EmailPolicy.utf8` set ``False``, attempting to serialize -such a message will now raise an :exc:`~email.errors.InvalidMailboxError`. +such a message will now raise an :exc:`~email.errors.HeaderWriteError`. There is no valid 7-bit encoding for an internationalized local-part. Use :data:`email.policy.SMTPUTF8` (or another policy with ``utf8=True``) to correctly pass through the local-part as Unicode characters. From 0a259f4800c80db4d72f4a30ce5f266e87d4866c Mon Sep 17 00:00:00 2001 From: R David Murray Date: Sun, 26 Apr 2026 15:17:23 -0400 Subject: [PATCH 10/10] Add some tests where the local part is folded. Behavior when folding in parts versus rendering on one line takes different code paths, so make sure both work. 
--- Lib/test/test_email/test_generator.py | 28 ++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index f34e1f214a362b..8d912738029f78 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -300,7 +300,8 @@ def test_non_ascii_addr_spec_raises(self): # non-ascii is not permitted in any part of an addr-spec. If the # programmer generated it, it's an error. (See also # test_non_ascii_addr_spec_preserved below.) - g = self.genclass(self.ioclass(), policy=self.policy.clone(utf8=False)) + p = self.policy.clone(utf8=False, max_line_length=20) + g = self.genclass(self.ioclass(), policy=p) # XXX The particular part detected here isn't part of a behavioral # spec and may change in the future. cases = [ @@ -317,6 +318,12 @@ def test_non_ascii_addr_spec_raises(self): 'wők@example.com', 'addr-spec', ), + ( + '"a lőng quoted string as the local part"@example.com', + 'a lőng quoted string as the local part', + 'local-part', + ), + ] for address, badtoken, partname in cases: with self.subTest(address=address): @@ -333,6 +340,25 @@ def test_non_ascii_addr_spec_raises(self): ): g.flatten(msg) + def test_local_part_quoted_string_wrapped_correctly(self): + msg = self.msgmaker(self.typ(textwrap.dedent("""\ + To: <"a long local part in a quoted string"@example.com> + Subject: test + + None + """)), policy=self.policy.clone(max_line_length=20)) + expected = textwrap.dedent("""\ + To: <"a long local part in a + quoted string"@example.com> + Subject: test + + None + """) + s = self.ioclass() + g = self.genclass(s, policy=self.policy.clone(max_line_length=30)) + g.flatten(msg) + self.assertEqual(s.getvalue(), self.typ(expected)) + def _test_boundary_detection(self, linesep): # Generate a boundary token in the same way as _make_boundary token = random.randrange(sys.maxsize)