Skip to content
82 changes: 45 additions & 37 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@
# Useful constants and functions
#

WSP = set(' \t')
_WSP = ' \t'
WSP = set(_WSP)
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
Expand Down Expand Up @@ -2835,6 +2836,13 @@ def _steal_trailing_WSP_if_exists(lines):
lines.pop()
return wsp


def _last_word_is_sill_ew(_last_word_is_ew, added_str):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this name a bit confusing. What it actually does is "return true if last_word_is_ew is true and added_str consists of whitespace only". I can't think of a name that is shorter and as descriptive as the actual code. So I'm not in favor of this function.

# If the last word is an encoded word, and the added string is all WSP,
# then (and only then) is the last word is still an encoded word.
return _last_word_is_ew and not bool(added_str.strip(_WSP))


def _refold_parse_tree(parse_tree, *, policy):
"""Return string of contents of parse_tree folded according to RFC rules.

Expand All @@ -2843,11 +2851,9 @@ def _refold_parse_tree(parse_tree, *, policy):
maxlen = policy.max_line_length or sys.maxsize
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = [''] # Folded lines to be output
leading_whitespace = '' # When we have whitespace between two encoded
# words, we may need to encode the whitespace
# at the beginning of the second word.
last_ew = None # Points to the last encoded character if there's an ew on
# the line
last_word_is_ew = False
last_ew = None # if there is an encoded word in the last line of lines,
# points to the encoded word's first character
last_charset = None
wrap_as_ew_blocked = 0
want_encoding = False # This is set to True if we need to encode this part
Expand Down Expand Up @@ -2882,6 +2888,7 @@ def _refold_parse_tree(parse_tree, *, policy):
if part.token_type == 'mime-parameters':
# Mime parameter folding (using RFC2231) is extra special.
_fold_mime_parameters(part, lines, maxlen, encoding)
last_word_is_ew = False
continue

if want_encoding and not wrap_as_ew_blocked:
Expand All @@ -2898,6 +2905,7 @@ def _refold_parse_tree(parse_tree, *, policy):
# XXX what if encoded_part has no leading FWS?
lines.append(newline)
lines[-1] += encoded_part
last_word_is_ew = False
continue
# Either this is not a major syntactic break, so we don't
# want it on a line by itself even if it fits, or it
Expand All @@ -2916,11 +2924,16 @@ def _refold_parse_tree(parse_tree, *, policy):
(last_charset == 'unknown-8bit' or
last_charset == 'utf-8' and charset != 'us-ascii')):
last_ew = None
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
part.ew_combine_allowed, charset, leading_whitespace)
# This whitespace has been added to the lines in _fold_as_ew()
# so clear it now.
leading_whitespace = ''
last_ew = _fold_as_ew(
tstr,
lines,
maxlen,
last_ew,
part.ew_combine_allowed,
charset,
last_word_is_ew,
)
last_word_is_ew = True
last_charset = charset
want_encoding = False
continue
Expand All @@ -2933,28 +2946,20 @@ def _refold_parse_tree(parse_tree, *, policy):

if len(tstr) <= maxlen - len(lines[-1]):
lines[-1] += tstr
last_word_is_ew = _last_word_is_sill_ew(last_word_is_ew, tstr)
continue

# This part is too long to fit. The RFC wants us to break at
# "major syntactic breaks", so unless we don't consider this
# to be one, check if it will fit on the next line by itself.
leading_whitespace = ''
if (part.syntactic_break and
len(tstr) + 1 <= maxlen):
newline = _steal_trailing_WSP_if_exists(lines)
if newline or part.startswith_fws():
# We're going to fold the data onto a new line here. Due to
# the way encoded strings handle continuation lines, we need to
# be prepared to encode any whitespace if the next line turns
# out to start with an encoded word.
lines.append(newline + tstr)

whitespace_accumulator = []
for char in lines[-1]:
if char not in WSP:
break
whitespace_accumulator.append(char)
leading_whitespace = ''.join(whitespace_accumulator)
last_word_is_ew = _last_word_is_sill_ew(
last_word_is_ew, lines[-1]
)
last_ew = None
continue
if not hasattr(part, 'encode'):
Expand Down Expand Up @@ -2994,10 +2999,11 @@ def _refold_parse_tree(parse_tree, *, policy):
else:
# We can't fold it onto the next line either...
lines[-1] += tstr
last_word_is_ew = _last_word_is_sill_ew(last_word_is_ew, tstr)

return policy.linesep.join(lines) + policy.linesep

def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
"""Fold string to_encode into lines as encoded word, combining if allowed.
Return the new value for last_ew, or None if ew_combine_allowed is False.

Expand All @@ -3012,14 +3018,24 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
to_encode = str(
get_unstructured(lines[-1][last_ew:] + to_encode))
lines[-1] = lines[-1][:last_ew]
elif to_encode[0] in WSP:
elif to_encode[0] in WSP and not last_word_is_ew:
# We're joining this to non-encoded text, so don't encode
# the leading blank.
leading_wsp = to_encode[0]
to_encode = to_encode[1:]
if (len(lines[-1]) == maxlen):
lines.append(_steal_trailing_WSP_if_exists(lines))
lines[-1] += leading_wsp
elif last_word_is_ew:
# If we are following up an encoded word with another encoded word,
# any white space between the two will be ignored when decoded.
# Therefore, we encode all to-be-displayed whitespace in the second
# encoded word.
len_without_wsp = len(lines[-1].rstrip(_WSP))
leading_whitespace = lines[-1][len_without_wsp:]
lines[-1] = (lines[-1][:len_without_wsp]
+ (' ' if leading_whitespace else ''))
to_encode = leading_whitespace + to_encode

trailing_wsp = ''
if to_encode[-1] in WSP:
Expand All @@ -3040,20 +3056,13 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,

while to_encode:
remaining_space = maxlen - len(lines[-1])
text_space = remaining_space - chrome_len - len(leading_whitespace)
text_space = remaining_space - chrome_len
if text_space <= 0:
lines.append(' ')
newline = _steal_trailing_WSP_if_exists(lines)
lines.append(newline or ' ')
new_last_ew = len(lines[-1])
continue
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of my new tests revealed a pre-existing problem here:

Suggested change
continue
if text_space <= 0:
newline = _steal_trailing_WSP_if_exists(lines)
lines.append(newline or ' ')
new_last_ew = len(lines[-1])
continue

Copy link
Contributor Author

@robsdedude robsdedude Feb 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This suggestion does not seem to fix the test cases test_ew_folding_round_trip_1 and test_ew_folding_round_trip_2 you posted in a comment below 🤔

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. Maybe there's a bug in your steal all whitespace function? Using my suggestion the tests pass for me, but I see you didn't make that change, at least in what I can see. On the other hand, github also doesn't show the fix applied, so I'm not sure where you are at, exactly.

However, I also see that I failed to run the full test suite before posting the review, and there's a whitespace change in three existing tests. I think it's a bugfix, but I'll have to look closer when I get a chance later today.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I only had the change applied locally. I think I found the issue. The suggestion here looks like the lines.append(' ') right after if text_space <= 0: should remain there. I suppose it shouldn't. Makes more sense that way when reading the code, too 🙃

With the suggestion applied correctly to my local working tree I now see the same: your new tests pass, 3 old tests started failing.

I went ahead and adjusted the test expectations. Let me know what you think once you've had a chance to have a closer look.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that suggestion looks borked. I don't see any way to dismiss it, unfortunately.

The test expectation changes are correct, that extra space after Subject was a bug. I may even have opened an issue about it, but if so I can't find it. Probably just made a "note to self" that I've since lost.


# If we are at the start of a continuation line, prepend whitespace
# (we only want to do this when the line starts with an encoded word
# but if we're folding in this helper function, then we know that we
# are going to be writing out an encoded word.)
if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
lines[-1] += encoded_word
leading_whitespace = ''

to_encode_word = to_encode[:text_space]
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
excess = len(encoded_word) - remaining_space
Expand All @@ -3065,7 +3074,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
excess = len(encoded_word) - remaining_space
lines[-1] += encoded_word
to_encode = to_encode[len(to_encode_word):]
leading_whitespace = ''

if to_encode:
lines.append(' ')
Expand Down
44 changes: 44 additions & 0 deletions Lib/test/test_email/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,50 @@ def test_defaults_handle_spaces_at_start_of_continuation_line(self):
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

# gh-144156: fold between non-encoded and encoded words don't need to encoded
# the separating space
def test_defaults_handle_spaces_at_start_of_continuation_line_2(self):
source = ("Re: [SOS-1495488] Commande et livraison - Demande de retour - "
"bibijolie - 251210-AABBCC - Abo actualités digitales 20 semaines "
"d’abonnement à 24 heures, Bilan, Tribune de Genève et tous les titres Tamedia")
expected = (
b"Subject: "
b"Re: [SOS-1495488] Commande et livraison - Demande de retour -\n"
b" bibijolie - 251210-AABBCC - Abo =?utf-8?q?actualit=C3=A9s?= digitales 20\n"
b" semaines =?utf-8?q?d=E2=80=99abonnement_=C3=A0?= 24 heures, Bilan, Tribune de\n"
b" =?utf-8?q?Gen=C3=A8ve?= et tous les titres Tamedia\n\n"
)
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s)
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_ew_folding_round_trip_1(self):
print()
source = "aaaaaaaaa фффффффф "
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s, maxheaderlen=30)
g.flatten(msg)
flat = s.getvalue()
reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
self.assertMultiLineEqual(reparsed, source)

def test_ew_folding_round_trip_2(self):
print()
source = "aaa aaaaaaa aaa ффф фффф "
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s, maxheaderlen=30)
g.flatten(msg)
flat = s.getvalue()
reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
self.assertMultiLineEqual(reparsed, source)

def test_cte_type_7bit_handles_unknown_8bit(self):
source = ("Subject: Maintenant je vous présente mon "
"collègue\n\n").encode('utf-8')
Expand Down
4 changes: 2 additions & 2 deletions Lib/test/test_email/test_headerregistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1711,7 +1711,7 @@ def test_fold_unstructured_with_overlong_word(self):
'singlewordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
'Subject: \n'
'Subject:\n'
' =?utf-8?q?thisisa?=\n'
' =?utf-8?q?verylon?=\n'
' =?utf-8?q?glineco?=\n'
Expand All @@ -1727,7 +1727,7 @@ def test_fold_unstructured_with_two_overlong_words(self):
'singlewordthatwontfit plusanotherverylongwordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
'Subject: \n'
'Subject:\n'
' =?utf-8?q?thisisa?=\n'
' =?utf-8?q?verylon?=\n'
' =?utf-8?q?glineco?=\n'
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_email/test_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def test_non_ascii_chars_do_not_cause_inf_loop(self):
actual = policy.fold('Subject', 'ą' * 12)
self.assertEqual(
actual,
'Subject: \n' +
'Subject:\n' +
12 * ' =?utf-8?q?=C4=85?=\n')

def test_short_maxlen_error(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix the folding of headers by the :mod:`email` library when :rfc:`2047` encoded words are used. Now whitespace is correctly preserved and also correctly added between adjacent encoded words. The latter property was broken by the fix for gh-92081, which mostly fixed previous failures to preserve whitespace.
Loading