@@ -460,6 +460,36 @@ def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bo
460460 raise EmailSyntaxError ("An email address cannot have a period and a hyphen next to each other." )
461461
462462
def uts46_valid_char(c: str) -> bool:
    """Return whether the character *c* is acceptable ahead of UTS-46 remapping.

    The rules below are a hand-written approximation. They were derived by
    exhaustively searching for the characters rejected by::

        for c in (chr(i) for i in range(0x110000)):
            idna.uts46_remap(c, std3_rules=False, transitional=False)

    and are "pretty close" to that behavior rather than an exact match.

    Args:
        c: A single character (a string of length one).

    Returns:
        ``True`` if the character is accepted by these rules, ``False`` if
        it should be reported as invalid.

    Raises:
        TypeError: If *c* is not a string of length one (raised by ``ord``).
    """
    codepoint = ord(c)

    if 0x80 <= codepoint <= 0x9F:
        # C1 control characters (U+0080..U+009F).
        return False

    if (
        (
            0x2010 <= codepoint <= 0x2060
            and not (0x2024 <= codepoint <= 0x2026)
            and not (0x2028 <= codepoint <= 0x202E)
        )
        or codepoint in (0x00AD, 0x2064, 0xFF0E)
        or 0x200B <= codepoint <= 0x200D
        or 0x1BCA0 <= codepoint <= 0x1BCA3
    ):
        # Characters that are permitted but would otherwise be caught by
        # one of the tests below, so they must be accepted first.
        return True

    if unicodedata.category(c) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"):
        # There are a bunch of Zs characters including regular space
        # that are allowed by UTS46 but are not allowed in domain
        # names anyway.
        #
        # There are some Cn (unassigned) characters that the idna
        # package doesn't reject but we can, I think.
        return False

    # Characters that decompose into a sequence containing a dot (U+002E)
    # are rejected. The decomposition string is a space-separated list of
    # an optional <tag> followed by hexadecimal code points.
    return "002E" not in unicodedata.decomposition(c).split(" ")
491+
492+
463493class DomainNameValidationResult (TypedDict ):
464494 ascii_domain : str
465495 domain : str
@@ -484,6 +514,15 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob
484514 # they may not be valid, safe, or sensible Unicode strings.
485515 check_unsafe_chars (domain )
486516
517+ # Reject characters that would be rejected by UTS-46 normalization next but
518+ # with an error message under our control.
519+ bad_chars = {
520+ safe_character_display (c ) for c in domain
521+ if not uts46_valid_char (c )
522+ }
523+ if bad_chars :
524+ raise EmailSyntaxError ("The part after the @-sign contains invalid characters: " + ", " .join (sorted (bad_chars )) + "." )
525+
487526 # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
488527 # and converting all label separators (the period/full stop, fullwidth full stop,
489528 # ideographic full stop, and halfwidth ideographic full stop) to regular dots.
0 commit comments