Add explicit checks for internationalized domain name characters invalid under UTS-46 to improve the error message

JoshData · JoshData · commit 5f1f6dfbe95e · 2024-11-27T21:13:17.000-05:00
diff --git a/email_validator/syntax.py b/email_validator/syntax.py
@@ -460,6 +460,36 @@ def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bo
             raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")
 
 
+def uts46_valid_char(c):
+  # Returns False for characters this heuristic treats as invalid under UTS-46.
+  # The rules were derived by exhaustively searching for characters rejected by
+  #   idna.uts46_remap(c, std3_rules=False, transitional=False)
+  # over (chr(i) for i in range(0x110000)). They are close, not an exact match.
+  c = ord(c)
+  if 0x80 <= c <= 0x9f:
+    # C1 control characters (category Cc, so not caught by the category test below).
+    return False
+  elif (0x2010 <= c <= 0x2060 and not (0x2024 <= c <= 0x2026) and not (0x2028 <= c <= 0x202E)) \
+   or c in (0x00AD, 0x2064, 0xFF0E) \
+   or 0x200B <= c <= 0x200D \
+   or 0x1BCA0 <= c <= 0x1BCA3:
+    # Characters that are permitted even though they would otherwise be
+    # rejected by one of the tests below, so they are accepted first.
+    return True
+  elif unicodedata.category(chr(c)) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"):
+    # There are a bunch of Zs characters including regular space
+    # that are allowed by UTS46 but are not allowed in domain
+    # names anyway.
+    #
+    # There are some Cn (unassigned) characters that the idna
+    # package doesn't reject but we can, I think.
+    return False
+  elif "002E" in unicodedata.decomposition(chr(c)).split(" "):
+    # Characters whose decomposition includes a dot (U+002E), e.g. U+2488.
+    return False
+  return True
+
+
 class DomainNameValidationResult(TypedDict):
     ascii_domain: str
     domain: str
@@ -484,6 +514,15 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob
     # they may not be valid, safe, or sensible Unicode strings.
     check_unsafe_chars(domain)
 
+    # Reject characters that the UTS-46 normalization step below would reject,
+    # but do it here so that the error message is under our control.
+    bad_chars = {
+      safe_character_display(c) for c in domain
+      if not uts46_valid_char(c)
+    }
+    if bad_chars:
+      raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
+
     # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
     # and converting all label separators (the period/full stop, fullwidth full stop,
     # ideographic full stop, and halfwidth ideographic full stop) to regular dots.
diff --git a/tests/test_syntax.py b/tests/test_syntax.py
@@ -402,9 +402,7 @@ def test_domain_literal() -> None:
         ('.leadingdot@domain.com', 'An email address cannot start with a period.'),
         ('twodots..here@domain.com', 'An email address cannot have two periods in a row.'),
         ('trailingdot.@domain.email', 'An email address cannot have a period immediately before the @-sign.'),
-        ('me@⒈wouldbeinvalid.com',
-         "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed "
-         "at position 1 in '⒈wouldbeinvalid.com')."),
+        ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters: '⒈'."),
         ('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."),
         ('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."),
         ('@example.com', 'There must be something before the @-sign.'),