]> git.eshelyaron.com Git - emacs.git/commitdiff
Add textsec functions for verifying email addresses
authorLars Ingebrigtsen <larsi@gnus.org>
Tue, 18 Jan 2022 12:19:55 +0000 (13:19 +0100)
committerLars Ingebrigtsen <larsi@gnus.org>
Tue, 18 Jan 2022 12:20:04 +0000 (13:20 +0100)
* lisp/international/characters.el (bidi-control-characters):
Rename from glyphless--bidi-control-characters for use in textsec,
and add LRM/RLM/ALM.
(update-glyphless-char-display): Adjust the code.

* lisp/international/textsec.el (textsec-local-address-suspicious-p)
(textsec-name-suspicious-p, textsec-suspicious-nonspacing-p)
(textsec-email-suspicious-p): New functions.

lisp/international/characters.el
lisp/international/textsec.el
test/lisp/international/textsec-tests.el

index 3ff280f480235374d9529401faf7a90986b52386..ce23e995c116903dd6456337b764e361fb2c3900 100644 (file)
@@ -1526,8 +1526,11 @@ Setup `char-width-table' appropriate for non-CJK language environment."
 
 ;; We can't use the \N{name} things here, because this file is used
 ;; too early in the build process.
-(defvar glyphless--bidi-control-characters
-  '(#x202a                          ; ?\N{left-to-right embedding}
+(defvar bidi-control-characters
+  '(#x200e                           ; ?\N{left-to-right mark}
+    #x200f                           ; ?\N{right-to-left mark}
+    #x061c                           ; ?\N{arabic letter mark}
+    #x202a                          ; ?\N{left-to-right embedding}
     #x202b                          ; ?\N{right-to-left embedding}
     #x202d                          ; ?\N{left-to-right override}
     #x202e                          ; ?\N{right-to-left override}
@@ -1535,7 +1538,8 @@ Setup `char-width-table' appropriate for non-CJK language environment."
     #x2067                          ; ?\N{right-to-left isolate}
     #x2068                          ; ?\N{first strong isolate}
     #x202c                          ; ?\N{pop directional formatting}
-    #x2069))                         ; ?\N{pop directional isolate})
+    #x2069)                          ; ?\N{pop directional isolate}
+  "List of bidirectional control characters.")
 
 (defun update-glyphless-char-display (&optional variable value)
   "Make the setting of `glyphless-char-display-control' take effect.
@@ -1582,8 +1586,7 @@ option `glyphless-char-display'."
                                   (or (aref char-acronym-table from)
                                       "UNK")))
                           (when (or (eq target 'format-control)
-                                    (memq from
-                                          glyphless--bidi-control-characters))
+                                    (memq from bidi-control-characters))
                             (set-char-table-range glyphless-char-display
                                                   from this-method)))
                         (setq from (1+ from))))))
index d0d435ed7dc591f0e5b00fc7730f2d4d9e469712..55e4ce9d86c162fad1867bb1c28371510c4cb6f1 100644 (file)
@@ -28,6 +28,7 @@
 (require 'ucs-normalize)
 (require 'idna-mapping)
 (require 'puny)
+(require 'mail-parse)
 
 (defvar textsec--char-scripts nil)
 
@@ -225,6 +226,9 @@ STRING isn't a single script string."
        (textsec-single-script-p string2)))
 
 (defun textsec-domain-suspicious-p (domain)
+  "Say whether DOMAIN looks suspicious.
+If it isn't, nil is returned.  If it is, a string explaining the
+problem is returned."
   (catch 'found
     (seq-do
      (lambda (char)
@@ -236,6 +240,79 @@ STRING isn't a single script string."
       (throw 'found "%s is not highly restrictive"))
     nil))
 
+(defun textsec-local-address-suspicious-p (local)
+  "Say whether LOCAL looks suspicious.
+LOCAL is the bit before \"@\" in an email address.
+
+If it suspicious, nil is returned.  If it is, a string explaining
+the problem is returned."
+  (cond
+   ((not (equal local (ucs-normalize-NFKC-string local)))
+    (format "`%s' is not in normalized format `%s'"
+            local (ucs-normalize-NFKC-string local)))
+   ((textsec-mixed-numbers-p local)
+    (format "`%s' contains numbers from different number systems" local))
+   ((eq (textsec-restriction-level local) 'unrestricted)
+    (format "`%s' isn't restrictive enough" local))
+   ((string-match-p "\\`\\.\\|\\.\\'\\|\\.\\." local)
+    (format "`%s' contains invalid dots" local))))
+
+(defun textsec-name-suspicious-p (name)
+  "Say whether NAME looks suspicious.
+NAME is (for instance) the free-text name from an email address.
+
+If it suspicious, nil is returned.  If it is, a string explaining
+the problem is returned."
+  (cond
+   ((not (equal name (ucs-normalize-NFC-string name)))
+    (format "`%s' is not in normalized format `%s'"
+            name (ucs-normalize-NFC-string name)))
+   ((seq-find (lambda (char)
+                (and (member char bidi-control-characters)
+                     (not (member char
+                                  '( ?\N{left-to-right mark}
+                                     ?\N{right-to-left mark}
+                                     ?\N{arabic letter mark})))))
+              name)
+    (format "The string contains bidirectional control characters"))
+   ((textsec-suspicious-nonspacing-p name))))
+
+(defun textsec-suspicious-nonspacing-p (string)
+  "Say whether STRING has a suspicious use of nonspacing characters.
+If it suspicious, nil is returned.  If it is, a string explaining
+the problem is returned."
+  (let ((prev nil)
+        (nonspace-count 0))
+    (catch 'found
+      (seq-do
+       (lambda (char)
+         (let ((nonspacing
+                (memq (get-char-code-property char 'general-category)
+                      '(Cf Cc Mn))))
+           (when (and nonspacing
+                      (equal char prev))
+             (throw 'found "Two identical nonspacing characters in a row"))
+           (setq nonspace-count (if nonspacing
+                                    (1+ nonspace-count)
+                                  0))
+           (when (> nonspace-count 4)
+             (throw 'found
+                    "Excessive number of nonspacing characters in a row"))
+           (setq prev char)))
+       string)
+      nil)))
+
+(defun textsec-email-suspicious-p (email)
+  "Say whether EMAIL looks suspicious.
+If it isn't, nil is returned.  If it is, a string explaining the
+problem is returned."
+  (pcase-let* ((`(,address . ,name) (mail-header-parse-address email t))
+               (`(,local ,domain) (split-string address "@")))
+    (or
+     (textsec-domain-suspicious-p domain)
+     (textsec-local-address-suspicious-p local)
+     (textsec-name-suspicious-p name))))
+
 (provide 'textsec)
 
 ;;; textsec.el ends here
index c946d8506907607aed701598f874bae538eda51e..aeb8bc7283928421eec4f00f3ab6565177249ea7 100644 (file)
   (should-not (textsec-domain-suspicious-p "foo.org"))
   (should (textsec-domain-suspicious-p "f\N{LEFT-TO-RIGHT ISOLATE}oo.org")))
 
+(ert-deftest test-suspicious-local ()
+  (should-not (textsec-local-address-suspicious-p "larsi"))
+  (should (textsec-local-address-suspicious-p ".larsi"))
+  (should (textsec-local-address-suspicious-p "larsi."))
+  (should-not (textsec-local-address-suspicious-p "la.rsi"))
+  (should (textsec-local-address-suspicious-p "lar..si"))
+
+  (should-not (textsec-local-address-suspicious-p "LÅRSI"))
+  (should (textsec-local-address-suspicious-p "LÅRSI"))
+
+  (should (textsec-local-address-suspicious-p "larsi8৪")))
+
+(ert-deftest test-suspicious-name ()
+  (should-not (textsec-name-suspicious-p "Lars Ingebrigtsen"))
+  (should (textsec-name-suspicious-p "LÅRS INGEBRIGTSEN"))
+  (should-not (textsec-name-suspicious-p "LÅRS INGEBRIGTSEN"))
+
+  (should (textsec-name-suspicious-p
+           "Lars Ingebrigtsen\N{LEFT-TO-RIGHT ISOLATE}"))
+  (should-not (textsec-name-suspicious-p
+               "Lars Ingebrigtsen\N{LEFT-TO-RIGHT MARK}"))
+
+  (should (textsec-name-suspicious-p
+           "\N{LEFT-TO-RIGHT MARK}\N{LEFT-TO-RIGHT MARK}Lars Ingebrigtsen"))
+  (should-not (textsec-name-suspicious-p
+               "\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}Lars Ingebrigtsen"))
+  (should (textsec-name-suspicious-p
+               "\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT MARK}Lars Ingebrigtsen")))
+
+(ert-deftest test-suspicious-email ()
+  (should-not
+   (textsec-email-suspicious-p "Lars Ingebrigtsen <larsi@gnus.org>"))
+  (should
+   (textsec-email-suspicious-p "LÅrs Ingebrigtsen <larsi@gnus.org>"))
+  (should
+   (textsec-email-suspicious-p "Lars Ingebrigtsen <.larsi@gnus.org>"))
+  (should
+   (textsec-email-suspicious-p "Lars Ingebrigtsen <larsi@gn\N{LEFT-TO-RIGHT ISOLATE}us.org>")))
+
 ;;; textsec-tests.el ends here