From ce63f9102545fa50abbe08a4083b332a9101c243 Mon Sep 17 00:00:00 2001 From: Lars Ingebrigtsen Date: Tue, 18 Jan 2022 13:19:55 +0100 Subject: [PATCH] Add textsec functions for verifying email addresses * lisp/international/characters.el (bidi-control-characters): Rename from glyphless--bidi-control-characters for use in textsec, and add LRM/RLM/ALM. (update-glyphless-char-display): Adjust the code. * lisp/international/textsec.el (textsec-local-address-suspicious-p) (textsec-name-suspicious-p, textsec-suspicious-nonspacing-p) (textsec-email-suspicious-p): New functions. --- lisp/international/characters.el | 13 ++-- lisp/international/textsec.el | 77 ++++++++++++++++++++++++ test/lisp/international/textsec-tests.el | 39 ++++++++++++ 3 files changed, 124 insertions(+), 5 deletions(-) diff --git a/lisp/international/characters.el b/lisp/international/characters.el index 3ff280f4802..ce23e995c11 100644 --- a/lisp/international/characters.el +++ b/lisp/international/characters.el @@ -1526,8 +1526,11 @@ Setup `char-width-table' appropriate for non-CJK language environment." ;; We can't use the \N{name} things here, because this file is used ;; too early in the build process. -(defvar glyphless--bidi-control-characters - '(#x202a ; ?\N{left-to-right embedding} +(defvar bidi-control-characters + '(#x200e ; ?\N{left-to-right mark} + #x200f ; ?\N{right-to-left mark} + #x061c ; ?\N{arabic letter mark} + #x202a ; ?\N{left-to-right embedding} #x202b ; ?\N{right-to-left embedding} #x202d ; ?\N{left-to-right override} #x202e ; ?\N{right-to-left override} @@ -1535,7 +1538,8 @@ Setup `char-width-table' appropriate for non-CJK language environment." #x2067 ; ?\N{right-to-left isolate} #x2068 ; ?\N{first strong isolate} #x202c ; ?\N{pop directional formatting} - #x2069)) ; ?\N{pop directional isolate}) + #x2069) ; ?\N{pop directional isolate} + "List of bidirectional control characters.") (defun update-glyphless-char-display (&optional variable value) "Make the setting of `glyphless-char-display-control' take effect. @@ -1582,8 +1586,7 @@ option `glyphless-char-display'." (or (aref char-acronym-table from) "UNK"))) (when (or (eq target 'format-control) - (memq from - glyphless--bidi-control-characters)) + (memq from bidi-control-characters)) (set-char-table-range glyphless-char-display from this-method))) (setq from (1+ from)))))) diff --git a/lisp/international/textsec.el b/lisp/international/textsec.el index d0d435ed7dc..55e4ce9d86c 100644 --- a/lisp/international/textsec.el +++ b/lisp/international/textsec.el @@ -28,6 +28,7 @@ (require 'ucs-normalize) (require 'idna-mapping) (require 'puny) +(require 'mail-parse) (defvar textsec--char-scripts nil) @@ -225,6 +226,9 @@ STRING isn't a single script string." (textsec-single-script-p string2))) (defun textsec-domain-suspicious-p (domain) + "Say whether DOMAIN looks suspicious. +If it isn't, nil is returned. If it is, a string explaining the +problem is returned." (catch 'found (seq-do (lambda (char) @@ -236,6 +240,79 @@ STRING isn't a single script string." (throw 'found "%s is not highly restrictive")) nil)) +(defun textsec-local-address-suspicious-p (local) + "Say whether LOCAL looks suspicious. +LOCAL is the bit before \"@\" in an email address. + +If it suspicious, nil is returned. If it is, a string explaining +the problem is returned." + (cond + ((not (equal local (ucs-normalize-NFKC-string local))) + (format "`%s' is not in normalized format `%s'" + local (ucs-normalize-NFKC-string local))) + ((textsec-mixed-numbers-p local) + (format "`%s' contains numbers from different number systems" local)) + ((eq (textsec-restriction-level local) 'unrestricted) + (format "`%s' isn't restrictive enough" local)) + ((string-match-p "\\`\\.\\|\\.\\'\\|\\.\\." local) + (format "`%s' contains invalid dots" local)))) + +(defun textsec-name-suspicious-p (name) + "Say whether NAME looks suspicious. +NAME is (for instance) the free-text name from an email address. + +If it suspicious, nil is returned. If it is, a string explaining +the problem is returned." + (cond + ((not (equal name (ucs-normalize-NFC-string name))) + (format "`%s' is not in normalized format `%s'" + name (ucs-normalize-NFC-string name))) + ((seq-find (lambda (char) + (and (member char bidi-control-characters) + (not (member char + '( ?\N{left-to-right mark} + ?\N{right-to-left mark} + ?\N{arabic letter mark}))))) + name) + (format "The string contains bidirectional control characters")) + ((textsec-suspicious-nonspacing-p name)))) + +(defun textsec-suspicious-nonspacing-p (string) + "Say whether STRING has a suspicious use of nonspacing characters. +If it suspicious, nil is returned. If it is, a string explaining +the problem is returned." + (let ((prev nil) + (nonspace-count 0)) + (catch 'found + (seq-do + (lambda (char) + (let ((nonspacing + (memq (get-char-code-property char 'general-category) + '(Cf Cc Mn)))) + (when (and nonspacing + (equal char prev)) + (throw 'found "Two identical nonspacing characters in a row")) + (setq nonspace-count (if nonspacing + (1+ nonspace-count) + 0)) + (when (> nonspace-count 4) + (throw 'found + "Excessive number of nonspacing characters in a row")) + (setq prev char))) + string) + nil))) + +(defun textsec-email-suspicious-p (email) + "Say whether EMAIL looks suspicious. +If it isn't, nil is returned. If it is, a string explaining the +problem is returned." + (pcase-let* ((`(,address . ,name) (mail-header-parse-address email t)) + (`(,local ,domain) (split-string address "@"))) + (or + (textsec-domain-suspicious-p domain) + (textsec-local-address-suspicious-p local) + (textsec-name-suspicious-p name)))) + (provide 'textsec) ;;; textsec.el ends here diff --git a/test/lisp/international/textsec-tests.el b/test/lisp/international/textsec-tests.el index c946d850690..aeb8bc72839 100644 --- a/test/lisp/international/textsec-tests.el +++ b/test/lisp/international/textsec-tests.el @@ -115,4 +115,43 @@ (should-not (textsec-domain-suspicious-p "foo.org")) (should (textsec-domain-suspicious-p "f\N{LEFT-TO-RIGHT ISOLATE}oo.org"))) +(ert-deftest test-suspicious-local () + (should-not (textsec-local-address-suspicious-p "larsi")) + (should (textsec-local-address-suspicious-p ".larsi")) + (should (textsec-local-address-suspicious-p "larsi.")) + (should-not (textsec-local-address-suspicious-p "la.rsi")) + (should (textsec-local-address-suspicious-p "lar..si")) + + (should-not (textsec-local-address-suspicious-p "LÅRSI")) + (should (textsec-local-address-suspicious-p "LÅRSI")) + + (should (textsec-local-address-suspicious-p "larsi8৪"))) + +(ert-deftest test-suspicious-name () + (should-not (textsec-name-suspicious-p "Lars Ingebrigtsen")) + (should (textsec-name-suspicious-p "LÅRS INGEBRIGTSEN")) + (should-not (textsec-name-suspicious-p "LÅRS INGEBRIGTSEN")) + + (should (textsec-name-suspicious-p + "Lars Ingebrigtsen\N{LEFT-TO-RIGHT ISOLATE}")) + (should-not (textsec-name-suspicious-p + "Lars Ingebrigtsen\N{LEFT-TO-RIGHT MARK}")) + + (should (textsec-name-suspicious-p + "\N{LEFT-TO-RIGHT MARK}\N{LEFT-TO-RIGHT MARK}Lars Ingebrigtsen")) + (should-not (textsec-name-suspicious-p + "\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}Lars Ingebrigtsen")) + (should (textsec-name-suspicious-p + "\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT MARK}Lars Ingebrigtsen"))) + +(ert-deftest test-suspicious-email () + (should-not + (textsec-email-suspicious-p "Lars Ingebrigtsen ")) + (should + (textsec-email-suspicious-p "LÅrs Ingebrigtsen ")) + (should + (textsec-email-suspicious-p "Lars Ingebrigtsen <.larsi@gnus.org>")) + (should + (textsec-email-suspicious-p "Lars Ingebrigtsen "))) + ;;; textsec-tests.el ends here -- 2.39.2