From be4d6b043fa79e2d9a9911ca1c48bdcc84e3bba9 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Tue, 8 Dec 2020 12:47:58 +0100 Subject: [PATCH] Fix [:upper:] and [:lower:] for Unicode characters (bug#11309) * src/regex-emacs.c (execute_charset): Add canon_table argument to allow expression of a correct predicate for [:upper:] and [:lower:]. (mutually_exclusive_p, re_match_2_internal): Pass extra argument. * test/src/regex-emacs-tests.el (regexp-case-fold, regexp-eszett): New tests. Parts of regexp-eszett still fail and are commented out. --- src/regex-emacs.c | 17 ++++++----- test/src/regex-emacs-tests.el | 57 +++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 8 deletions(-) diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 971a5f63749..904ca0c7b95 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -3575,9 +3575,11 @@ skip_noops (re_char *p, re_char *pend) opcode. When the function finishes, *PP will be advanced past that opcode. C is character to test (possibly after translations) and CORIG is original character (i.e. without any translations). UNIBYTE denotes whether c is - unibyte or multibyte character. */ + unibyte or multibyte character. + CANON_TABLE is the canonicalisation table for case folding or Qnil. */ static bool -execute_charset (re_char **pp, int c, int corig, bool unibyte) +execute_charset (re_char **pp, int c, int corig, bool unibyte, + Lisp_Object canon_table) { eassume (0 <= c && 0 <= corig); re_char *p = *pp, *rtp = NULL; @@ -3617,11 +3619,9 @@ execute_charset (re_char **pp, int c, int corig, bool unibyte) (class_bits & BIT_BLANK && ISBLANK (c)) || (class_bits & BIT_WORD && ISWORD (c)) || ((class_bits & BIT_UPPER) && - (ISUPPER (c) || (corig != c && - c == downcase (corig) && ISLOWER (c)))) || + (ISUPPER (corig) || (!NILP (canon_table) && ISLOWER (corig)))) || ((class_bits & BIT_LOWER) && - (ISLOWER (c) || (corig != c && - c == upcase (corig) && ISUPPER(c)))) || + (ISLOWER (corig) || (!NILP (canon_table) && ISUPPER (corig)))) || (class_bits & BIT_PUNCT && ISPUNCT (c)) || (class_bits & BIT_GRAPH && ISGRAPH (c)) || (class_bits & BIT_PRINT && ISPRINT (c))) @@ -3696,7 +3696,8 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, else if ((re_opcode_t) *p1 == charset || (re_opcode_t) *p1 == charset_not) { - if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c))) + if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c), + Qnil)) { DEBUG_PRINT (" No match => fast loop.\n"); return true; @@ -4367,7 +4368,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, } p -= 1; - if (!execute_charset (&p, c, corig, unibyte_char)) + if (!execute_charset (&p, c, corig, unibyte_char, translate)) goto fail; d += len; diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el index f9372e37b11..576630aa5af 100644 --- a/test/src/regex-emacs-tests.el +++ b/test/src/regex-emacs-tests.el @@ -803,4 +803,61 @@ This evaluates the TESTS test cases from glibc." (should-not (string-match "å" "\xe5")) (should-not (string-match "[å]" "\xe5"))) +(ert-deftest regexp-case-fold () + "Test case-sensitive and case-insensitive matching." + (let ((case-fold-search nil)) + (should (equal (string-match "aB" "ABaB") 2)) + (should (equal (string-match "åÄ" "ÅäåäÅÄåÄ") 6)) + (should (equal (string-match "λΛ" "lΛλλΛ") 3)) + (should (equal (string-match "шШ" "zШшшШ") 3)) + (should (equal (string-match "[[:alpha:]]+" ".3aBåÄßλΛшШ中﷽") 2)) + (should (equal (match-end 0) 12)) + (should (equal (string-match "[[:alnum:]]+" ".3aBåÄßλΛшШ中﷽") 1)) + (should (equal (match-end 0) 12)) + (should (equal (string-match "[[:upper:]]+" ".3aåλшBÄΛШ中﷽") 6)) + (should (equal (match-end 0) 10)) + (should (equal (string-match "[[:lower:]]+" ".3BÄΛШaåλш中﷽") 6)) + (should (equal (match-end 0) 10))) + (let ((case-fold-search t)) + (should (equal (string-match "aB" "ABaB") 0)) + (should (equal (string-match "åÄ" "ÅäåäÅÄåÄ") 0)) + (should (equal (string-match "λΛ" "lΛλλΛ") 1)) + (should (equal (string-match "шШ" "zШшшШ") 1)) + (should (equal (string-match "[[:alpha:]]+" ".3aBåÄßλΛшШ中﷽") 2)) + (should (equal (match-end 0) 12)) + (should (equal (string-match "[[:alnum:]]+" ".3aBåÄßλΛшШ中﷽") 1)) + (should (equal (match-end 0) 12)) + (should (equal (string-match "[[:upper:]]+" ".3aåλшBÄΛШ中﷽") 2)) + (should (equal (match-end 0) 10)) + (should (equal (string-match "[[:lower:]]+" ".3BÄΛШaåλш中﷽") 2)) + (should (equal (match-end 0) 10)))) + +(ert-deftest regexp-eszett () + "Test matching of ß and ẞ." + ;; ß is a lower-case letter (Ll); ẞ is an upper-case letter (Lu). + (let ((case-fold-search nil)) + (should (equal (string-match "ß" "ß") 0)) + (should (equal (string-match "ß" "ẞ") nil)) + (should (equal (string-match "ẞ" "ß") nil)) + (should (equal (string-match "ẞ" "ẞ") 0)) + (should (equal (string-match "[[:alpha:]]" "ß") 0)) + ;; bug#11309 + ;;(should (equal (string-match "[[:lower:]]" "ß") 0)) + ;;(should (equal (string-match "[[:upper:]]" "ß") nil)) + (should (equal (string-match "[[:alpha:]]" "ẞ") 0)) + (should (equal (string-match "[[:lower:]]" "ẞ") nil)) + (should (equal (string-match "[[:upper:]]" "ẞ") 0))) + (let ((case-fold-search t)) + (should (equal (string-match "ß" "ß") 0)) + (should (equal (string-match "ß" "ẞ") 0)) + (should (equal (string-match "ẞ" "ß") 0)) + (should (equal (string-match "ẞ" "ẞ") 0)) + (should (equal (string-match "[[:alpha:]]" "ß") 0)) + ;; bug#11309 + ;;(should (equal (string-match "[[:lower:]]" "ß") 0)) + ;;(should (equal (string-match "[[:upper:]]" "ß") 0)) + (should (equal (string-match "[[:alpha:]]" "ẞ") 0)) + (should (equal (string-match "[[:lower:]]" "ẞ") 0)) + (should (equal (string-match "[[:upper:]]" "ẞ") 0)))) + ;;; regex-emacs-tests.el ends here -- 2.39.2