From be4d6b043fa79e2d9a9911ca1c48bdcc84e3bba9 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= <mattiase@acm.org>
Date: Tue, 8 Dec 2020 12:47:58 +0100
Subject: [PATCH] Fix [:upper:] and [:lower:] for Unicode characters
 (bug#11309)

* src/regex-emacs.c (execute_charset): Add canon_table argument to
allow expression of a correct predicate for [:upper:] and [:lower:].
(mutually_exclusive_p, re_match_2_internal): Pass extra argument.
* test/src/regex-emacs-tests.el (regexp-case-fold, regexp-eszett):
New tests.  Parts of regexp-eszett still fail and are commented out.
---
 src/regex-emacs.c             | 17 ++++++-----
 test/src/regex-emacs-tests.el | 57 +++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 971a5f63749..904ca0c7b95 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3575,9 +3575,11 @@ skip_noops (re_char *p, re_char *pend)
    opcode.  When the function finishes, *PP will be advanced past that opcode.
    C is character to test (possibly after translations) and CORIG is original
    character (i.e. without any translations).  UNIBYTE denotes whether c is
-   unibyte or multibyte character. */
+   unibyte or multibyte character.
+   CANON_TABLE is the canonicalisation table for case folding or Qnil.  */
 static bool
-execute_charset (re_char **pp, int c, int corig, bool unibyte)
+execute_charset (re_char **pp, int c, int corig, bool unibyte,
+                 Lisp_Object canon_table)
 {
   eassume (0 <= c && 0 <= corig);
   re_char *p = *pp, *rtp = NULL;
@@ -3617,11 +3619,9 @@ execute_charset (re_char **pp, int c, int corig, bool unibyte)
           (class_bits & BIT_BLANK && ISBLANK (c)) ||
 	  (class_bits & BIT_WORD  && ISWORD  (c)) ||
 	  ((class_bits & BIT_UPPER) &&
-	   (ISUPPER (c) || (corig != c &&
-			    c == downcase (corig) && ISLOWER (c)))) ||
+	   (ISUPPER (corig) || (!NILP (canon_table) && ISLOWER (corig)))) ||
 	  ((class_bits & BIT_LOWER) &&
-	   (ISLOWER (c) || (corig != c &&
-			    c == upcase (corig) && ISUPPER(c)))) ||
+	   (ISLOWER (corig) || (!NILP (canon_table) && ISUPPER (corig)))) ||
 	  (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
 	  (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
 	  (class_bits & BIT_PRINT && ISPRINT (c)))
@@ -3696,7 +3696,8 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
 	else if ((re_opcode_t) *p1 == charset
 		 || (re_opcode_t) *p1 == charset_not)
 	  {
-	    if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c)))
+	    if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
+                                  Qnil))
 	      {
 		DEBUG_PRINT ("	 No match => fast loop.\n");
 		return true;
@@ -4367,7 +4368,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 	      }
 
 	    p -= 1;
-	    if (!execute_charset (&p, c, corig, unibyte_char))
+	    if (!execute_charset (&p, c, corig, unibyte_char, translate))
 	      goto fail;
 
 	    d += len;
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el
index f9372e37b11..576630aa5af 100644
--- a/test/src/regex-emacs-tests.el
+++ b/test/src/regex-emacs-tests.el
@@ -803,4 +803,61 @@ This evaluates the TESTS test cases from glibc."
   (should-not (string-match "Ã¥" "\xe5"))
   (should-not (string-match "[Ã¥]" "\xe5")))
 
+(ert-deftest regexp-case-fold ()
+  "Test case-sensitive and case-insensitive matching."
+  (let ((case-fold-search nil))
+    (should (equal (string-match "aB" "ABaB") 2))
+    (should (equal (string-match "Ã¥Ã" "ÃÃ¤Ã¥Ã¤ÃÃÃ¥Ã") 6))
+    (should (equal (string-match "Î»Î" "lÎÎ»Î»Î") 3))
+    (should (equal (string-match "ÑÐ¨" "zÐ¨ÑÑÐ¨") 3))
+    (should (equal (string-match "[[:alpha:]]+" ".3aBÃ¥ÃÃÎ»ÎÑÐ¨ä¸­ï·½") 2))
+    (should (equal (match-end 0) 12))
+    (should (equal (string-match "[[:alnum:]]+" ".3aBÃ¥ÃÃÎ»ÎÑÐ¨ä¸­ï·½") 1))
+    (should (equal (match-end 0) 12))
+    (should (equal (string-match "[[:upper:]]+" ".3aÃ¥Î»ÑBÃÎÐ¨ä¸­ï·½") 6))
+    (should (equal (match-end 0) 10))
+    (should (equal (string-match "[[:lower:]]+" ".3BÃÎÐ¨aÃ¥Î»Ñä¸­ï·½") 6))
+    (should (equal (match-end 0) 10)))
+  (let ((case-fold-search t))
+    (should (equal (string-match "aB" "ABaB") 0))
+    (should (equal (string-match "Ã¥Ã" "ÃÃ¤Ã¥Ã¤ÃÃÃ¥Ã") 0))
+    (should (equal (string-match "Î»Î" "lÎÎ»Î»Î") 1))
+    (should (equal (string-match "ÑÐ¨" "zÐ¨ÑÑÐ¨") 1))
+    (should (equal (string-match "[[:alpha:]]+" ".3aBÃ¥ÃÃÎ»ÎÑÐ¨ä¸­ï·½") 2))
+    (should (equal (match-end 0) 12))
+    (should (equal (string-match "[[:alnum:]]+" ".3aBÃ¥ÃÃÎ»ÎÑÐ¨ä¸­ï·½") 1))
+    (should (equal (match-end 0) 12))
+    (should (equal (string-match "[[:upper:]]+" ".3aÃ¥Î»ÑBÃÎÐ¨ä¸­ï·½") 2))
+    (should (equal (match-end 0) 10))
+    (should (equal (string-match "[[:lower:]]+" ".3BÃÎÐ¨aÃ¥Î»Ñä¸­ï·½") 2))
+    (should (equal (match-end 0) 10))))
+
+(ert-deftest regexp-eszett ()
+  "Test matching of ß and ẞ."
+  ;; ß is a lower-case letter (Ll); ẞ is an upper-case letter (Lu).
+  (let ((case-fold-search nil))
+    (should (equal (string-match "Ã" "Ã") 0))
+    (should (equal (string-match "Ã" "áº") nil))
+    (should (equal (string-match "áº" "Ã") nil))
+    (should (equal (string-match "áº" "áº") 0))
+    (should (equal (string-match "[[:alpha:]]" "Ã") 0))
+    ;; bug#11309
+    ;;(should (equal (string-match "[[:lower:]]" "Ã") 0))
+    ;;(should (equal (string-match "[[:upper:]]" "Ã") nil))
+    (should (equal (string-match "[[:alpha:]]" "áº") 0))
+    (should (equal (string-match "[[:lower:]]" "áº") nil))
+    (should (equal (string-match "[[:upper:]]" "áº") 0)))
+  (let ((case-fold-search t))
+    (should (equal (string-match "Ã" "Ã") 0))
+    (should (equal (string-match "Ã" "áº") 0))
+    (should (equal (string-match "áº" "Ã") 0))
+    (should (equal (string-match "áº" "áº") 0))
+    (should (equal (string-match "[[:alpha:]]" "Ã") 0))
+    ;; bug#11309
+    ;;(should (equal (string-match "[[:lower:]]" "Ã") 0))
+    ;;(should (equal (string-match "[[:upper:]]" "Ã") 0))
+    (should (equal (string-match "[[:alpha:]]" "áº") 0))
+    (should (equal (string-match "[[:lower:]]" "áº") 0))
+    (should (equal (string-match "[[:upper:]]" "áº") 0))))
+
 ;;; regex-emacs-tests.el ends here
-- 
2.39.5