Correct regexp matching of raw bytes

author Mattias Engdegård <mattiase@acm.org>

Fri, 28 Jun 2019 08:20:55 +0000 (10:20 +0200)

committer Mattias Engdegård <mattiase@acm.org>

Fri, 28 Jun 2019 15:30:18 +0000 (17:30 +0200)
author Mattias Engdegård <mattiase@acm.org>
Fri, 28 Jun 2019 08:20:55 +0000 (10:20 +0200)
committer Mattias Engdegård <mattiase@acm.org>
Fri, 28 Jun 2019 15:30:18 +0000 (17:30 +0200)
diff --git a/src/regex-emacs.c b/src/regex-emacs.c

index c353a78fb4f47c19429e5b9ed1c3edc12c7b85eb..5887eaa30c72108a511e6dfe7aff65aab927fcea 100644 (file)
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -2794,6 +2794,7 @@ static int
  analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
  {
    int j, k;
+  int nbits;
    bool not;
  
    /* If all elements for base leading-codes in fastmap is set, this
@@ -2854,7 +2855,14 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
                  each byte is a character.  Thus, this works in both
                  cases. */
               fastmap[p[1]] = 1;
-             if (! multibyte)
+             if (multibyte)
+               {
+                 /* Cover the case of matching a raw char in a
+                    multibyte regexp against unibyte.  */
+                 if (CHAR_BYTE8_HEAD_P (p[1]))
+                   fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
+               }
+             else
                 {
                   /* For the case of matching this unibyte regex
                      against multibyte, we must set a leading code of
@@ -2886,11 +2894,18 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
         case charset:
           if (!fastmap) break;
           not = (re_opcode_t) *(p - 1) == charset_not;
-         for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
-              j >= 0; j--)
+         nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
+         p++;
+         for (j = 0; j < nbits; j++)
             if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
               fastmap[j] = 1;
  
+         /* To match raw bytes (in the 80..ff range) against multibyte
+            strings, add their leading bytes to the fastmap.  */
+         for (j = 0x80; j < nbits; j++)
+           if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
+             fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
+
           if (/* Any leading code can possibly start a character
                  which doesn't match the specified set of characters.  */
               not
@@ -4251,8 +4266,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
                   }
                 p += pat_charlen;
                 d++;
+               mcnt -= pat_charlen;
               }
-           while (--mcnt);
+           while (mcnt > 0);
  
           break;
  
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el

index 0ae50c94d4c3fd8ebb2fad524d964c474435d1e1..50ed3e870a55dd7b9f828b00cd9319f8f4eb530e 100644 (file)
--- a/test/src/regex-emacs-tests.el
+++ b/test/src/regex-emacs-tests.el
@@ -683,4 +683,124 @@ This evaluates the TESTS test cases from glibc."
    (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x)))
    (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp))
  
+(ert-deftest regexp-unibyte-unibyte ()
+  "Test matching a unibyte regexp against a unibyte string."
+  ;; Sanity check
+  (should-not (multibyte-string-p "ab"))
+  (should-not (multibyte-string-p "\xff"))
+  ;; ASCII
+  (should (string-match "a[b]" "ab"))
+  ;; Raw
+  (should (string-match "\xf1" "\xf1"))
+  (should-not (string-match "\xf1" "\xc1\xb1"))
+  ;; Raw, char alt
+  (should (string-match "[\xf1]" "\xf1"))
+  (should-not (string-match "[\xf1]" "\xc1\xb1"))
+  ;; Raw range
+  (should (string-match "[\x82-\xd3]" "\xbb"))
+  (should-not (string-match "[\x82-\xd3]" "a"))
+  (should-not (string-match "[\x82-\xd3]" "\x81"))
+  (should-not (string-match "[\x82-\xd3]" "\xd4"))
+  ;; ASCII-raw range
+  (should (string-match "[f-\xd3]" "q"))
+  (should (string-match "[f-\xd3]" "\xbb"))
+  (should-not (string-match "[f-\xd3]" "e"))
+  (should-not (string-match "[f-\xd3]" "\xd4")))
+
+(ert-deftest regexp-multibyte-multibyte ()
+  "Test matching a multibyte regexp against a multibyte string."
+  ;; Sanity check
+  (should (multibyte-string-p "åü"))
+  ;; ASCII
+  (should (string-match (string-to-multibyte "a[b]")
+                        (string-to-multibyte "ab")))
+  ;; Unicode
+  (should (string-match "å[ü]z" "åüz"))
+  (should-not (string-match "ü" (string-to-multibyte "\xc3\xbc")))
+  ;; Raw
+  (should (string-match (string-to-multibyte "\xf1")
+                        (string-to-multibyte "\xf1")))
+  (should-not (string-match (string-to-multibyte "\xf1")
+                            (string-to-multibyte "\xc1\xb1")))
+  (should-not (string-match (string-to-multibyte "\xc1\xb1")
+                            (string-to-multibyte "\xf1")))
+  ;; Raw, char alt
+  (should (string-match (string-to-multibyte "[\xf1]")
+                        (string-to-multibyte "\xf1")))
+  ;; Raw range
+  (should (string-match (string-to-multibyte "[\x82-\xd3]")
+                        (string-to-multibyte "\xbb")))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "a"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "Å"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "ü"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\x81"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\xd4"))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match (string-to-multibyte "[f-\xd3]")
+                        (string-to-multibyte "q")))
+  (should (string-match (string-to-multibyte "[f-\xd3]")
+                        (string-to-multibyte "\xbb")))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "Å"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "ü"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
+  ;; Unicode-raw range: should be empty
+  (should-not (string-match "[å-\xd3]" "å"))
+  (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xd3")))
+  (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xbb")))
+  (should-not (string-match "[å-\xd3]" "ü"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "å" (string-to-multibyte "\xe5")))
+  (should-not (string-match "[å]" (string-to-multibyte "\xe5")))
+  (should-not (string-match "\xe5" "å"))
+  (should-not (string-match "[\xe5]" "å")))
+
+(ert-deftest regexp-unibyte-multibyte ()
+  "Test matching a unibyte regexp against a multibyte string."
+  ;; ASCII
+  (should (string-match "a[b]" (string-to-multibyte "ab")))
+  ;; Unicode
+  (should (string-match "a.[^b]c" (string-to-multibyte "aåüc")))
+  ;; Raw
+  (should (string-match "\xf1" (string-to-multibyte "\xf1")))
+  (should-not (string-match "\xc1\xb1" (string-to-multibyte "\xf1")))
+  ;; Raw, char alt
+  (should (string-match "[\xf1]" (string-to-multibyte "\xf1")))
+  (should-not (string-match "[\xc1][\xb1]" (string-to-multibyte "\xf1")))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match "[f-\xd3]" (string-to-multibyte "q")))
+  (should (string-match "[f-\xd3]" (string-to-multibyte "\xbb")))
+  (should-not (string-match "[f-\xd3]" "e"))
+  (should-not (string-match "[f-\xd3]" "Å"))
+  (should-not (string-match "[f-\xd3]" "ü"))
+  (should-not (string-match "[f-\xd3]" "\xd4"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "\xe5" "å"))
+  (should-not (string-match "[\xe5]" "å")))
+
+(ert-deftest regexp-multibyte-unibyte ()
+  "Test matching a multibyte regexp against a unibyte string."
+  ;; ASCII
+  (should (string-match (string-to-multibyte "a[b]") "ab"))
+  ;; Unicode
+  (should (string-match "a[^ü]c" "abc"))
+  (should-not (string-match "ü" "\xc3\xbc"))
+  ;; Raw
+  (should (string-match (string-to-multibyte "\xf1") "\xf1"))
+  (should-not (string-match (string-to-multibyte "\xf1") "\xc1\xb1"))
+  ;; Raw, char alt
+  (should (string-match (string-to-multibyte "[\xf1]") "\xf1"))
+  (should-not (string-match (string-to-multibyte "[\xf1]") "\xc1\xb1"))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match (string-to-multibyte "[f-\xd3]") "q"))
+  (should (string-match (string-to-multibyte "[f-\xd3]") "\xbb"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
+  ;; Unicode-raw range: should be empty
+  (should-not (string-match "[å-\xd3]" "\xd3"))
+  (should-not (string-match "[å-\xd3]" "\xbb"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "å" "\xe5"))
+  (should-not (string-match "[å]" "\xe5")))
+
  ;;; regex-emacs-tests.el ends here
author	Mattias Engdegård <mattiase@acm.org>
	Fri, 28 Jun 2019 08:20:55 +0000 (10:20 +0200)
committer	Mattias Engdegård <mattiase@acm.org>
	Fri, 28 Jun 2019 15:30:18 +0000 (17:30 +0200)
src/regex-emacs.c		patch \| blob \| history
test/src/regex-emacs-tests.el		patch \| blob \| history