From: Mattias Engdegård Date: Sat, 22 Jul 2023 15:26:11 +0000 (+0200) Subject: Fix regexp character class syntax property ghost matching bug X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=5d2d28458d0eb378a7e94363ef716e8648ef129a;p=emacs.git Fix regexp character class syntax property ghost matching bug The syntax-table-dependent regexp character classes [:space:], [:word:] and [:punct:] always use the buffer-local syntax table for performance reasons. Fix a bug that could cause ghost (mis)matches from use of lingering state by constructs that do use syntax properties, such as `\sX`. * src/regex-emacs.c (BUFFER_SYNTAX): New macro. (ISPUNCT, ISSPACE, ISWORD): Use BUFFER_SYNTAX instead of SYNTAX. (regex_compile): Delete syntax table setup code that is no longer needed. * test/src/regex-emacs-tests.el (regex-emacs-syntax-properties): New regression test. --- diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 51fc2b0558d..7e75f0ac597 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -47,6 +47,9 @@ /* Make syntax table lookup grant data in gl_state. */ #define SYNTAX(c) syntax_property (c, 1) +/* Explicit syntax lookup using the buffer-local table. */ +#define BUFFER_SYNTAX(c) syntax_property (c, 0) + #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) #define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte) #define RE_STRING_CHAR(p, multibyte) \ @@ -132,18 +135,22 @@ #define ISLOWER(c) lowercasep (c) +#define ISUPPER(c) uppercasep (c) + +/* The following predicates use the buffer-local syntax table and + ignore syntax properties, for consistency with the up-front + assumptions made at compile time. */ + #define ISPUNCT(c) (IS_REAL_ASCII (c) \ ? ((c) > ' ' && (c) < 0177 \ && !(((c) >= 'a' && (c) <= 'z') \ || ((c) >= 'A' && (c) <= 'Z') \ || ((c) >= '0' && (c) <= '9'))) \ - : SYNTAX (c) != Sword) + : BUFFER_SYNTAX (c) != Sword) -#define ISSPACE(c) (SYNTAX (c) == Swhitespace) +#define ISSPACE(c) (BUFFER_SYNTAX (c) == Swhitespace) -#define ISUPPER(c) uppercasep (c) - -#define ISWORD(c) (SYNTAX (c) == Sword) +#define ISWORD(c) (BUFFER_SYNTAX (c) == Sword) /* Use alloca instead of malloc. This is because using malloc in re_search* or re_match* could cause memory leaks when C-g is used @@ -2048,13 +2055,6 @@ regex_compile (re_char *pattern, ptrdiff_t size, is_xdigit, since they can only match ASCII characters. We don't need to handle them for multibyte. */ - /* Setup the gl_state object to its buffer-defined value. - This hardcodes the buffer-global syntax-table for ASCII - chars, while the other chars will obey syntax-table - properties. It's not ideal, but it's the way it's been - done until now. */ - SETUP_BUFFER_SYNTAX_TABLE (); - for (c = 0; c < 0x80; ++c) if (re_iswctype (c, cc)) { diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el index 08a93dbf30e..4e2c0f67a44 100644 --- a/test/src/regex-emacs-tests.el +++ b/test/src/regex-emacs-tests.el @@ -949,4 +949,20 @@ This evaluates the TESTS test cases from glibc." (should (equal (smatch "a\\=*b" "ab") 0)) )) +(ert-deftest regex-emacs-syntax-properties () + ;; Verify absence of character class syntax property ghost matching bug. + (let ((re "\\s-[[:space:]]") + (s (concat "a" + (propertize "b" 'syntax-table '(0)) ; whitespace + "éz")) + (parse-sexp-lookup-properties t)) + ;; Test matching in a string... + (should (equal (string-match re s) nil)) + ;; ... and in a buffer. + (should (equal (with-temp-buffer + (insert s) + (goto-char (point-min)) + (re-search-forward re nil t)) + nil)))) + ;;; regex-emacs-tests.el ends here