From be91192ecb1e0dff794582cd463f0a6480d160ef Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Tue, 20 Jun 2023 12:12:50 +0200 Subject: [PATCH] Straighten regexp postfix operator after zero-width assertion parse The zero-width assertions \` \' \b \B were parsed in a sloppy way so that a following postfix repetition operator could yield surprising results. For instance, "\\b*" would act as "\\b\\*", and "xy\\b*" would act as "\\(?:xy\\b\\)*". Except for \` and ^, any following postfix operator now applies to the zero-width assertion itself only, which is predictable and consistent with other assertions, if useless in practice. For historical compatibility, an operator character following \` and ^ always becomes a literal. (Bug#64128) * src/regex-emacs.c (regex_compile): Set `laststart` appropriately for each zero-width assertion instead of leaving it with whatever value it had before. Remove a redundant condition. * test/src/regex-emacs-tests.el (regexp-tests-zero-width-assertion-repetition): New test. * doc/lispref/searching.texi (Regexp Special): Say that repetition operators are not special after \`, and that they work as expected after other backslash escapes. * etc/NEWS: Announce. --- doc/lispref/searching.texi | 6 +--- etc/NEWS | 8 +++++ src/regex-emacs.c | 15 ++++++-- test/src/regex-emacs-tests.el | 66 +++++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+), 7 deletions(-) diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 28230cea643..7c9893054d9 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -546,15 +546,11 @@ example, the regular expression that matches the @samp{\} character is For historical compatibility, a repetition operator is treated as ordinary if it appears at the start of a regular expression -or after @samp{^}, @samp{\(}, @samp{\(?:} or @samp{\|}. +or after @samp{^}, @samp{\`}, @samp{\(}, @samp{\(?:} or @samp{\|}. For example, @samp{*foo} is treated as @samp{\*foo}, and @samp{two\|^\@{2\@}} is treated as @samp{two\|^@{2@}}. It is poor practice to depend on this behavior; use proper backslash escaping anyway, regardless of where the repetition operator appears. -Also, a repetition operator should not immediately follow a backslash escape -that matches only empty strings, as Emacs has bugs in this area. -For example, it is unwise to use @samp{\b*}, which can be omitted -without changing the documented meaning of the regular expression. As a @samp{\} is not special inside a bracket expression, it can never remove the special meaning of @samp{-}, @samp{^} or @samp{]}. diff --git a/etc/NEWS b/etc/NEWS index d703b7e77be..7552640663f 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -475,6 +475,14 @@ symbol, and either that symbol is ':eval' and the second element of the list evaluates to 'nil' or the symbol's value as a variable is 'nil' or void. ++++ +** Regexp zero-width assertions followed by operators are better defined. +Previously, regexps such as "xy\\B*" would have ill-defined behaviour. +Now any operator following a zero-width assertion applies to that +assertion only (which is useless). For historical compatibility, an +operator character following '^' or '\`' becomes literal, but we +advise against relying on this. + * Lisp Changes in Emacs 30.1 diff --git a/src/regex-emacs.c b/src/regex-emacs.c index fea34df991b..9e298b81ebb 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -1716,7 +1716,8 @@ regex_compile (re_char *pattern, ptrdiff_t size, /* Address of start of the most recently finished expression. This tells, e.g., postfix * where to find the start of its - operand. Reset at the beginning of groups and alternatives. */ + operand. Reset at the beginning of groups and alternatives, + and after ^ and \` for dusty-deck compatibility. */ unsigned char *laststart = 0; /* Address of beginning of regexp, or inside of last group. */ @@ -1847,12 +1848,16 @@ regex_compile (re_char *pattern, ptrdiff_t size, case '^': if (! (p == pattern + 1 || at_begline_loc_p (pattern, p))) goto normal_char; + /* Special case for compatibility: postfix ops after ^ become + literals. */ + laststart = 0; BUF_PUSH (begline); break; case '$': if (! (p == pend || at_endline_loc_p (p, pend))) goto normal_char; + laststart = b; BUF_PUSH (endline); break; @@ -1892,7 +1897,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, /* Star, etc. applied to an empty pattern is equivalent to an empty pattern. */ - if (!laststart || laststart == b) + if (laststart == b) break; /* Now we know whether or not zero matches is allowed @@ -2544,18 +2549,24 @@ regex_compile (re_char *pattern, ptrdiff_t size, break; case 'b': + laststart = b; BUF_PUSH (wordbound); break; case 'B': + laststart = b; BUF_PUSH (notwordbound); break; case '`': + /* Special case for compatibility: postfix ops after \` become + literals, as for ^ (see above). */ + laststart = 0; BUF_PUSH (begbuf); break; case '\'': + laststart = b; BUF_PUSH (endbuf); break; diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el index 52d43775b8e..08a93dbf30e 100644 --- a/test/src/regex-emacs-tests.el +++ b/test/src/regex-emacs-tests.el @@ -883,4 +883,70 @@ This evaluates the TESTS test cases from glibc." (should (looking-at "x*\\(=\\|:\\)*")) (should (looking-at "x*=*?")))) +(ert-deftest regexp-tests-zero-width-assertion-repetition () + ;; Check compatibility behaviour with repetition operators after + ;; certain zero-width assertions (bug#64128). + + ;; This function is just to hide ugly regexps from relint so that it + ;; doesn't complain about them. + (cl-flet ((smatch (re str) (string-match re str))) + ;; Postfix operators after ^ and \` become literals, for historical + ;; compatibility. Only the first character of a lazy operator (like *?) + ;; becomes a literal. + (should (equal (smatch "^*a" "x\n*a") 2)) + (should (equal (smatch "^*?a" "x\n*a") 2)) + (should (equal (smatch "^*?a" "x\na") 2)) + (should (equal (smatch "^*?a" "x\n**a") nil)) + + (should (equal (smatch "\\`*a" "*a") 0)) + (should (equal (smatch "\\`*?a" "*a") 0)) + (should (equal (smatch "\\`*?a" "a") 0)) + (should (equal (smatch "\\`*?a" "**a") nil)) + + ;; Other zero-width assertions are treated as normal elements, so postfix + ;; operators apply to them alone (which is pointless but valid). + (should (equal (smatch "\\b*!" "*!") 1)) + (should (equal (smatch "!\\b+;" "!;") nil)) + (should (equal (smatch "!\\b+a" "!a") 0)) + + (should (equal (smatch "\\B*!" "*!") 1)) + (should (equal (smatch "!\\B+;" "!;") 0)) + (should (equal (smatch "!\\B+a" "!a") nil)) + + (should (equal (smatch "\\<*b" "*b") 1)) + (should (equal (smatch "a\\<*b" "ab") 0)) + (should (equal (smatch ";\\<*b" ";b") 0)) + (should (equal (smatch "a\\<+b" "ab") nil)) + (should (equal (smatch ";\\<+b" ";b") 0)) + + (should (equal (smatch "\\>*;" "*;") 1)) + (should (equal (smatch "a\\>*b" "ab") 0)) + (should (equal (smatch "a\\>*;" "a;") 0)) + (should (equal (smatch "a\\>+b" "ab") nil)) + (should (equal (smatch "a\\>+;" "a;") 0)) + + (should (equal (smatch "a\\'" "ab") nil)) + (should (equal (smatch "b\\'" "ab") 1)) + (should (equal (smatch "a\\'*b" "ab") 0)) + (should (equal (smatch "a\\'+" "ab") nil)) + (should (equal (smatch "b\\'+" "ab") 1)) + (should (equal (smatch "\\'+" "+") 1)) + + (should (equal (smatch "\\_<*b" "*b") 1)) + (should (equal (smatch "a\\_<*b" "ab") 0)) + (should (equal (smatch " \\_<*b" " b") 0)) + (should (equal (smatch "a\\_<+b" "ab") nil)) + (should (equal (smatch " \\_<+b" " b") 0)) + + (should (equal (smatch "\\_>*;" "*;") 1)) + (should (equal (smatch "a\\_>*b" "ab") 0)) + (should (equal (smatch "a\\_>* " "a ") 0)) + (should (equal (smatch "a\\_>+b" "ab") nil)) + (should (equal (smatch "a\\_>+ " "a ") 0)) + + (should (equal (smatch "\\=*b" "*b") 1)) + (should (equal (smatch "a\\=*b" "a*b") nil)) + (should (equal (smatch "a\\=*b" "ab") 0)) + )) + ;;; regex-emacs-tests.el ends here -- 2.39.2