From 37130fd500fbf78ff0d0037aa6275f0f70a415dd Mon Sep 17 00:00:00 2001 From: Stefan Monnier Date: Tue, 3 Oct 2023 10:10:57 -0400 Subject: [PATCH] regex.c: Fix recent regression with mutually_exclusive_p The new analysis code ended up increasing the scope of an optimization a bit too far. Reign it in. * src/regex-emacs.c (struct mutexcl_data): Add `unconstrained` field. (mutually_exclusive_one): Use and set it. (mutually_exclusive_p): Initialize it. * test/src/regex-emacs-tests.el (regexp-tests-backtrack-optimization): Add test. --- src/regex-emacs.c | 41 ++++++++++++++++++++++++++++------- test/src/regex-emacs-tests.el | 9 ++++---- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/src/regex-emacs.c b/src/regex-emacs.c index ffb8891d3a6..95c3366652d 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -3899,6 +3899,7 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1, struct mutexcl_data { struct re_pattern_buffer *bufp; re_char *p1; + bool unconstrained; }; static bool @@ -3907,7 +3908,32 @@ mutually_exclusive_one (re_char *p2, void *arg) struct mutexcl_data *data = arg; switch (*p2) { + case succeed: + /* If `p1` matches, `succeed` can still match, so we should return + `false`. *BUT* when N iterations of `p1` and N+1 iterations of `p1` + match, the `succeed` that comes after N+1 always takes precedence + over the one after N because we always prefer a longer match, so + the succeed after N can actually be replaced by a "fail" without + changing the end result. + In this sense, "if `p1` matches, `succeed` can't match". + So we can return `true`. + *BUT* this only holds if we're sure that the N+1 will indeed succeed, + so we need to make sure there is no other matching operator between + the exit of the iteration and the `succeed`. */ + return data->unconstrained; + +/* Remember that there may be an empty matching operator on the way. + If we return true, this is the "end" of this control flow path, + so it can't get in the way of a subsequent `succeed. */ +#define RETURN_CONSTRAIN(v) \ + { bool tmp = (v); \ + if (!tmp) \ + data->unconstrained = false; \ + return tmp; \ + } + case endline: + RETURN_CONSTRAIN (mutually_exclusive_exactn (data->bufp, data->p1, p2)); case exactn: return mutually_exclusive_exactn (data->bufp, data->p1, p2); case charset: @@ -3945,18 +3971,17 @@ mutually_exclusive_one (re_char *p2, void *arg) return (*data->p1 == categoryspec && data->p1[1] == p2[1]); case endbuf: - case succeed: return true; case wordbeg: - return (*data->p1 == notsyntaxspec && data->p1[1] == Sword); + RETURN_CONSTRAIN (*data->p1 == notsyntaxspec && data->p1[1] == Sword); case wordend: - return (*data->p1 == syntaxspec && data->p1[1] == Sword); + RETURN_CONSTRAIN (*data->p1 == syntaxspec && data->p1[1] == Sword); case symbeg: - return (*data->p1 == notsyntaxspec - && (data->p1[1] == Ssymbol || data->p1[1] == Sword)); + RETURN_CONSTRAIN (*data->p1 == notsyntaxspec + && (data->p1[1] == Ssymbol || data->p1[1] == Sword)); case symend: - return (*data->p1 == syntaxspec - && (data->p1[1] == Ssymbol || data->p1[1] == Sword)); + RETURN_CONSTRAIN (*data->p1 == syntaxspec + && (data->p1[1] == Ssymbol || data->p1[1] == Sword)); case duplicate: /* At this point, we know nothing about what this can match, sadly. */ @@ -3976,7 +4001,7 @@ static bool mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, re_char *p2) { - struct mutexcl_data data = { bufp, p1 }; + struct mutexcl_data data = { bufp, p1, true }; return forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data); } diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el index 621e4dbe2c0..615d905e140 100644 --- a/test/src/regex-emacs-tests.el +++ b/test/src/regex-emacs-tests.el @@ -555,10 +555,10 @@ known/benign differences in behavior.") (defconst regex-tests-PTESTS-whitelist [ - ;; emacs doesn't see DEL (0x7f) as a [:cntrl:] character + ;; Emacs doesn't see DEL (0x7f) as a [:cntrl:] character 138 - ;; emacs doesn't barf on weird ranges such as [b-a], but simply + ;; Emacs doesn't barf on weird ranges such as [b-a], but simply ;; fails to match 168 ] @@ -872,14 +872,14 @@ This evaluates the TESTS test cases from glibc." (should (equal (string-match "\\`\\(?:ab\\)*\\'" "a") nil)) (should (equal (string-match "\\`a\\{2\\}*\\'" "a") nil))) -(ert-deftest regexp-tests-backtrack-optimization () ;bug#61514 +(ert-deftest regexp-tests-backtrack-optimization () ;; Make sure we don't use up the regexp stack needlessly. (with-current-buffer (get-buffer-create "*bug*") (erase-buffer) (insert (make-string 1000000 ?x) "=") (goto-char (point-min)) ;; Make sure we do perform the optimization (if we don't, the - ;; below will burp with regexp-stack-overflow). + ;; below will burp with regexp-stack-overflow). ;bug#61514 (should (looking-at "x*=*")) (should (looking-at "x*\\(=\\|:\\)")) (should (looking-at "x*\\(=\\|:\\)*")) @@ -908,6 +908,7 @@ This evaluates the TESTS test cases from glibc." (should (eq 0 (string-match "\\(ca*\\|ab\\)+d" "cabd"))) (should (string-match "\\(aa*\\|b\\)*c" "ababc")) (should (string-match " \\sw*\\bfoo" " foo")) + (should (string-match ".*\\>" "hello ")) )) (ert-deftest regexp-tests-zero-width-assertion-repetition () -- 2.39.2