From: Stefan Monnier Date: Fri, 15 Sep 2023 18:44:59 +0000 (-0400) Subject: (mutually_exclusive_p): Fix the regression from commit 6fad73d7cc53 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=e7e925f062f8d50375daa40ad7981c6d44cd7f05;p=emacs.git (mutually_exclusive_p): Fix the regression from commit 6fad73d7cc53 Commit 6fad73d7cc53 throws away some useful optimization because it misfired in some cases (as seen in bug#657260). Here we try to recover those useful optimizations with a slightly more careful algorithm. * src/regex-emacs.c (mutually_exclusive_aux): Rename from `mutually_exclusive_p`. Add two new args. Improve the case where we need to recurse. (mutually_exclusive_p): New function defined on top of it. * test/src/regex-emacs-tests.el (regexp-tests-backtrack-optimization): Re-enable the test. --- diff --git a/src/regex-emacs.c b/src/regex-emacs.c index 52f240bdaf6..55d0a6e8df8 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -3743,9 +3743,20 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1, } /* True if "p1 matches something" implies "p2 fails". */ +/* Avoiding inf-loops: + We're trying to follow all paths reachable from `p2`, but since some + loops can match the empty string, this can loop back to `p2`. + To avoid inf-looping, we keep track of points that have been considered + "already". Instead of keeping a list of such points, `done_beg` and + `done_end` delimit a chunk of bytecode we already considered. + To guarantee termination, a lexical ordering between `done_*` and `p2` + should be obeyed: + At each recursion, either `done_beg` gets smaller, + or `done_beg` is unchanged and `done_end` gets larger + or they're both unchanged and `p2` gets larger. */ static bool -mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, - re_char *p2) +mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1, + re_char *p2, re_char *done_beg, re_char *done_end) { re_opcode_t op2; unsigned char *pend = bufp->buffer + bufp->used; @@ -3754,6 +3765,9 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, eassert (p1 >= bufp->buffer && p1 < pend && p2 >= bufp->buffer && p2 <= pend); + eassert (done_beg <= done_end); + eassert (done_end <= p2); + /* Skip over open/close-group commands. If what follows this loop is a ...+ construct, look at what begins its body, since we will have to @@ -3844,12 +3858,29 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, int mcnt; p2++; EXTRACT_NUMBER_AND_INCR (mcnt, p2); - /* Don't just test `mcnt > 0` because non-greedy loops have - their test at the end with an unconditional jump at the start. */ - if (p2 > p2_orig && mcnt >= 0) /* Ensure forward progress. */ - return (mutually_exclusive_p (bufp, p1, p2) - && mutually_exclusive_p (bufp, p1, p2 + mcnt)); - break; + re_char *p2_other = p2 + mcnt; + + /* When we jump backward we bump `done_end` up to `p3` under + the assumption that any other position between `done_end` + and `p3` is either: + - checked by the other call to RECURSE. + - not reachable from here (e.g. for positions before the + `on_failure_jump`), or at least not without first + jumping before `done_beg`. + This should hold because our state machines are not arbitrary: + they consists of syntaxically nested loops with limited + control flow. + FIXME: This can fail (i.e. return true when it shouldn't) + if we start generating bytecode with a different shape, + so maybe we should bite the bullet and replace done_beg/end + with an actual list of positions we've already processed. */ +#define RECURSE(p3) \ + ((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \ + : (p3) <= done_end ? true \ + : mutually_exclusive_aux (bufp, p1, p3, done_beg, \ + (p3) > p2_orig ? done_end : (p3))) + + return RECURSE (p2) && RECURSE (p2_other); } default: @@ -3860,6 +3891,12 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, return false; } +static bool +mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1, + re_char *p2) +{ + return mutually_exclusive_aux (bufp, p1, p2, p2, p2); +} /* Matching routines. */ diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el index ed0dc4c5a9d..f2bee713864 100644 --- a/test/src/regex-emacs-tests.el +++ b/test/src/regex-emacs-tests.el @@ -873,7 +873,6 @@ This evaluates the TESTS test cases from glibc." (should (equal (string-match "\\`a\\{2\\}*\\'" "a") nil))) (ert-deftest regexp-tests-backtrack-optimization () ;bug#61514 - :expected-result :failed ;; Make sure we don't use up the regexp stack needlessly. (with-current-buffer (get-buffer-create "*bug*") (erase-buffer)