From: Daniel Colascione Date: Sat, 16 Jun 2018 20:46:10 +0000 (-0700) Subject: Make regex matching reentrant; update syntax during match X-Git-Tag: emacs-27.0.90~4827 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=938d252d1c6c5e2027aa250c649deb024154f936;p=emacs.git Make regex matching reentrant; update syntax during match * src/lisp.h (compile_pattern): Remove prototype of now-internal function. * src/regex.c (POS_AS_IN_BUFFER): Consult gl_state instead of re_match_object: the latter can change in Lisp. (re_match_2_internal): Switch back to UPDATE_SYNTAX_* FROM UPDATE_SYNTAX_FAST*, allowing calls into Lisp. * src/regex.h (re_match_object): Uncomment declaration. * src/search.c (struct regexp_cache): Add `busy' field. (thaw_buffer_relocation): Delete; rely on unbind. (compile_pattern_1): Assert pattern isn't busy. (shrink_regexp_cache): Don't shrink busy patterns. (clear_regexp_cache): Don't nuke busy patterns. (unfreeze_pattern, freeze_pattern): New functions. (compile_pattern): Return a regexp_cache pointer instead of the re_pattern_buffer, allowing callers to use `freeze_pattern' if needed. Do not consider busy patterns as cache hit candidates; error if we run out of non-busy cache entries. (looking_at_1, fast_looking_at): Snapshot Vinhibit_changing_match_data; mark pattern busy while we're matching it; unbind. (string_match_1, fast_string_match_internal) (fast_c_string_match_ignore_case): Adjust for compile_pattern return type. (search_buffer_re): Regex code from old search_buffer moved here; snapshot Vinhibit_changing_match_data; mark pattern busy while we're matching it; unbind. (search_buffer_non_re): Non-regex code from old search_buffer moved here. (search_buffer): Split into search_buffer_re, search_buffer_non_re. (syms_of_search): Staticpro re_match_object, even though we really shouldn't have to. * src/syntax.h (UPDATE_SYNTAX_TABLE_FORWARD_FAST): (UPDATE_SYNTAX_TABLE_FAST): Remove. * src/thread.h (struct thread_state): Remove m_re_match_object, which is global again. (It never needs to be preserved across thread switch.) --- diff --git a/src/lisp.h b/src/lisp.h index ff708ebf60e..d0c52d85672 100644 --- a/src/lisp.h +++ b/src/lisp.h @@ -4029,10 +4029,6 @@ extern void restore_search_regs (void); extern void update_search_regs (ptrdiff_t oldstart, ptrdiff_t oldend, ptrdiff_t newend); extern void record_unwind_save_match_data (void); -struct re_registers; -extern struct re_pattern_buffer *compile_pattern (Lisp_Object, - struct re_registers *, - Lisp_Object, bool, bool); extern ptrdiff_t fast_string_match_internal (Lisp_Object, Lisp_Object, Lisp_Object); diff --git a/src/regex.c b/src/regex.c index 85e63feea10..b8c6f3f19b2 100644 --- a/src/regex.c +++ b/src/regex.c @@ -155,7 +155,8 @@ # define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d)) /* Strings are 0-indexed, buffers are 1-indexed; we pun on the boolean result to get the right base index. */ -# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) +# define POS_AS_IN_BUFFER(p) \ + ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object))) # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte) @@ -1233,6 +1234,15 @@ static const char *re_error_msgid[] = # undef MATCH_MAY_ALLOCATE #endif +/* While regex matching of a single compiled pattern isn't reentrant + (because we compile regexes to bytecode programs, and the bytecode + programs are self-modifying), the regex machinery must nevertheless + be reentrant with respect to _different_ patterns, and we do that + by avoiding global variables and using MATCH_MAY_ALLOCATE. */ +#if !defined MATCH_MAY_ALLOCATE && defined emacs +# error "Emacs requires MATCH_MAY_ALLOCATE" +#endif + /* Failure stack declarations and macros; both re_compile_fastmap and re_match_2 use a failure stack. These have to be macros because of @@ -5895,12 +5905,12 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d - 1); ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE_FAST (charpos); + UPDATE_SYNTAX_TABLE (charpos); #endif GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); s1 = SYNTAX (c1); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos + 1); + UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); #endif PREFETCH_NOLIMIT (); GET_CHAR_AFTER (c2, d, dummy); @@ -5937,7 +5947,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d); ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE_FAST (charpos); + UPDATE_SYNTAX_TABLE (charpos); #endif PREFETCH (); GET_CHAR_AFTER (c2, d, dummy); @@ -5982,7 +5992,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d) - 1; ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE_FAST (charpos); + UPDATE_SYNTAX_TABLE (charpos); #endif GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); s1 = SYNTAX (c1); @@ -5997,7 +6007,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, PREFETCH_NOLIMIT (); GET_CHAR_AFTER (c2, d, dummy); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos); + UPDATE_SYNTAX_TABLE_FORWARD (charpos); #endif s2 = SYNTAX (c2); @@ -6026,7 +6036,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d); ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE_FAST (charpos); + UPDATE_SYNTAX_TABLE (charpos); #endif PREFETCH (); c2 = RE_STRING_CHAR (d, target_multibyte); @@ -6069,7 +6079,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d) - 1; ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE_FAST (charpos); + UPDATE_SYNTAX_TABLE (charpos); #endif GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); s1 = SYNTAX (c1); @@ -6084,7 +6094,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, PREFETCH_NOLIMIT (); c2 = RE_STRING_CHAR (d, target_multibyte); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos + 1); + UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); #endif s2 = SYNTAX (c2); @@ -6107,7 +6117,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, { ssize_t offset = PTR_TO_OFFSET (d); ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE_FAST (pos1); + UPDATE_SYNTAX_TABLE (pos1); } #endif { diff --git a/src/regex.h b/src/regex.h index 082f7e010d8..3a2d74d86a1 100644 --- a/src/regex.h +++ b/src/regex.h @@ -181,8 +181,15 @@ typedef unsigned long reg_syntax_t; string; if it's nil, we are matching text in the current buffer; if it's t, we are matching text in a C string. - This is defined as a macro in thread.h, which see. */ -/* extern Lisp_Object re_match_object; */ + This value is effectively another parameter to re_search_2 and + re_match_2. No calls into Lisp or thread switches are allowed + before setting re_match_object and calling into the regex search + and match functions. These functions capture the current value of + re_match_object into gl_state on entry. + + TODO: once we get rid of the !emacs case in this code, turn into an + actual function parameter. */ +extern Lisp_Object re_match_object; #endif /* Roughly the maximum number of failure points on the stack. */ diff --git a/src/search.c b/src/search.c index a21c01ca4b4..ccdb659776d 100644 --- a/src/search.c +++ b/src/search.c @@ -48,6 +48,8 @@ struct regexp_cache char fastmap[0400]; /* True means regexp was compiled to do full POSIX backtracking. */ bool posix; + /* True means we're inside a buffer match. */ + bool busy; }; /* The instances of that struct. */ @@ -93,6 +95,8 @@ static EMACS_INT search_buffer (Lisp_Object, ptrdiff_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, EMACS_INT, int, Lisp_Object, Lisp_Object, bool); +Lisp_Object re_match_object; + static _Noreturn void matcher_overflow (void) { @@ -110,14 +114,6 @@ freeze_buffer_relocation (void) #endif } -static void -thaw_buffer_relocation (void) -{ -#ifdef REL_ALLOC - unbind_to (SPECPDL_INDEX () - 1, Qnil); -#endif -} - /* Compile a regexp and signal a Lisp error if anything goes wrong. PATTERN is the pattern to compile. CP is the place to put the result. @@ -134,6 +130,7 @@ compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, const char *whitespace_regexp; char *val; + eassert (!cp->busy); cp->regexp = Qnil; cp->buf.translate = (! NILP (translate) ? translate : make_number (0)); cp->posix = posix; @@ -170,10 +167,11 @@ shrink_regexp_cache (void) struct regexp_cache *cp; for (cp = searchbuf_head; cp != 0; cp = cp->next) - { - cp->buf.allocated = cp->buf.used; - cp->buf.buffer = xrealloc (cp->buf.buffer, cp->buf.used); - } + if (!cp->busy) + { + cp->buf.allocated = cp->buf.used; + cp->buf.buffer = xrealloc (cp->buf.buffer, cp->buf.used); + } } /* Clear the regexp cache w.r.t. a particular syntax table, @@ -190,10 +188,25 @@ clear_regexp_cache (void) /* It's tempting to compare with the syntax-table we've actually changed, but it's not sufficient because char-table inheritance means that modifying one syntax-table can change others at the same time. */ - if (!EQ (searchbufs[i].syntax_table, Qt)) + if (!searchbufs[i].busy && !EQ (searchbufs[i].syntax_table, Qt)) searchbufs[i].regexp = Qnil; } +static void +unfreeze_pattern (void *arg) +{ + struct regexp_cache *searchbuf = arg; + searchbuf->busy = false; +} + +static void +freeze_pattern (struct regexp_cache *searchbuf) +{ + eassert (!searchbuf->busy); + record_unwind_protect_ptr (unfreeze_pattern, searchbuf); + searchbuf->busy = true; +} + /* Compile a regexp if necessary, but first check to see if there's one in the cache. PATTERN is the pattern to compile. @@ -205,7 +218,7 @@ clear_regexp_cache (void) POSIX is true if we want full backtracking (POSIX style) for this pattern. False means backtrack only enough to get a valid match. */ -struct re_pattern_buffer * +static struct regexp_cache * compile_pattern (Lisp_Object pattern, struct re_registers *regp, Lisp_Object translate, bool posix, bool multibyte) { @@ -222,6 +235,7 @@ compile_pattern (Lisp_Object pattern, struct re_registers *regp, if (NILP (cp->regexp)) goto compile_it; if (SCHARS (cp->regexp) == SCHARS (pattern) + && !cp->busy && STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern) && !NILP (Fstring_equal (cp->regexp, pattern)) && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0))) @@ -237,7 +251,10 @@ compile_pattern (Lisp_Object pattern, struct re_registers *regp, string value. */ if (cp->next == 0) { + if (cp->busy) + error ("Too much matching reentrancy"); compile_it: + eassert (!cp->busy); compile_pattern_1 (cp, pattern, translate, posix); break; } @@ -258,8 +275,7 @@ compile_pattern (Lisp_Object pattern, struct re_registers *regp, /* The compiled pattern can be used both for multibyte and unibyte target. But, we have to tell which the pattern is used for. */ cp->buf.target_multibyte = multibyte; - - return &cp->buf; + return cp; } @@ -270,7 +286,6 @@ looking_at_1 (Lisp_Object string, bool posix) unsigned char *p1, *p2; ptrdiff_t s1, s2; register ptrdiff_t i; - struct re_pattern_buffer *bufp; if (running_asynch_code) save_search_regs (); @@ -280,13 +295,17 @@ looking_at_1 (Lisp_Object string, bool posix) BVAR (current_buffer, case_eqv_table)); CHECK_STRING (string); - bufp = compile_pattern (string, - (NILP (Vinhibit_changing_match_data) - ? &search_regs : NULL), - (!NILP (BVAR (current_buffer, case_fold_search)) - ? BVAR (current_buffer, case_canon_table) : Qnil), - posix, - !NILP (BVAR (current_buffer, enable_multibyte_characters))); + + /* Snapshot in case Lisp changes the value. */ + bool preserve_match_data = NILP (Vinhibit_changing_match_data); + + struct regexp_cache *cache_entry = compile_pattern ( + string, + preserve_match_data ? &search_regs : NULL, + (!NILP (BVAR (current_buffer, case_fold_search)) + ? BVAR (current_buffer, case_canon_table) : Qnil), + posix, + !NILP (BVAR (current_buffer, enable_multibyte_characters))); /* Do a pending quit right away, to avoid paradoxical behavior */ maybe_quit (); @@ -310,21 +329,20 @@ looking_at_1 (Lisp_Object string, bool posix) s2 = 0; } - re_match_object = Qnil; - + ptrdiff_t count = SPECPDL_INDEX (); freeze_buffer_relocation (); - i = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2, + freeze_pattern (cache_entry); + re_match_object = Qnil; + i = re_match_2 (&cache_entry->buf, (char *) p1, s1, (char *) p2, s2, PT_BYTE - BEGV_BYTE, - (NILP (Vinhibit_changing_match_data) - ? &search_regs : NULL), + preserve_match_data ? &search_regs : NULL, ZV_BYTE - BEGV_BYTE); - thaw_buffer_relocation (); if (i == -2) matcher_overflow (); val = (i >= 0 ? Qt : Qnil); - if (NILP (Vinhibit_changing_match_data) && i >= 0) + if (preserve_match_data && i >= 0) { for (i = 0; i < search_regs.num_regs; i++) if (search_regs.start[i] >= 0) @@ -338,7 +356,7 @@ looking_at_1 (Lisp_Object string, bool posix) XSETBUFFER (last_thing_searched, current_buffer); } - return val; + return unbind_to (count, val); } DEFUN ("looking-at", Flooking_at, Slooking_at, 1, 1, 0, @@ -396,15 +414,14 @@ string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, set_char_table_extras (BVAR (current_buffer, case_canon_table), 2, BVAR (current_buffer, case_eqv_table)); - bufp = compile_pattern (regexp, - (NILP (Vinhibit_changing_match_data) - ? &search_regs : NULL), - (!NILP (BVAR (current_buffer, case_fold_search)) - ? BVAR (current_buffer, case_canon_table) : Qnil), - posix, - STRING_MULTIBYTE (string)); + bufp = &compile_pattern (regexp, + (NILP (Vinhibit_changing_match_data) + ? &search_regs : NULL), + (!NILP (BVAR (current_buffer, case_fold_search)) + ? BVAR (current_buffer, case_canon_table) : Qnil), + posix, + STRING_MULTIBYTE (string))->buf; re_match_object = string; - val = re_search (bufp, SSDATA (string), SBYTES (string), pos_byte, SBYTES (string) - pos_byte, @@ -471,10 +488,9 @@ fast_string_match_internal (Lisp_Object regexp, Lisp_Object string, ptrdiff_t val; struct re_pattern_buffer *bufp; - bufp = compile_pattern (regexp, 0, table, - 0, STRING_MULTIBYTE (string)); + bufp = &compile_pattern (regexp, 0, table, + 0, STRING_MULTIBYTE (string))->buf; re_match_object = string; - val = re_search (bufp, SSDATA (string), SBYTES (string), 0, SBYTES (string), 0); @@ -494,10 +510,10 @@ fast_c_string_match_ignore_case (Lisp_Object regexp, struct re_pattern_buffer *bufp; regexp = string_make_unibyte (regexp); + bufp = &compile_pattern (regexp, 0, + Vascii_canon_table, 0, + 0)->buf; re_match_object = Qt; - bufp = compile_pattern (regexp, 0, - Vascii_canon_table, 0, - 0); val = re_search (bufp, string, len, 0, len, 0); return val; } @@ -513,7 +529,6 @@ fast_looking_at (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte, ptrdiff_t limit, ptrdiff_t limit_byte, Lisp_Object string) { bool multibyte; - struct re_pattern_buffer *buf; unsigned char *p1, *p2; ptrdiff_t s1, s2; ptrdiff_t len; @@ -528,7 +543,6 @@ fast_looking_at (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte, s1 = 0; p2 = SDATA (string); s2 = SBYTES (string); - re_match_object = string; multibyte = STRING_MULTIBYTE (string); } else @@ -554,16 +568,19 @@ fast_looking_at (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte, s1 = ZV_BYTE - BEGV_BYTE; s2 = 0; } - re_match_object = Qnil; multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters)); } - buf = compile_pattern (regexp, 0, Qnil, 0, multibyte); + struct regexp_cache *cache_entry = + compile_pattern (regexp, 0, Qnil, 0, multibyte); + ptrdiff_t count = SPECPDL_INDEX (); freeze_buffer_relocation (); - len = re_match_2 (buf, (char *) p1, s1, (char *) p2, s2, + freeze_pattern (cache_entry); + re_match_object = STRINGP (string) ? string : Qnil; + len = re_match_2 (&cache_entry->buf, (char *) p1, s1, (char *) p2, s2, pos_byte, NULL, limit_byte); - thaw_buffer_relocation (); + unbind_to (count, Qnil); return len; } @@ -1151,355 +1168,372 @@ while (0) static struct re_registers search_regs_1; static EMACS_INT -search_buffer (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, - ptrdiff_t lim, ptrdiff_t lim_byte, EMACS_INT n, - int RE, Lisp_Object trt, Lisp_Object inverse_trt, bool posix) +search_buffer_re (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, + ptrdiff_t lim, ptrdiff_t lim_byte, EMACS_INT n, + Lisp_Object trt, Lisp_Object inverse_trt, bool posix) { - ptrdiff_t len = SCHARS (string); - ptrdiff_t len_byte = SBYTES (string); - register ptrdiff_t i; + unsigned char *p1, *p2; + ptrdiff_t s1, s2; - if (running_asynch_code) - save_search_regs (); + /* Snapshot in case Lisp changes the value. */ + bool preserve_match_data = NILP (Vinhibit_changing_match_data); - /* Searching 0 times means don't move. */ - /* Null string is found at starting position. */ - if (len == 0 || n == 0) + struct regexp_cache *cache_entry = + compile_pattern (string, + preserve_match_data ? &search_regs : &search_regs_1, + trt, posix, + !NILP (BVAR (current_buffer, enable_multibyte_characters))); + struct re_pattern_buffer *bufp = &cache_entry->buf; + + maybe_quit (); /* Do a pending quit right away, + to avoid paradoxical behavior */ + /* Get pointers and sizes of the two strings + that make up the visible portion of the buffer. */ + + p1 = BEGV_ADDR; + s1 = GPT_BYTE - BEGV_BYTE; + p2 = GAP_END_ADDR; + s2 = ZV_BYTE - GPT_BYTE; + if (s1 < 0) { - set_search_regs (pos_byte, 0); - return pos; + p2 = p1; + s2 = ZV_BYTE - BEGV_BYTE; + s1 = 0; } - - if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_spaces_regexp))) + if (s2 < 0) { - unsigned char *p1, *p2; - ptrdiff_t s1, s2; - struct re_pattern_buffer *bufp; + s1 = ZV_BYTE - BEGV_BYTE; + s2 = 0; + } - bufp = compile_pattern (string, - (NILP (Vinhibit_changing_match_data) - ? &search_regs : &search_regs_1), - trt, posix, - !NILP (BVAR (current_buffer, enable_multibyte_characters))); + ptrdiff_t count = SPECPDL_INDEX (); + freeze_buffer_relocation (); + freeze_pattern (cache_entry); - maybe_quit (); /* Do a pending quit right away, - to avoid paradoxical behavior */ - /* Get pointers and sizes of the two strings - that make up the visible portion of the buffer. */ + while (n < 0) + { + ptrdiff_t val; - p1 = BEGV_ADDR; - s1 = GPT_BYTE - BEGV_BYTE; - p2 = GAP_END_ADDR; - s2 = ZV_BYTE - GPT_BYTE; - if (s1 < 0) - { - p2 = p1; - s2 = ZV_BYTE - BEGV_BYTE; - s1 = 0; - } - if (s2 < 0) - { - s1 = ZV_BYTE - BEGV_BYTE; - s2 = 0; - } re_match_object = Qnil; + val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2, + pos_byte - BEGV_BYTE, lim_byte - pos_byte, + preserve_match_data ? &search_regs : &search_regs_1, + /* Don't allow match past current point */ + pos_byte - BEGV_BYTE); + if (val == -2) + { + matcher_overflow (); + } + if (val >= 0) + { + if (preserve_match_data) + { + pos_byte = search_regs.start[0] + BEGV_BYTE; + for (ptrdiff_t i = 0; i < search_regs.num_regs; i++) + if (search_regs.start[i] >= 0) + { + search_regs.start[i] + = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE); + search_regs.end[i] + = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE); + } + XSETBUFFER (last_thing_searched, current_buffer); + /* Set pos to the new position. */ + pos = search_regs.start[0]; + } + else + { + pos_byte = search_regs_1.start[0] + BEGV_BYTE; + /* Set pos to the new position. */ + pos = BYTE_TO_CHAR (search_regs_1.start[0] + BEGV_BYTE); + } + } + else + { + unbind_to (count, Qnil); + return (n); + } + n++; + maybe_quit (); + } + while (n > 0) + { + ptrdiff_t val; - freeze_buffer_relocation (); + re_match_object = Qnil; + val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2, + pos_byte - BEGV_BYTE, lim_byte - pos_byte, + preserve_match_data ? &search_regs : &search_regs_1, + lim_byte - BEGV_BYTE); + if (val == -2) + { + matcher_overflow (); + } + if (val >= 0) + { + if (preserve_match_data) + { + pos_byte = search_regs.end[0] + BEGV_BYTE; + for (ptrdiff_t i = 0; i < search_regs.num_regs; i++) + if (search_regs.start[i] >= 0) + { + search_regs.start[i] + = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE); + search_regs.end[i] + = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE); + } + XSETBUFFER (last_thing_searched, current_buffer); + pos = search_regs.end[0]; + } + else + { + pos_byte = search_regs_1.end[0] + BEGV_BYTE; + pos = BYTE_TO_CHAR (search_regs_1.end[0] + BEGV_BYTE); + } + } + else + { + unbind_to (count, Qnil); + return (0 - n); + } + n--; + maybe_quit (); + } + unbind_to (count, Qnil); + return (pos); +} - while (n < 0) - { - ptrdiff_t val; - - val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2, - pos_byte - BEGV_BYTE, lim_byte - pos_byte, - (NILP (Vinhibit_changing_match_data) - ? &search_regs : &search_regs_1), - /* Don't allow match past current point */ - pos_byte - BEGV_BYTE); - if (val == -2) - { - matcher_overflow (); - } - if (val >= 0) - { - if (NILP (Vinhibit_changing_match_data)) - { - pos_byte = search_regs.start[0] + BEGV_BYTE; - for (i = 0; i < search_regs.num_regs; i++) - if (search_regs.start[i] >= 0) - { - search_regs.start[i] - = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE); - search_regs.end[i] - = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE); - } - XSETBUFFER (last_thing_searched, current_buffer); - /* Set pos to the new position. */ - pos = search_regs.start[0]; - } - else - { - pos_byte = search_regs_1.start[0] + BEGV_BYTE; - /* Set pos to the new position. */ - pos = BYTE_TO_CHAR (search_regs_1.start[0] + BEGV_BYTE); - } - } - else - { - thaw_buffer_relocation (); - return (n); - } - n++; - maybe_quit (); - } - while (n > 0) - { - ptrdiff_t val; - - val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2, - pos_byte - BEGV_BYTE, lim_byte - pos_byte, - (NILP (Vinhibit_changing_match_data) - ? &search_regs : &search_regs_1), - lim_byte - BEGV_BYTE); - if (val == -2) - { - matcher_overflow (); - } - if (val >= 0) - { - if (NILP (Vinhibit_changing_match_data)) - { - pos_byte = search_regs.end[0] + BEGV_BYTE; - for (i = 0; i < search_regs.num_regs; i++) - if (search_regs.start[i] >= 0) - { - search_regs.start[i] - = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE); - search_regs.end[i] - = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE); - } - XSETBUFFER (last_thing_searched, current_buffer); - pos = search_regs.end[0]; - } - else - { - pos_byte = search_regs_1.end[0] + BEGV_BYTE; - pos = BYTE_TO_CHAR (search_regs_1.end[0] + BEGV_BYTE); - } - } - else - { - thaw_buffer_relocation (); - return (0 - n); - } - n--; - maybe_quit (); - } - thaw_buffer_relocation (); - return (pos); +static EMACS_INT +search_buffer_non_re (Lisp_Object string, ptrdiff_t pos, + ptrdiff_t pos_byte, ptrdiff_t lim, ptrdiff_t lim_byte, + EMACS_INT n, int RE, Lisp_Object trt, Lisp_Object inverse_trt, + bool posix) +{ + unsigned char *raw_pattern, *pat; + ptrdiff_t raw_pattern_size; + ptrdiff_t raw_pattern_size_byte; + unsigned char *patbuf; + bool multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters)); + unsigned char *base_pat; + /* Set to positive if we find a non-ASCII char that need + translation. Otherwise set to zero later. */ + int char_base = -1; + bool boyer_moore_ok = 1; + USE_SAFE_ALLOCA; + + /* MULTIBYTE says whether the text to be searched is multibyte. + We must convert PATTERN to match that, or we will not really + find things right. */ + + if (multibyte == STRING_MULTIBYTE (string)) + { + raw_pattern = SDATA (string); + raw_pattern_size = SCHARS (string); + raw_pattern_size_byte = SBYTES (string); } - else /* non-RE case */ + else if (multibyte) { - unsigned char *raw_pattern, *pat; - ptrdiff_t raw_pattern_size; - ptrdiff_t raw_pattern_size_byte; - unsigned char *patbuf; - bool multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters)); - unsigned char *base_pat; - /* Set to positive if we find a non-ASCII char that need - translation. Otherwise set to zero later. */ - int char_base = -1; - bool boyer_moore_ok = 1; - USE_SAFE_ALLOCA; - - /* MULTIBYTE says whether the text to be searched is multibyte. - We must convert PATTERN to match that, or we will not really - find things right. */ - - if (multibyte == STRING_MULTIBYTE (string)) - { - raw_pattern = SDATA (string); - raw_pattern_size = SCHARS (string); - raw_pattern_size_byte = SBYTES (string); - } - else if (multibyte) - { - raw_pattern_size = SCHARS (string); - raw_pattern_size_byte - = count_size_as_multibyte (SDATA (string), - raw_pattern_size); - raw_pattern = SAFE_ALLOCA (raw_pattern_size_byte + 1); - copy_text (SDATA (string), raw_pattern, - SCHARS (string), 0, 1); - } - else - { - /* Converting multibyte to single-byte. - - ??? Perhaps this conversion should be done in a special way - by subtracting nonascii-insert-offset from each non-ASCII char, - so that only the multibyte chars which really correspond to - the chosen single-byte character set can possibly match. */ - raw_pattern_size = SCHARS (string); - raw_pattern_size_byte = SCHARS (string); - raw_pattern = SAFE_ALLOCA (raw_pattern_size + 1); - copy_text (SDATA (string), raw_pattern, - SBYTES (string), 1, 0); - } + raw_pattern_size = SCHARS (string); + raw_pattern_size_byte + = count_size_as_multibyte (SDATA (string), + raw_pattern_size); + raw_pattern = SAFE_ALLOCA (raw_pattern_size_byte + 1); + copy_text (SDATA (string), raw_pattern, + SCHARS (string), 0, 1); + } + else + { + /* Converting multibyte to single-byte. + + ??? Perhaps this conversion should be done in a special way + by subtracting nonascii-insert-offset from each non-ASCII char, + so that only the multibyte chars which really correspond to + the chosen single-byte character set can possibly match. */ + raw_pattern_size = SCHARS (string); + raw_pattern_size_byte = SCHARS (string); + raw_pattern = SAFE_ALLOCA (raw_pattern_size + 1); + copy_text (SDATA (string), raw_pattern, + SBYTES (string), 1, 0); + } - /* Copy and optionally translate the pattern. */ - len = raw_pattern_size; - len_byte = raw_pattern_size_byte; - SAFE_NALLOCA (patbuf, MAX_MULTIBYTE_LENGTH, len); - pat = patbuf; - base_pat = raw_pattern; - if (multibyte) - { - /* Fill patbuf by translated characters in STRING while - checking if we can use boyer-moore search. If TRT is - non-nil, we can use boyer-moore search only if TRT can be - represented by the byte array of 256 elements. For that, - all non-ASCII case-equivalents of all case-sensitive - characters in STRING must belong to the same character - group (two characters belong to the same group iff their - multibyte forms are the same except for the last byte; - i.e. every 64 characters form a group; U+0000..U+003F, - U+0040..U+007F, U+0080..U+00BF, ...). */ - - while (--len >= 0) - { - unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str; - int c, translated, inverse; - int in_charlen, charlen; - - /* If we got here and the RE flag is set, it's because we're - dealing with a regexp known to be trivial, so the backslash - just quotes the next character. */ - if (RE && *base_pat == '\\') - { - len--; - raw_pattern_size--; - len_byte--; - base_pat++; - } + /* Copy and optionally translate the pattern. */ + ptrdiff_t len = raw_pattern_size; + ptrdiff_t len_byte = raw_pattern_size_byte; + SAFE_NALLOCA (patbuf, MAX_MULTIBYTE_LENGTH, len); + pat = patbuf; + base_pat = raw_pattern; + if (multibyte) + { + /* Fill patbuf by translated characters in STRING while + checking if we can use boyer-moore search. If TRT is + non-nil, we can use boyer-moore search only if TRT can be + represented by the byte array of 256 elements. For that, + all non-ASCII case-equivalents of all case-sensitive + characters in STRING must belong to the same character + group (two characters belong to the same group iff their + multibyte forms are the same except for the last byte; + i.e. every 64 characters form a group; U+0000..U+003F, + U+0040..U+007F, U+0080..U+00BF, ...). */ + + while (--len >= 0) + { + unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str; + int c, translated, inverse; + int in_charlen, charlen; + + /* If we got here and the RE flag is set, it's because we're + dealing with a regexp known to be trivial, so the backslash + just quotes the next character. */ + if (RE && *base_pat == '\\') + { + len--; + raw_pattern_size--; + len_byte--; + base_pat++; + } - c = STRING_CHAR_AND_LENGTH (base_pat, in_charlen); + c = STRING_CHAR_AND_LENGTH (base_pat, in_charlen); - if (NILP (trt)) - { - str = base_pat; - charlen = in_charlen; - } - else - { - /* Translate the character. */ - TRANSLATE (translated, trt, c); - charlen = CHAR_STRING (translated, str_base); - str = str_base; - - /* Check if C has any other case-equivalents. */ - TRANSLATE (inverse, inverse_trt, c); - /* If so, check if we can use boyer-moore. */ - if (c != inverse && boyer_moore_ok) - { - /* Check if all equivalents belong to the same - group of characters. Note that the check of C - itself is done by the last iteration. */ - int this_char_base = -1; + if (NILP (trt)) + { + str = base_pat; + charlen = in_charlen; + } + else + { + /* Translate the character. */ + TRANSLATE (translated, trt, c); + charlen = CHAR_STRING (translated, str_base); + str = str_base; + + /* Check if C has any other case-equivalents. */ + TRANSLATE (inverse, inverse_trt, c); + /* If so, check if we can use boyer-moore. */ + if (c != inverse && boyer_moore_ok) + { + /* Check if all equivalents belong to the same + group of characters. Note that the check of C + itself is done by the last iteration. */ + int this_char_base = -1; + + while (boyer_moore_ok) + { + if (ASCII_CHAR_P (inverse)) + { + if (this_char_base > 0) + boyer_moore_ok = 0; + else + this_char_base = 0; + } + else if (CHAR_BYTE8_P (inverse)) + /* Boyer-moore search can't handle a + translation of an eight-bit + character. */ + boyer_moore_ok = 0; + else if (this_char_base < 0) + { + this_char_base = inverse & ~0x3F; + if (char_base < 0) + char_base = this_char_base; + else if (this_char_base != char_base) + boyer_moore_ok = 0; + } + else if ((inverse & ~0x3F) != this_char_base) + boyer_moore_ok = 0; + if (c == inverse) + break; + TRANSLATE (inverse, inverse_trt, inverse); + } + } + } - while (boyer_moore_ok) - { - if (ASCII_CHAR_P (inverse)) - { - if (this_char_base > 0) - boyer_moore_ok = 0; - else - this_char_base = 0; - } - else if (CHAR_BYTE8_P (inverse)) - /* Boyer-moore search can't handle a - translation of an eight-bit - character. */ - boyer_moore_ok = 0; - else if (this_char_base < 0) - { - this_char_base = inverse & ~0x3F; - if (char_base < 0) - char_base = this_char_base; - else if (this_char_base != char_base) - boyer_moore_ok = 0; - } - else if ((inverse & ~0x3F) != this_char_base) - boyer_moore_ok = 0; - if (c == inverse) - break; - TRANSLATE (inverse, inverse_trt, inverse); - } - } - } + /* Store this character into the translated pattern. */ + memcpy (pat, str, charlen); + pat += charlen; + base_pat += in_charlen; + len_byte -= in_charlen; + } - /* Store this character into the translated pattern. */ - memcpy (pat, str, charlen); - pat += charlen; - base_pat += in_charlen; - len_byte -= in_charlen; - } + /* If char_base is still negative we didn't find any translated + non-ASCII characters. */ + if (char_base < 0) + char_base = 0; + } + else + { + /* Unibyte buffer. */ + char_base = 0; + while (--len >= 0) + { + int c, translated, inverse; - /* If char_base is still negative we didn't find any translated - non-ASCII characters. */ - if (char_base < 0) - char_base = 0; - } - else - { - /* Unibyte buffer. */ - char_base = 0; - while (--len >= 0) - { - int c, translated, inverse; + /* If we got here and the RE flag is set, it's because we're + dealing with a regexp known to be trivial, so the backslash + just quotes the next character. */ + if (RE && *base_pat == '\\') + { + len--; + raw_pattern_size--; + base_pat++; + } + c = *base_pat++; + TRANSLATE (translated, trt, c); + *pat++ = translated; + /* Check that none of C's equivalents violates the + assumptions of boyer_moore. */ + TRANSLATE (inverse, inverse_trt, c); + while (1) + { + if (inverse >= 0200) + { + boyer_moore_ok = 0; + break; + } + if (c == inverse) + break; + TRANSLATE (inverse, inverse_trt, inverse); + } + } + } - /* If we got here and the RE flag is set, it's because we're - dealing with a regexp known to be trivial, so the backslash - just quotes the next character. */ - if (RE && *base_pat == '\\') - { - len--; - raw_pattern_size--; - base_pat++; - } - c = *base_pat++; - TRANSLATE (translated, trt, c); - *pat++ = translated; - /* Check that none of C's equivalents violates the - assumptions of boyer_moore. */ - TRANSLATE (inverse, inverse_trt, c); - while (1) - { - if (inverse >= 0200) - { - boyer_moore_ok = 0; - break; - } - if (c == inverse) - break; - TRANSLATE (inverse, inverse_trt, inverse); - } - } - } + len_byte = pat - patbuf; + pat = base_pat = patbuf; + + EMACS_INT result + = (boyer_moore_ok + ? boyer_moore (n, pat, len_byte, trt, inverse_trt, + pos_byte, lim_byte, + char_base) + : simple_search (n, pat, raw_pattern_size, len_byte, trt, + pos, pos_byte, lim, lim_byte)); + SAFE_FREE (); + return result; +} + +static EMACS_INT +search_buffer (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, + ptrdiff_t lim, ptrdiff_t lim_byte, EMACS_INT n, + int RE, Lisp_Object trt, Lisp_Object inverse_trt, bool posix) +{ + if (running_asynch_code) + save_search_regs (); - len_byte = pat - patbuf; - pat = base_pat = patbuf; - - EMACS_INT result - = (boyer_moore_ok - ? boyer_moore (n, pat, len_byte, trt, inverse_trt, - pos_byte, lim_byte, - char_base) - : simple_search (n, pat, raw_pattern_size, len_byte, trt, - pos, pos_byte, lim, lim_byte)); - SAFE_FREE (); - return result; + /* Searching 0 times means don't move. */ + /* Null string is found at starting position. */ + if (n == 0 || SCHARS (string) == 0) + { + set_search_regs (pos_byte, 0); + return pos; } + + if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_spaces_regexp))) + pos = search_buffer_re (string, pos, pos_byte, lim, lim_byte, + n, trt, inverse_trt, posix); + else + pos = search_buffer_non_re (string, pos, pos_byte, lim, lim_byte, + n, RE, trt, inverse_trt, posix); + + return pos; } /* Do a simple string search N times for the string PAT, @@ -3353,6 +3387,7 @@ the buffer. If the buffer doesn't have a cache, the value is nil. */) return val; } + void syms_of_search (void) { @@ -3365,6 +3400,7 @@ syms_of_search (void) searchbufs[i].buf.fastmap = searchbufs[i].fastmap; searchbufs[i].regexp = Qnil; searchbufs[i].f_whitespace_regexp = Qnil; + searchbufs[i].busy = false; searchbufs[i].syntax_table = Qnil; staticpro (&searchbufs[i].regexp); staticpro (&searchbufs[i].f_whitespace_regexp); @@ -3405,6 +3441,9 @@ syms_of_search (void) saved_last_thing_searched = Qnil; staticpro (&saved_last_thing_searched); + re_match_object = Qnil; + staticpro (&re_match_object); + DEFVAR_LISP ("search-spaces-regexp", Vsearch_spaces_regexp, doc: /* Regexp to substitute for bunches of spaces in regexp search. Some commands use this for user-specified regexps. diff --git a/src/syntax.h b/src/syntax.h index 2171cbbba45..f02a17ce8d6 100644 --- a/src/syntax.h +++ b/src/syntax.h @@ -186,13 +186,6 @@ UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t charpos) false, gl_state.object); } -INLINE void -UPDATE_SYNTAX_TABLE_FORWARD_FAST (ptrdiff_t charpos) -{ - if (parse_sexp_lookup_properties && charpos >= gl_state.e_property) - update_syntax_table (charpos + gl_state.offset, 1, false, gl_state.object); -} - /* Make syntax table state (gl_state) good for CHARPOS, assuming it is currently good for a position after CHARPOS. */ @@ -212,13 +205,6 @@ UPDATE_SYNTAX_TABLE (ptrdiff_t charpos) UPDATE_SYNTAX_TABLE_FORWARD (charpos); } -INLINE void -UPDATE_SYNTAX_TABLE_FAST (ptrdiff_t charpos) -{ - UPDATE_SYNTAX_TABLE_BACKWARD (charpos); - UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos); -} - /* Set up the buffer-global syntax table. */ INLINE void diff --git a/src/thread.h b/src/thread.h index 2c8914e1b28..c10e5ecb758 100644 --- a/src/thread.h +++ b/src/thread.h @@ -137,15 +137,6 @@ struct thread_state struct re_registers m_saved_search_regs; #define saved_search_regs (current_thread->m_saved_search_regs) - /* This is the string or buffer in which we - are matching. It is used for looking up syntax properties. - - If the value is a Lisp string object, we are matching text in that - string; if it's nil, we are matching text in the current buffer; if - it's t, we are matching text in a C string. */ - Lisp_Object m_re_match_object; -#define re_match_object (current_thread->m_re_match_object) - /* This member is different from waiting_for_input. It is used to communicate to a lisp process-filter/sentinel (via the function Fwaiting_for_user_input_p) whether Emacs was waiting