From 365958144ea38255d543a4232b926ca81e849fa9 Mon Sep 17 00:00:00 2001 From: Stefan Monnier Date: Fri, 23 Aug 2002 22:21:51 +0000 Subject: [PATCH] (PATFETCH): Remove the translating fetch. (PATFETCH_RAW): Rename to PATFETCH. (set_image_of_range): New fun. (SET_RANGE_TABLE_WORK_AREA): Use it. (regex_compile): Don't translate the pattern chars so eagerly. Only do it when inserting an `exactn' bytecode or when handling a char-range. (mutually_exclusive_p): Avoid empty statement. --- src/ChangeLog | 22 +++++++++++---- src/regex.c | 76 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 66 insertions(+), 32 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 6dcc95b7f8d..c6180468193 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,14 @@ +2002-08-23 Stefan Monnier + + * regex.c (PATFETCH): Remove the translating fetch. + (PATFETCH_RAW): Rename to PATFETCH. + (set_image_of_range): New fun. + (SET_RANGE_TABLE_WORK_AREA): Use it. + (regex_compile): Don't translate the pattern chars so eagerly. + Only do it when inserting an `exactn' bytecode or when handling + a char-range. + (mutually_exclusive_p): Avoid empty statement. + 2002-08-22 Kim F. Storm * xdisp.c (redisplay_window): Do not `goto try_to_scroll' when we @@ -511,11 +522,10 @@ (parse_solitary_modifier, Fexecute_extended_command): Likewise. * textprop.c (validate_interval_range, interval_of): Likewise. - * fontset.c (Fset_fontset_font): Use SDATA instead of - XSTRING()->data. + * fontset.c (Fset_fontset_font): Use SDATA instead of XSTRING()->data. - * charset.h (FETCH_STRING_CHAR_ADVANCE, - FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SBYTES instead of + * charset.h (FETCH_STRING_CHAR_ADVANCE) + (FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SBYTES instead of XSTRING()->size_byte. * lisp.h (SDATA, SREF): Produce rvalue. @@ -524,8 +534,8 @@ * buffer.c (Fother_buffer): Use SREF when retrieving a byte from a string. * casefiddle.c (casify_object): Use SSET. - * charset.h (FETCH_STRING_CHAR_ADVANCE, - FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SDATA when getting + * charset.h (FETCH_STRING_CHAR_ADVANCE) + (FETCH_STRING_CHAR_ADVANCE_NO_CHECK): Use SDATA when getting address of string contents. * data.c (Faref): Use SDATA. (Faset): Use SDATA, SSET. diff --git a/src/regex.c b/src/regex.c index 591d6f14e12..e01259cc85a 100644 --- a/src/regex.c +++ b/src/regex.c @@ -19,7 +19,9 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -/* TODO: +/* BUGS: + - (x?)*y\1z should match both xxxxyxz and xxxyz. + TODO: - structure the opcode space into opcode+flag. - merge with glibc's regex.[ch]. - replace (succeed_n + jump_n + set_number_at) with something that doesn't @@ -1682,17 +1684,9 @@ static re_char *skip_one_char _RE_ARGS ((re_char *p)); static int analyse_first _RE_ARGS ((re_char *p, re_char *pend, char *fastmap, const int multibyte)); -/* Fetch the next character in the uncompiled pattern---translating it - if necessary. */ -#define PATFETCH(c) \ - do { \ - PATFETCH_RAW (c); \ - c = TRANSLATE (c); \ - } while (0) - /* Fetch the next character in the uncompiled pattern, with no translation. */ -#define PATFETCH_RAW(c) \ +#define PATFETCH(c) \ do { \ int len; \ if (p == pend) return REG_EEND; \ @@ -1914,12 +1908,13 @@ struct range_table_work_area #define BIT_UPPER 0x10 #define BIT_MULTIBYTE 0x20 -/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ -#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ - do { \ - EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ - (work_area).table[(work_area).used++] = (range_start); \ - (work_area).table[(work_area).used++] = (range_end); \ +/* Set a range START..END to WORK_AREA. + The range is passed through TRANSLATE, so START and END + should be untranslated. */ +#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \ + do { \ + EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ + set_image_of_range (&work_area, start, end, translate); \ } while (0) /* Free allocated memory for WORK_AREA. */ @@ -2077,6 +2072,31 @@ re_wctype_to_bit (cc) } #endif + + +/* We need to find the image of the range start..end when passed through + TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end) + and is not even necessarily contiguous. + We approximate it with the smallest contiguous range that contains + all the chars we need. */ +static void +set_image_of_range (work_area, start, end, translate) + RE_TRANSLATE_TYPE translate; + struct range_table_work_area *work_area; + re_wchar_t start, end; +{ + re_wchar_t cmin = TRANSLATE (start), cmax = TRANSLATE (end); + if (RE_TRANSLATE_P (translate)) + for (; start <= end; start++) + { + re_wchar_t c = TRANSLATE (start); + cmin = MIN (cmin, c); + cmax = MAX (cmax, c); + } + work_area->table[work_area->used++] = (cmin); + work_area->table[work_area->used++] = (cmax); +} + /* Explicit quit checking is only used on NTemacs. */ #if defined WINDOWSNT && defined emacs && defined QUIT extern int immediate_quit; @@ -2525,6 +2545,10 @@ regex_compile (pattern, size, syntax, bufp) if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + /* Don't translate yet. The range TRANSLATE(X..Y) cannot + always be determined from TRANSLATE(X) and TRANSLATE(Y) + So the translation is done later in a loop. Example: + (let ((case-fold-search t)) (string-match "[A-_]" "A")) */ PATFETCH (c); /* \ might escape characters inside [...] and [^...]. */ @@ -2584,7 +2608,7 @@ regex_compile (pattern, size, syntax, bufp) them). */ if (c == ':' && *p == ']') { - int ch; + re_wchar_t ch; re_wctype_t cc; cc = re_wctype (str); @@ -2653,8 +2677,8 @@ regex_compile (pattern, size, syntax, bufp) starting at the smallest character in the charset of C1 and ending at C1. */ int charset = CHAR_CHARSET (c1); - int c2 = MAKE_CHAR (charset, 0, 0); - + re_wchar_t c2 = MAKE_CHAR (charset, 0, 0); + SET_RANGE_TABLE_WORK_AREA (range_table_work, c2, c1); c1 = 0377; @@ -2672,7 +2696,7 @@ regex_compile (pattern, size, syntax, bufp) /* ... into bitmap. */ { re_wchar_t this_char; - int range_start = c, range_end = c1; + re_wchar_t range_start = c, range_end = c1; /* If the start is after the end, the range is empty. */ if (range_start > range_end) @@ -2769,7 +2793,7 @@ regex_compile (pattern, size, syntax, bufp) /* Do not translate the character after the \, so that we can distinguish, e.g., \B from \b, even if we normally would translate, e.g., B to b. */ - PATFETCH_RAW (c); + PATFETCH (c); switch (c) { @@ -3129,13 +3153,13 @@ regex_compile (pattern, size, syntax, bufp) case 'c': laststart = b; - PATFETCH_RAW (c); + PATFETCH (c); BUF_PUSH_2 (categoryspec, c); break; case 'C': laststart = b; - PATFETCH_RAW (c); + PATFETCH (c); BUF_PUSH_2 (notcategoryspec, c); break; #endif /* emacs */ @@ -3225,7 +3249,6 @@ regex_compile (pattern, size, syntax, bufp) /* You might think it would be useful for \ to mean not to translate; but if we don't translate it it will never match anything. */ - c = TRANSLATE (c); goto normal_char; } break; @@ -3234,7 +3257,7 @@ regex_compile (pattern, size, syntax, bufp) default: /* Expects the character in `c'. */ normal_char: - /* If no exactn currently being built. */ + /* If no exactn currently being built. */ if (!pending_exact /* If last exactn not at current position. */ @@ -3265,6 +3288,7 @@ regex_compile (pattern, size, syntax, bufp) { int len; + c = TRANSLATE (c); if (multibyte) len = CHAR_STRING (c, b); else @@ -4427,7 +4451,7 @@ mutually_exclusive_p (bufp, p1, p2) they don't overlap. The union of the two sets of excluded chars should cover all possible chars, which, as a matter of fact, is virtually impossible in multibyte buffers. */ - ; + break; } break; -- 2.39.5