From 14473664b8532c58469a74b2f6503d543e195b88 Mon Sep 17 00:00:00 2001 From: Stefan Monnier Date: Mon, 4 Sep 2000 04:24:00 +0000 Subject: [PATCH] (WIDE_CHAR_SUPPORT): New macro. (btowc, iswctype, wctype) [_LIBC]: Redefine to __. (BIT_ALPHA, BIT_ALNUM, BIT_ASCII, BIT_NONASCII, BIT_GRAPH, BIT_PRINT) (BIT_UNIBYTE): Remove. (re_match_2_internal): Delete corresponding code and streamline the BIT_MULTIBYTE case to not bother checking ISUNIBYTE. (CHAR_CLASS_MAX_LENGTH) [!WIDE_CHAR_SUPPORT]: Set to 9 rather than 6. (re_wctype_t): New type. (re_wctype, re_iswctype, re_wctype_to_bit): New functions. (regex_compile): Use them and fix handling of overly long char classes. --- src/ChangeLog | 13 +++ src/regex.c | 268 +++++++++++++++++++++++++++++--------------------- 2 files changed, 169 insertions(+), 112 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 2af573f6c50..8e2e938782e 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,16 @@ +2000-09-04 Stefan Monnier + + * regex.c (WIDE_CHAR_SUPPORT): New macro. + (btowc, iswctype, wctype) [_LIBC]: Redefine to __. + (BIT_ALPHA, BIT_ALNUM, BIT_ASCII, BIT_NONASCII, BIT_GRAPH, BIT_PRINT) + (BIT_UNIBYTE): Remove. + (re_match_2_internal): Delete corresponding code and streamline the + BIT_MULTIBYTE case to not bother checking ISUNIBYTE. + (CHAR_CLASS_MAX_LENGTH) [!WIDE_CHAR_SUPPORT]: Set to 9 rather than 6. + (re_wctype_t): New type. + (re_wctype, re_iswctype, re_wctype_to_bit): New functions. + (regex_compile): Use them and fix handling of overly long char classes. + 2000-09-03 Andrew Innes * makefile.w32-in: Change to DOS line endings. diff --git a/src/regex.c b/src/regex.c index f779d9d82e1..82fd4e2062d 100644 --- a/src/regex.c +++ b/src/regex.c @@ -46,6 +46,19 @@ # include #endif +/* Whether to use ISO C Amendment 1 wide char functions. + Those should not be used for Emacs since it uses its own. 
*/ +#define WIDE_CHAR_SUPPORT \ + (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs) + +/* For platforms which support the ISO C amendment 1 functionality we + support user defined character classes. */ +#if defined _LIBC || WIDE_CHAR_SUPPORT +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ +# include <wchar.h> +# include <wctype.h> +#endif + #ifdef _LIBC /* We have to keep the namespace clean. */ # define regfree(preg) __regfree (preg) @@ -68,6 +81,11 @@ __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) +/* Make sure we call libc's functions even if the user overrides them. */ +# define btowc __btowc +# define iswctype __iswctype +# define wctype __wctype + # define WEAK_ALIAS(a,b) weak_alias (a, b) /* We are also using some library internals. */ @@ -253,7 +271,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 }; ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \ : 1) -# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ +# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ : 1) @@ -1858,21 +1876,14 @@ struct range_table_work_area #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \ (work_area).bits |= (bit) -/* These bits represent the various character classes such as [:alnum:] - in a charset's range table. */ -#define BIT_ALNUM 0x1 -#define BIT_ALPHA 0x2 -#define BIT_WORD 0x4 -#define BIT_ASCII 0x8 -#define BIT_NONASCII 0x10 -#define BIT_GRAPH 0x20 -#define BIT_LOWER 0x40 -#define BIT_PRINT 0x80 -#define BIT_PUNCT 0x100 -#define BIT_SPACE 0x200 -#define BIT_UPPER 0x400 -#define BIT_UNIBYTE 0x800 -#define BIT_MULTIBYTE 0x1000 +/* Bits used to implement the multibyte-part of the various character classes + such as [:alnum:] in a charset's range table. */ +#define BIT_WORD 0x1 +#define BIT_LOWER 0x2 +#define BIT_PUNCT 0x4 +#define BIT_SPACE 0x8 +#define BIT_UPPER 0x10 +#define BIT_MULTIBYTE 0x20 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. 
*/ #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ @@ -1918,18 +1929,110 @@ struct range_table_work_area } \ } while (0) -#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ +#if defined _LIBC || WIDE_CHAR_SUPPORT +/* The GNU C library provides support for user-defined character classes + and the functions from ISO C amendment 1. */ +# ifdef CHARCLASS_NAME_MAX +# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX +# else +/* This shouldn't happen but some implementations might still have this + problem. Use a reasonable default value. */ +# define CHAR_CLASS_MAX_LENGTH 256 +# endif +typedef wctype_t re_wctype_t; +# define re_wctype wctype +# define re_iswctype iswctype +# define re_wctype_to_bit(cc) 0 +#else +# define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */ +# define btowc(c) (c) + +/* Character classes' indices. */ +typedef enum { RECC_ERROR = 0, + RECC_ALNUM, RECC_ALPHA, RECC_WORD, + RECC_GRAPH, RECC_PRINT, + RECC_LOWER, RECC_UPPER, + RECC_PUNCT, RECC_CNTRL, + RECC_DIGIT, RECC_XDIGIT, + RECC_BLANK, RECC_SPACE, + RECC_MULTIBYTE, RECC_NONASCII, + RECC_ASCII, RECC_UNIBYTE +} re_wctype_t; + +/* Map a string to the char class it names (if any). 
Return RECC_ERROR when STRING names no known class. */ +static re_wctype_t +re_wctype (string) + const unsigned char *string; +{ + if (STREQ (string, "alnum")) return RECC_ALNUM; + else if (STREQ (string, "alpha")) return RECC_ALPHA; + else if (STREQ (string, "word")) return RECC_WORD; + else if (STREQ (string, "ascii")) return RECC_ASCII; + else if (STREQ (string, "nonascii")) return RECC_NONASCII; + else if (STREQ (string, "graph")) return RECC_GRAPH; + else if (STREQ (string, "lower")) return RECC_LOWER; + else if (STREQ (string, "print")) return RECC_PRINT; + else if (STREQ (string, "punct")) return RECC_PUNCT; + else if (STREQ (string, "space")) return RECC_SPACE; + else if (STREQ (string, "upper")) return RECC_UPPER; + else if (STREQ (string, "unibyte")) return RECC_UNIBYTE; + else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE; + else if (STREQ (string, "digit")) return RECC_DIGIT; + else if (STREQ (string, "xdigit")) return RECC_XDIGIT; + else if (STREQ (string, "cntrl")) return RECC_CNTRL; + else if (STREQ (string, "blank")) return RECC_BLANK; + else return RECC_ERROR; +} + +/* True iff CH is in the char class CC. 
RECC_ERROR and any value outside the enumeration yield false. */ +static boolean +re_iswctype (ch, cc) + int ch; + re_wctype_t cc; +{ + switch (cc) + { + case RECC_ALNUM: return ISALNUM (ch); + case RECC_ALPHA: return ISALPHA (ch); + case RECC_BLANK: return ISBLANK (ch); + case RECC_CNTRL: return ISCNTRL (ch); + case RECC_DIGIT: return ISDIGIT (ch); + case RECC_GRAPH: return ISGRAPH (ch); + case RECC_LOWER: return ISLOWER (ch); + case RECC_PRINT: return ISPRINT (ch); + case RECC_PUNCT: return ISPUNCT (ch); + case RECC_SPACE: return ISSPACE (ch); + case RECC_UPPER: return ISUPPER (ch); + case RECC_XDIGIT: return ISXDIGIT (ch); + case RECC_ASCII: return IS_REAL_ASCII (ch); + case RECC_NONASCII: return !IS_REAL_ASCII (ch); + case RECC_UNIBYTE: return ISUNIBYTE (ch); + case RECC_MULTIBYTE: return !ISUNIBYTE (ch); + case RECC_WORD: return ISWORD (ch); + case RECC_ERROR: default: return false; + } +} + +/* Return a bit-pattern to use in the range-table bits to match multibyte + chars of class CC. 
Classes that map to no bit, and any value outside the enumeration, yield 0. */ +static int +re_wctype_to_bit (cc) + re_wctype_t cc; +{ + switch (cc) + { + case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: + case RECC_MULTIBYTE: return BIT_MULTIBYTE; + case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD; + case RECC_LOWER: return BIT_LOWER; + case RECC_UPPER: return BIT_UPPER; + case RECC_PUNCT: return BIT_PUNCT; + case RECC_SPACE: return BIT_SPACE; + case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: + case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: default: return 0; + } +} +#endif /* QUIT is only used on NTemacs. */ #if !defined WINDOWSNT || !defined emacs || !defined QUIT @@ -2405,7 +2508,7 @@ regex_compile (pattern, size, syntax, bufp) syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') { /* Leave room for the null. */ - char str[CHAR_CLASS_MAX_LENGTH + 1]; + unsigned char str[CHAR_CLASS_MAX_LENGTH + 1]; const unsigned char *class_beg; PATFETCH (c); @@ -2417,11 +2520,14 @@ regex_compile (pattern, size, syntax, bufp) for (;;) { - PATFETCH (c); - if (c == ':' || c == ']' || p == pend - || c1 == CHAR_CLASS_MAX_LENGTH) - break; - str[c1++] = c; + PATFETCH (c); + if ((c == ':' && *p == ']') || p == pend) + break; + if (c1 < CHAR_CLASS_MAX_LENGTH) + str[c1++] = c; + else + /* This is in any case an invalid class name. 
*/ + str[0] = '\0'; } str[c1] = '\0'; @@ -2432,89 +2538,34 @@ regex_compile (pattern, size, syntax, bufp) if (c == ':' && *p == ']') { int ch; - boolean is_alnum = STREQ (str, "alnum"); - boolean is_alpha = STREQ (str, "alpha"); - boolean is_ascii = STREQ (str, "ascii"); - boolean is_blank = STREQ (str, "blank"); - boolean is_cntrl = STREQ (str, "cntrl"); - boolean is_digit = STREQ (str, "digit"); - boolean is_graph = STREQ (str, "graph"); - boolean is_lower = STREQ (str, "lower"); - boolean is_multibyte = STREQ (str, "multibyte"); - boolean is_nonascii = STREQ (str, "nonascii"); - boolean is_print = STREQ (str, "print"); - boolean is_punct = STREQ (str, "punct"); - boolean is_space = STREQ (str, "space"); - boolean is_unibyte = STREQ (str, "unibyte"); - boolean is_upper = STREQ (str, "upper"); - boolean is_word = STREQ (str, "word"); - boolean is_xdigit = STREQ (str, "xdigit"); - - if (!IS_CHAR_CLASS (str)) + re_wctype_t cc; + + cc = re_wctype (str); + + if (cc == 0) FREE_STACK_RETURN (REG_ECTYPE); - /* Throw away the ] at the end of the character - class. */ - PATFETCH (c); + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); - if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); /* Most character classes in a multibyte match just set a flag. Exceptions are is_blank, is_digit, is_cntrl, and is_xdigit, since they can only match ASCII characters. We - don't need to handle them for multibyte. */ + don't need to handle them for multibyte. + For them re_wctype_to_bit yields 0, so no flag is set. 
*/ if (multibyte) - { - int bit = 0; - - if (is_alnum) bit = BIT_ALNUM; - if (is_alpha) bit = BIT_ALPHA; - if (is_ascii) bit = BIT_ASCII; - if (is_graph) bit = BIT_GRAPH; - if (is_lower) bit = BIT_LOWER; - if (is_multibyte) bit = BIT_MULTIBYTE; - if (is_nonascii) bit = BIT_NONASCII; - if (is_print) bit = BIT_PRINT; - if (is_punct) bit = BIT_PUNCT; - if (is_space) bit = BIT_SPACE; - if (is_unibyte) bit = BIT_UNIBYTE; - if (is_upper) bit = BIT_UPPER; - if (is_word) bit = BIT_WORD; - if (bit) - SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, - bit); - } + SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, + re_wctype_to_bit (cc)); - /* Handle character classes for ASCII characters. */ - for (ch = 0; ch < 1 << BYTEWIDTH; ch++) + for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) { int translated = TRANSLATE (ch); - /* This was split into 3 if's to - avoid an arbitrary limit in some compiler. */ - if ( (is_alnum && ISALNUM (ch)) - || (is_alpha && ISALPHA (ch)) - || (is_blank && ISBLANK (ch)) - || (is_cntrl && ISCNTRL (ch))) - SET_LIST_BIT (translated); - if ( (is_digit && ISDIGIT (ch)) - || (is_graph && ISGRAPH (ch)) - || (is_lower && ISLOWER (ch)) - || (is_print && ISPRINT (ch))) - SET_LIST_BIT (translated); - if ( (is_punct && ISPUNCT (ch)) - || (is_space && ISSPACE (ch)) - || (is_upper && ISUPPER (ch)) - || (is_xdigit && ISXDIGIT (ch))) - SET_LIST_BIT (translated); - if ( (is_ascii && IS_REAL_ASCII (ch)) - || (is_nonascii && !IS_REAL_ASCII (ch)) - || (is_unibyte && ISUNIBYTE (ch)) - || (is_multibyte && !ISUNIBYTE (ch))) - SET_LIST_BIT (translated); - - if ( (is_word && ISWORD (ch))) + if (re_iswctype (btowc (ch), cc)) SET_LIST_BIT (translated); } @@ -4972,17 +5023,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]); - if ( (class_bits & BIT_ALNUM && ISALNUM (c)) - | (class_bits & BIT_ALPHA && ISALPHA (c)) - | (class_bits & BIT_ASCII && IS_REAL_ASCII (c)) - | (class_bits & BIT_GRAPH && ISGRAPH (c)) 
- | (class_bits & BIT_LOWER && ISLOWER (c)) - | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c)) - | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c)) - | (class_bits & BIT_PRINT && ISPRINT (c)) + if ( (class_bits & BIT_LOWER && ISLOWER (c)) + | (class_bits & BIT_MULTIBYTE) | (class_bits & BIT_PUNCT && ISPUNCT (c)) | (class_bits & BIT_SPACE && ISSPACE (c)) - | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c)) | (class_bits & BIT_UPPER && ISUPPER (c)) | (class_bits & BIT_WORD && ISWORD (c))) not = !not; -- 2.39.2