From 14473664b8532c58469a74b2f6503d543e195b88 Mon Sep 17 00:00:00 2001
From: Stefan Monnier <monnier@iro.umontreal.ca>
Date: Mon, 4 Sep 2000 04:24:00 +0000
Subject: [PATCH] (WIDE_CHAR_SUPPORT): New macro. (btowc, iswctype, wctype)
 [_LIBC]: Redefine to __<fun>. (BIT_ALPHA, BIT_ALNUM, BIT_ASCII, BIT_NONASCII,
 BIT_GRAPH, BIT_PRINT) (BIT_UNIBYTE): Remove. (re_match_2_internal): Delete
 corresponding code and streamline the BIT_MULTIBYTE case to not bother
 checking ISUNIBYTE. (CHAR_CLASS_MAX_LENGTH) [!WIDE_CHAR_SUPPORT]: Set to 9
 rather than 6. (re_wctype_t): New type. (re_wctype, re_iswctype,
 re_wctype_to_bit): New functions. (regex_compile): Use them and fix handling
 of overly long char classes.

---
 src/ChangeLog |  13 +++
 src/regex.c   | 268 +++++++++++++++++++++++++++++---------------------
 2 files changed, 169 insertions(+), 112 deletions(-)
diff --git a/src/ChangeLog b/src/ChangeLog
index 2af573f6c50..8e2e938782e 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,16 @@
+2000-09-04  Stefan Monnier  <monnier@cs.yale.edu>
+
+	* regex.c (WIDE_CHAR_SUPPORT): New macro.
+	(btowc, iswctype, wctype) [_LIBC]: Redefine to __<fun>.
+	(BIT_ALPHA, BIT_ALNUM, BIT_ASCII, BIT_NONASCII, BIT_GRAPH, BIT_PRINT)
+	(BIT_UNIBYTE): Remove.
+	(re_match_2_internal): Delete corresponding code and streamline the
+	BIT_MULTIBYTE case to not bother checking ISUNIBYTE.
+	(CHAR_CLASS_MAX_LENGTH) [!WIDE_CHAR_SUPPORT]: Set to 9 rather than 6.
+	(re_wctype_t): New type.
+	(re_wctype, re_iswctype, re_wctype_to_bit): New functions.
+	(regex_compile): Use them and fix handling of overly long char classes.
+
 2000-09-03  Andrew Innes  <andrewi@gnu.org>
 
 	* makefile.w32-in: Change to DOS line endings.
diff --git a/src/regex.c b/src/regex.c
index f779d9d82e1..82fd4e2062d 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -46,6 +46,19 @@
 # include <sys/types.h>
 #endif
 
+/* Whether to use ISO C Amendment 1 wide char functions.
+   Those should not be used for Emacs since it uses its own.  */
+#define WIDE_CHAR_SUPPORT \
+  (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
+
+/* For platforms which support the ISO C amendment 1 functionality we
+   support user-defined character classes.  */
+#if defined _LIBC || WIDE_CHAR_SUPPORT
+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
 #ifdef _LIBC
 /* We have to keep the namespace clean.  */
 # define regfree(preg) __regfree (preg)
@@ -68,6 +81,11 @@
 	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
 
+/* Make sure we call libc's function even if the user overrides them.  */
+# define btowc __btowc
+# define iswctype __iswctype
+# define wctype __wctype
+
 # define WEAK_ALIAS(a,b) weak_alias (a, b)
 
 /* We are also using some library internals.  */
@@ -253,7 +271,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 };
 		    ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237)	\
 		    : 1)
 
-# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)		\
+# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
 		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
 		    : 1)
 
@@ -1858,21 +1876,14 @@ struct range_table_work_area
 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)		\
   (work_area).bits |= (bit)
 
-/* These bits represent the various character classes such as [:alnum:]
-   in a charset's range table.  */
-#define BIT_ALNUM 0x1
-#define BIT_ALPHA 0x2
-#define BIT_WORD  0x4
-#define BIT_ASCII 0x8
-#define BIT_NONASCII 0x10
-#define BIT_GRAPH 0x20
-#define BIT_LOWER 0x40
-#define BIT_PRINT 0x80
-#define BIT_PUNCT 0x100
-#define BIT_SPACE 0x200
-#define BIT_UPPER 0x400
-#define BIT_UNIBYTE 0x800
-#define BIT_MULTIBYTE 0x1000
+/* Bits used to implement the multibyte-part of the various character classes
+   such as [:alnum:] in a charset's range table.  */
+#define BIT_WORD	0x1
+#define BIT_LOWER	0x2
+#define BIT_PUNCT	0x4
+#define BIT_SPACE	0x8
+#define BIT_UPPER	0x10
+#define BIT_MULTIBYTE	0x20
 
 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */
 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\
@@ -1918,18 +1929,110 @@ struct range_table_work_area
        }								\
     } while (0)
 
-#define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */
+#if defined _LIBC || WIDE_CHAR_SUPPORT
+/* The GNU C library provides support for user-defined character classes
+   and the functions from ISO C amendment 1.  */
+# ifdef CHARCLASS_NAME_MAX
+#  define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
+# else
+/* This shouldn't happen but some implementations might still have this
+   problem.  Use a reasonable default value.  */
+#  define CHAR_CLASS_MAX_LENGTH 256
+# endif
+typedef wctype_t re_wctype_t;
+# define re_wctype wctype
+# define re_iswctype iswctype
+# define re_wctype_to_bit(cc) 0
+#else
+# define CHAR_CLASS_MAX_LENGTH  9 /* Namely, `multibyte'.  */
+# define btowc(c) c
+
+/* Character classes' indices.  RECC_ERROR (0) stands for "no such class".  */
+typedef enum { RECC_ERROR = 0,
+	       RECC_ALNUM, RECC_ALPHA, RECC_WORD,
+	       RECC_GRAPH, RECC_PRINT,
+	       RECC_LOWER, RECC_UPPER,
+	       RECC_PUNCT, RECC_CNTRL,
+	       RECC_DIGIT, RECC_XDIGIT,
+	       RECC_BLANK, RECC_SPACE,
+	       RECC_MULTIBYTE, RECC_NONASCII,
+	       RECC_ASCII, RECC_UNIBYTE
+} re_wctype_t;
+
+/* Map STRING to the char class it names, or RECC_ERROR if it names none.  */
+static re_wctype_t
+re_wctype (string)
+     const unsigned char *string;
+{
+  if      (STREQ (string, "alnum"))	return RECC_ALNUM;
+  else if (STREQ (string, "alpha"))	return RECC_ALPHA;
+  else if (STREQ (string, "word"))	return RECC_WORD;
+  else if (STREQ (string, "ascii"))	return RECC_ASCII;
+  else if (STREQ (string, "nonascii"))	return RECC_NONASCII;
+  else if (STREQ (string, "graph"))	return RECC_GRAPH;
+  else if (STREQ (string, "lower"))	return RECC_LOWER;
+  else if (STREQ (string, "print"))	return RECC_PRINT;
+  else if (STREQ (string, "punct"))	return RECC_PUNCT;
+  else if (STREQ (string, "space"))	return RECC_SPACE;
+  else if (STREQ (string, "upper"))	return RECC_UPPER;
+  else if (STREQ (string, "unibyte"))	return RECC_UNIBYTE;
+  else if (STREQ (string, "multibyte"))	return RECC_MULTIBYTE;
+  else if (STREQ (string, "digit"))	return RECC_DIGIT;
+  else if (STREQ (string, "xdigit"))	return RECC_XDIGIT;
+  else if (STREQ (string, "cntrl"))	return RECC_CNTRL;
+  else if (STREQ (string, "blank"))	return RECC_BLANK;
+  else return RECC_ERROR;	/* Same value (0) the caller tests for.  */
+}
+
+/* True iff CH is in the char class CC; false for RECC_ERROR or a bad CC.  */
+static boolean
+re_iswctype (ch, cc)
+     int ch;
+     re_wctype_t cc;
+{
+  switch (cc)
+    {
+    case RECC_ALNUM: return ISALNUM (ch);
+    case RECC_ALPHA: return ISALPHA (ch);
+    case RECC_BLANK: return ISBLANK (ch);
+    case RECC_CNTRL: return ISCNTRL (ch);
+    case RECC_DIGIT: return ISDIGIT (ch);
+    case RECC_GRAPH: return ISGRAPH (ch);
+    case RECC_LOWER: return ISLOWER (ch);
+    case RECC_PRINT: return ISPRINT (ch);
+    case RECC_PUNCT: return ISPUNCT (ch);
+    case RECC_SPACE: return ISSPACE (ch);
+    case RECC_UPPER: return ISUPPER (ch);
+    case RECC_XDIGIT: return ISXDIGIT (ch);
+    case RECC_ASCII: return IS_REAL_ASCII (ch);
+    case RECC_NONASCII: return !IS_REAL_ASCII (ch);
+    case RECC_UNIBYTE: return ISUNIBYTE (ch);
+    case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
+    case RECC_WORD: return ISWORD (ch);
+    case RECC_ERROR: default: return false; /* Never fall off the end.  */
+    }
+}
 
-#define IS_CHAR_CLASS(string)						\
-   (STREQ (string, "alpha") || STREQ (string, "upper")			\
-    || STREQ (string, "lower") || STREQ (string, "digit")		\
-    || STREQ (string, "alnum") || STREQ (string, "xdigit")		\
-    || STREQ (string, "space") || STREQ (string, "print")		\
-    || STREQ (string, "punct") || STREQ (string, "graph")		\
-    || STREQ (string, "cntrl") || STREQ (string, "blank")		\
-    || STREQ (string, "word")						\
-    || STREQ (string, "ascii") || STREQ (string, "nonascii")		\
-    || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
+/* Return a bit-pattern to use in the range-table bits to match multibyte
+   chars of class CC, or 0 if CC needs no such bit.  */
+static int
+re_wctype_to_bit (cc)
+     re_wctype_t cc;
+{
+  switch (cc)
+    {
+    case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
+    case RECC_MULTIBYTE: return BIT_MULTIBYTE;
+    case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
+    case RECC_LOWER: return BIT_LOWER;
+    case RECC_UPPER: return BIT_UPPER;
+    case RECC_PUNCT: return BIT_PUNCT;
+    case RECC_SPACE: return BIT_SPACE;
+    case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
+    case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: default: return 0;
+    }
+}
+#endif
 
 /* QUIT is only used on NTemacs.  */
 #if !defined WINDOWSNT || !defined emacs || !defined QUIT
@@ -2405,7 +2508,7 @@ regex_compile (pattern, size, syntax, bufp)
 		    syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
 		  {
 		    /* Leave room for the null.	 */
-		    char str[CHAR_CLASS_MAX_LENGTH + 1];
+		    unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
 		    const unsigned char *class_beg;
 
 		    PATFETCH (c);
@@ -2417,11 +2520,14 @@ regex_compile (pattern, size, syntax, bufp)
 
 		    for (;;)
 		      {
-			PATFETCH (c);
-			if (c == ':' || c == ']' || p == pend
-			    || c1 == CHAR_CLASS_MAX_LENGTH)
-			  break;
-			str[c1++] = c;
+		        PATFETCH (c);
+		        if ((c == ':' && *p == ']') || p == pend)
+		          break;
+			if (c1 < CHAR_CLASS_MAX_LENGTH)
+			  str[c1++] = c;
+			else
+			  /* This is in any case an invalid class name.  */
+			  str[0] = '\0';
 		      }
 		    str[c1] = '\0';
 
@@ -2432,89 +2538,34 @@ regex_compile (pattern, size, syntax, bufp)
 		    if (c == ':' && *p == ']')
 		      {
 			int ch;
-			boolean is_alnum = STREQ (str, "alnum");
-			boolean is_alpha = STREQ (str, "alpha");
-			boolean is_ascii = STREQ (str, "ascii");
-			boolean is_blank = STREQ (str, "blank");
-			boolean is_cntrl = STREQ (str, "cntrl");
-			boolean is_digit = STREQ (str, "digit");
-			boolean is_graph = STREQ (str, "graph");
-			boolean is_lower = STREQ (str, "lower");
-			boolean is_multibyte = STREQ (str, "multibyte");
-			boolean is_nonascii = STREQ (str, "nonascii");
-			boolean is_print = STREQ (str, "print");
-			boolean is_punct = STREQ (str, "punct");
-			boolean is_space = STREQ (str, "space");
-			boolean is_unibyte = STREQ (str, "unibyte");
-			boolean is_upper = STREQ (str, "upper");
-			boolean is_word = STREQ (str, "word");
-			boolean is_xdigit = STREQ (str, "xdigit");
-
-			if (!IS_CHAR_CLASS (str))
+			re_wctype_t cc;
+
+			cc = re_wctype (str);
+
+			if (cc == 0)
 			  FREE_STACK_RETURN (REG_ECTYPE);
 
-			/* Throw away the ] at the end of the character
-			   class.  */
-			PATFETCH (c);
+                        /* Throw away the ] at the end of the character
+                           class.  */
+                        PATFETCH (c);
 
-			if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+                        if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
 			/* Most character classes in a multibyte match
 			   just set a flag.  Exceptions are is_blank,
 			   is_digit, is_cntrl, and is_xdigit, since
 			   they can only match ASCII characters.  We
-			   don't need to handle them for multibyte.  */
+			   don't need to handle them for multibyte.
+			   re_wctype_to_bit yields 0 for those classes.  */
 
 			if (multibyte)
-			  {
-			    int bit = 0;
-
-			    if (is_alnum) bit = BIT_ALNUM;
-			    if (is_alpha) bit = BIT_ALPHA;
-			    if (is_ascii) bit = BIT_ASCII;
-			    if (is_graph) bit = BIT_GRAPH;
-			    if (is_lower) bit = BIT_LOWER;
-			    if (is_multibyte) bit = BIT_MULTIBYTE;
-			    if (is_nonascii) bit = BIT_NONASCII;
-			    if (is_print) bit = BIT_PRINT;
-			    if (is_punct) bit = BIT_PUNCT;
-			    if (is_space) bit = BIT_SPACE;
-			    if (is_unibyte) bit = BIT_UNIBYTE;
-			    if (is_upper) bit = BIT_UPPER;
-			    if (is_word) bit = BIT_WORD;
-			    if (bit)
-			      SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
-							     bit);
-			  }
+			  SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
+							 re_wctype_to_bit (cc));
 
-			/* Handle character classes for ASCII characters.  */
-			for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
+                        for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
 			  {
 			    int translated = TRANSLATE (ch);
-			    /* This was split into 3 if's to
-			       avoid an arbitrary limit in some compiler.  */
-			    if (   (is_alnum  && ISALNUM (ch))
-				|| (is_alpha  && ISALPHA (ch))
-				|| (is_blank  && ISBLANK (ch))
-				|| (is_cntrl  && ISCNTRL (ch)))
-			      SET_LIST_BIT (translated);
-			    if (   (is_digit  && ISDIGIT (ch))
-				|| (is_graph  && ISGRAPH (ch))
-				|| (is_lower  && ISLOWER (ch))
-				|| (is_print  && ISPRINT (ch)))
-			      SET_LIST_BIT (translated);
-			    if (   (is_punct  && ISPUNCT (ch))
-				|| (is_space  && ISSPACE (ch))
-				|| (is_upper  && ISUPPER (ch))
-				|| (is_xdigit && ISXDIGIT (ch)))
-			      SET_LIST_BIT (translated);
-			    if (   (is_ascii  && IS_REAL_ASCII (ch))
-				|| (is_nonascii && !IS_REAL_ASCII (ch))
-				|| (is_unibyte && ISUNIBYTE (ch))
-				|| (is_multibyte && !ISUNIBYTE (ch)))
-			      SET_LIST_BIT (translated);
-
-			    if (   (is_word   && ISWORD (ch)))
+			    if (re_iswctype (btowc (ch), cc))
 			      SET_LIST_BIT (translated);
 			  }
 
@@ -4972,17 +5023,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
 	      {
 		int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
 
-		if (  (class_bits & BIT_ALNUM && ISALNUM (c))
-		    | (class_bits & BIT_ALPHA && ISALPHA (c))
-		    | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
-		    | (class_bits & BIT_GRAPH && ISGRAPH (c))
-		    | (class_bits & BIT_LOWER && ISLOWER (c))
-		    | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
-		    | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
-		    | (class_bits & BIT_PRINT && ISPRINT (c))
+		if (  (class_bits & BIT_LOWER && ISLOWER (c))
+		    | (class_bits & BIT_MULTIBYTE)
 		    | (class_bits & BIT_PUNCT && ISPUNCT (c))
 		    | (class_bits & BIT_SPACE && ISSPACE (c))
-		    | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
 		    | (class_bits & BIT_UPPER && ISUPPER (c))
 		    | (class_bits & BIT_WORD  && ISWORD (c)))
 		  not = !not;
-- 
2.39.5