Improve [:alpha:] and [:alnum:] for multibyte characters (Bug#19878)

author Eli Zaretskii <eliz@gnu.org>

Sat, 28 Feb 2015 12:25:35 +0000 (14:25 +0200)

committer Eli Zaretskii <eliz@gnu.org>

Sat, 28 Feb 2015 12:25:35 +0000 (14:25 +0200)
author Eli Zaretskii <eliz@gnu.org>
Sat, 28 Feb 2015 12:25:35 +0000 (14:25 +0200)
committer Eli Zaretskii <eliz@gnu.org>
Sat, 28 Feb 2015 12:25:35 +0000 (14:25 +0200)
diff --git a/doc/lispref/ChangeLog b/doc/lispref/ChangeLog

index bff469a5188e3f1c16fe800579fa5bed96d4c96f..78f7e34ca0106a324d0b9e012717456bf7abac49 100644 (file)
--- a/doc/lispref/ChangeLog
+++ b/doc/lispref/ChangeLog
@@ -1,3 +1,8 @@
+2015-02-28  Eli Zaretskii  <eliz@gnu.org>
+
+       * searching.texi (Char Classes): Update the documentation of
+       [:alpha:] and [:alnum:].  (Bug#19878)
+
  2015-02-27  Eli Zaretskii  <eliz@gnu.org>
  
         * os.texi (Startup Summary):
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi

index 61fac78e4a8d16c0e4346832f677220c3794b456..87513e8f9ce252bc2effd5ef03694f7c4345eb72 100644 (file)
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -541,11 +541,15 @@ and what they mean:
  @item [:ascii:]
  This matches any @acronym{ASCII} character (codes 0--127).
  @item [:alnum:]
-This matches any letter or digit.  (At present, for multibyte
-characters, it matches anything that has word syntax.)
+This matches any letter or digit.  For multibyte characters, it
+matches characters whose Unicode @samp{general-category} property
+(@pxref{Character Properties}) indicates they are alphabetic or
+decimal number characters.
  @item [:alpha:]
-This matches any letter.  (At present, for multibyte characters, it
-matches anything that has word syntax.)
+This matches any letter.  For multibyte characters, it matches
+characters whose Unicode @samp{general-category} property
+(@pxref{Character Properties}) indicates they are alphabetic
+characters.
  @item [:blank:]
  This matches space and tab only.
  @item [:cntrl:]
diff --git a/etc/NEWS b/etc/NEWS

index d5cb9474c83b07646085a7f995548cbef6296f80..3be820e0d5f5682e1a636c15279c5a7ec9eb78f4 100644 (file)
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -612,6 +612,12 @@ when signaling a file error.  For example, it now reports "Permission
  denied" instead of "permission denied".  The old behavior was problematic
  in languages like German where downcasing rules depend on grammar.
  
++++
+** The character classes [:alpha:] and [:alnum:] in regular expressions
+now match multibyte characters using Unicode character properties.
+If you want the old behavior where they matched any character with
+word syntax, use `\sw' instead.
+
  \f
  * Lisp Changes in Emacs 25.1
  
diff --git a/src/ChangeLog b/src/ChangeLog

index df687914911166cfde2a7b5cf443add76f1fe4e2..97ecbac0953a9a657ec999c4486f5469818c54ae 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,16 @@
+2015-02-28  Eli Zaretskii  <eliz@gnu.org>
+
+       * character.c (alphabeticp, decimalnump): New functions.
+       * character.h (alphabeticp, decimalnump): Add prototypes.
+
+       * regex.c (ISALNUM, ISALPHA): Check Unicode character properties
+       for multibyte characters by calling alphabeticp and decimalnump.
+       (BIT_ALPHA, BIT_ALNUM): New bit masks.
+       (re_wctype_to_bit): Return them when the class is RECC_ALPHA or
+       RECC_ALNUM.
+       (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.
+       (Bug#19878)
+
  2015-02-27  Jan Djärv  <jan.h.d@swipnet.se>
  
         * xterm.h (x_real_pos_and_offsets): Take outer_border as arg also.
diff --git a/src/character.c b/src/character.c

index 39d32c9d41af9e8296eefea2e476139a9a61f9b5..999f99aa0030c0416c1ffcf721b023401007625e 100644 (file)
--- a/src/character.c
+++ b/src/character.c
@@ -984,6 +984,48 @@ character is not ASCII nor 8-bit character, an error is signaled.  */)
  
  #ifdef emacs
  
+/* Return 'true' if C is an alphabetic character as defined by its
+   Unicode properties, 'false' if not or if C has no integer category.  */
+bool
+alphabeticp (int c)
+{
+  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+  unicode_category_t gen_cat;
+
+  /* Return explicitly rather than falling off the end of the function.  */
+  if (! INTEGERP (category))
+    return false;
+  gen_cat = XINT (category);
+
+  /* See UTS #18.  Characters designated as Other_uppercase,
+     Other_lowercase, and Other_alphabetic should be here too; FIXME.  */
+  return (gen_cat == UNICODE_CATEGORY_Lu
+             || gen_cat == UNICODE_CATEGORY_Ll
+             || gen_cat == UNICODE_CATEGORY_Lt
+             || gen_cat == UNICODE_CATEGORY_Lm
+             || gen_cat == UNICODE_CATEGORY_Lo
+             || gen_cat == UNICODE_CATEGORY_Mn
+             || gen_cat == UNICODE_CATEGORY_Mc
+             || gen_cat == UNICODE_CATEGORY_Me
+             || gen_cat == UNICODE_CATEGORY_Nl);
+}
+
+/* Return 'true' if C is a decimal-number character as defined by its
+   Unicode properties, 'false' if not or if C has no integer category.  */
+bool
+decimalnump (int c)
+{
+  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+  unicode_category_t gen_cat;
+
+  /* Return explicitly rather than falling off the end of the function.  */
+  if (! INTEGERP (category))
+    return false;
+  gen_cat = XINT (category);
+
+  return gen_cat == UNICODE_CATEGORY_Nd;  /* See UTS #18.  */
+}
+
  void
  syms_of_character (void)
  {
diff --git a/src/character.h b/src/character.h

index 5043880cb42f94a5147161cc783eff8a32733a93..7d902952db6c3450e54adb6e6405f2b9fbecfab0 100644 (file)
--- a/src/character.h
+++ b/src/character.h
@@ -660,6 +660,9 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
  extern Lisp_Object Vchar_unify_table;
  extern Lisp_Object string_escape_byte8 (Lisp_Object);
  
+extern bool alphabeticp (int);
+extern bool decimalnump (int);
+
  /* Return a translation table of id number ID.  */
  #define GET_TRANSLATION_TABLE(id) \
    (XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))
diff --git a/src/regex.c b/src/regex.c

index 41fe3fa80882687c72783fd22e6eabb5537bb25e..1afc5037594096ad0b1b8cdcda7158fb1dc48afe 100644 (file)
--- a/src/regex.c
+++ b/src/regex.c
@@ -324,12 +324,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
                     ? (((c) >= 'a' && (c) <= 'z')       \
                        || ((c) >= 'A' && (c) <= 'Z')    \
                        || ((c) >= '0' && (c) <= '9'))   \
-                   : SYNTAX (c) == Sword)
+                   : (alphabeticp (c) || decimalnump (c)))
  
  # define ISALPHA(c) (IS_REAL_ASCII (c)                 \
                     ? (((c) >= 'a' && (c) <= 'z')       \
                        || ((c) >= 'A' && (c) <= 'Z'))   \
-                   : SYNTAX (c) == Sword)
+                   : alphabeticp (c))
  
  # define ISLOWER(c) lowercasep (c)
  
@@ -1872,6 +1872,8 @@ struct range_table_work_area
  #define BIT_SPACE      0x8
  #define BIT_UPPER      0x10
  #define BIT_MULTIBYTE  0x20
+#define BIT_ALPHA      0x40
+#define BIT_ALNUM      0x80
  \f
  
  /* Set the bit for character C in a list.  */
@@ -2072,7 +2074,9 @@ re_wctype_to_bit (re_wctype_t cc)
      {
      case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
      case RECC_MULTIBYTE: return BIT_MULTIBYTE;
-    case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
+    case RECC_ALPHA: return BIT_ALPHA;
+    case RECC_ALNUM: return BIT_ALNUM;
+    case RECC_WORD: return BIT_WORD;
      case RECC_LOWER: return BIT_LOWER;
      case RECC_UPPER: return BIT_UPPER;
      case RECC_PUNCT: return BIT_PUNCT;
@@ -2930,7 +2934,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
  #endif /* emacs */
                         /* In most cases the matching rule for char classes
                            only uses the syntax table for multibyte chars,
-                          so that the content of the syntax-table it is not
+                          so that the content of the syntax-table is not
                            hardcoded in the range_table.  SPACE and WORD are
                            the two exceptions.  */
                         if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
@@ -2945,7 +2949,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
                         p = class_beg;
                         SET_LIST_BIT ('[');
  
-                       /* Because the `:' may starts the range, we
+                       /* Because the `:' may start the range, we
                            can't simply set bit and repeat the loop.
                            Instead, just set it to C and handle below.  */
                         c = ':';
@@ -5513,7 +5517,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
                     | (class_bits & BIT_PUNCT && ISPUNCT (c))
                     | (class_bits & BIT_SPACE && ISSPACE (c))
                     | (class_bits & BIT_UPPER && ISUPPER (c))
-                   | (class_bits & BIT_WORD  && ISWORD (c)))
+                   | (class_bits & BIT_WORD  && ISWORD  (c))
+                   | (class_bits & BIT_ALPHA && ISALPHA (c))
+                   | (class_bits & BIT_ALNUM && ISALNUM (c)))
                   not = !not;
                 else
                   CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
author	Eli Zaretskii <eliz@gnu.org>
	Sat, 28 Feb 2015 12:25:35 +0000 (14:25 +0200)
committer	Eli Zaretskii <eliz@gnu.org>
	Sat, 28 Feb 2015 12:25:35 +0000 (14:25 +0200)
doc/lispref/ChangeLog		patch \| blob \| history
doc/lispref/searching.texi		patch \| blob \| history
etc/NEWS		patch \| blob \| history
src/ChangeLog		patch \| blob \| history
src/character.c		patch \| blob \| history
src/character.h		patch \| blob \| history
src/regex.c		patch \| blob \| history