From: Eli Zaretskii Date: Sat, 28 Feb 2015 12:25:35 +0000 (+0200) Subject: Improve [:alpha:] and [:alnum:] for multibyte characters (Bug#19878) X-Git-Tag: emacs-25.0.90~2564^2~272 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=1a50945fa4c666ae2ab5cd9419d23ad063ea1249;p=emacs.git Improve [:alpha:] and [:alnum:] for multibyte characters (Bug#19878) src/character.c (alphabeticp, decimalnump): New functions. src/character.h (alphabeticp, decimalnump): Add prototypes. src/regex.c (ISALNUM, ISALPHA): Check Unicode character properties for multibyte characters by calling alphabeticp and decimalnump. (BIT_ALPHA, BIT_ALNUM): New bit masks. (re_wctype_to_bit): Return them when the class is RECC_ALPHA or RECC_ALNUM. (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate. doc/lispref/searching.texi (Char Classes): Update the documentation of [:alpha:] and [:alnum:]. etc/NEWS: Mention the changes in [:alpha:] and [:alnum:]. --- diff --git a/doc/lispref/ChangeLog b/doc/lispref/ChangeLog index bff469a5188..78f7e34ca01 100644 --- a/doc/lispref/ChangeLog +++ b/doc/lispref/ChangeLog @@ -1,3 +1,8 @@ +2015-02-28 Eli Zaretskii + + * searching.texi (Char Classes): Update the documentation of + [:alpha:] and [:alnum:]. (Bug#19878) + 2015-02-27 Eli Zaretskii * os.texi (Startup Summary): diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 61fac78e4a8..87513e8f9ce 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -541,11 +541,15 @@ and what they mean: @item [:ascii:] This matches any @acronym{ASCII} character (codes 0--127). @item [:alnum:] -This matches any letter or digit. (At present, for multibyte -characters, it matches anything that has word syntax.) +This matches any letter or digit. For multibyte characters, it +matches characters whose Unicode @samp{general-category} property +(@pxref{Character Properties}) indicates they are alphabetic or +decimal number characters. 
@item [:alpha:] -This matches any letter. (At present, for multibyte characters, it -matches anything that has word syntax.) +This matches any letter. For multibyte characters, it matches +characters whose Unicode @samp{general-category} property +(@pxref{Character Properties}) indicates they are alphabetic +characters. @item [:blank:] This matches space and tab only. @item [:cntrl:] diff --git a/etc/NEWS b/etc/NEWS index d5cb9474c83..3be820e0d5f 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -612,6 +612,12 @@ when signaling a file error. For example, it now reports "Permission denied" instead of "permission denied". The old behavior was problematic in languages like German where downcasing rules depend on grammar. ++++ +** The character classes [:alpha:] and [:alnum:] in regular expressions +now match multibyte characters using Unicode character properties. +If you want the old behavior where they matched any character with +word syntax, use `\sw' instead. + * Lisp Changes in Emacs 25.1 diff --git a/src/ChangeLog b/src/ChangeLog index df687914911..97ecbac0953 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,16 @@ +2015-02-28 Eli Zaretskii + + * character.c (alphabeticp, decimalnump): New functions. + * character.h (alphabeticp, decimalnump): Add prototypes. + + * regex.c (ISALNUM, ISALPHA): Check Unicode character properties + for multibyte characters by calling alphabeticp and decimalnump. + (BIT_ALPHA, BIT_ALNUM): New bit masks. + (re_wctype_to_bit): Return them when the class is RECC_ALPHA or + RECC_ALNUM. + (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate. + (Bug#19878) + 2015-02-27 Jan Djärv * xterm.h (x_real_pos_and_offsets): Take outer_border as arg also. diff --git a/src/character.c b/src/character.c index 39d32c9d41a..999f99aa003 100644 --- a/src/character.c +++ b/src/character.c @@ -984,6 +984,48 @@ character is not ASCII nor 8-bit character, an error is signaled. 
*/) #ifdef emacs +/* Return 'true' if C is an alphabetic character as defined by its + Unicode properties. */ +bool +alphabeticp (int c) +{ + Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); + + if (INTEGERP (category)) + { + unicode_category_t gen_cat = XINT (category); + + /* See UTS #18. There are additional characters that should be + here, those designated as Other_uppercase, Other_lowercase, + and Other_alphabetic; FIXME. */ + return (gen_cat == UNICODE_CATEGORY_Lu + || gen_cat == UNICODE_CATEGORY_Ll + || gen_cat == UNICODE_CATEGORY_Lt + || gen_cat == UNICODE_CATEGORY_Lm + || gen_cat == UNICODE_CATEGORY_Lo + || gen_cat == UNICODE_CATEGORY_Mn + || gen_cat == UNICODE_CATEGORY_Mc + || gen_cat == UNICODE_CATEGORY_Me + || gen_cat == UNICODE_CATEGORY_Nl) ? true : false; + } +} + +/* Return 'true' if C is a decimal-number character as defined by its + Unicode properties. */ +bool +decimalnump (int c) +{ + Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); + + if (INTEGERP (category)) + { + unicode_category_t gen_cat = XINT (category); + + /* See UTS #18. */ + return (gen_cat == UNICODE_CATEGORY_Nd) ? true : false; + } +} + void syms_of_character (void) { diff --git a/src/character.h b/src/character.h index 5043880cb42..7d902952db6 100644 --- a/src/character.h +++ b/src/character.h @@ -660,6 +660,9 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t, extern Lisp_Object Vchar_unify_table; extern Lisp_Object string_escape_byte8 (Lisp_Object); +extern bool alphabeticp (int); +extern bool decimalnump (int); + /* Return a translation table of id number ID. */ #define GET_TRANSLATION_TABLE(id) \ (XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)])) diff --git a/src/regex.c b/src/regex.c index 41fe3fa8088..1afc5037594 100644 --- a/src/regex.c +++ b/src/regex.c @@ -324,12 +324,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; ?
(((c) >= 'a' && (c) <= 'z') \ || ((c) >= 'A' && (c) <= 'Z') \ || ((c) >= '0' && (c) <= '9')) \ - : SYNTAX (c) == Sword) + : (alphabeticp (c) || decimalnump (c))) # define ISALPHA(c) (IS_REAL_ASCII (c) \ ? (((c) >= 'a' && (c) <= 'z') \ || ((c) >= 'A' && (c) <= 'Z')) \ - : SYNTAX (c) == Sword) + : alphabeticp (c)) # define ISLOWER(c) lowercasep (c) @@ -1872,6 +1872,8 @@ struct range_table_work_area #define BIT_SPACE 0x8 #define BIT_UPPER 0x10 #define BIT_MULTIBYTE 0x20 +#define BIT_ALPHA 0x40 +#define BIT_ALNUM 0x80 /* Set the bit for character C in a list. */ @@ -2072,7 +2074,9 @@ re_wctype_to_bit (re_wctype_t cc) { case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: case RECC_MULTIBYTE: return BIT_MULTIBYTE; - case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD; + case RECC_ALPHA: return BIT_ALPHA; + case RECC_ALNUM: return BIT_ALNUM; + case RECC_WORD: return BIT_WORD; case RECC_LOWER: return BIT_LOWER; case RECC_UPPER: return BIT_UPPER; case RECC_PUNCT: return BIT_PUNCT; @@ -2930,7 +2934,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, #endif /* emacs */ /* In most cases the matching rule for char classes only uses the syntax table for multibyte chars, - so that the content of the syntax-table it is not + so that the content of the syntax-table is not hardcoded in the range_table. SPACE and WORD are the two exceptions. */ if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) @@ -2945,7 +2949,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, p = class_beg; SET_LIST_BIT ('['); - /* Because the `:' may starts the range, we + /* Because the `:' may start the range, we can't simply set bit and repeat the loop. Instead, just set it to C and handle below. 
*/ c = ':'; @@ -5513,7 +5517,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, | (class_bits & BIT_PUNCT && ISPUNCT (c)) | (class_bits & BIT_SPACE && ISSPACE (c)) | (class_bits & BIT_UPPER && ISUPPER (c)) - | (class_bits & BIT_WORD && ISWORD (c))) + | (class_bits & BIT_WORD && ISWORD (c)) + | (class_bits & BIT_ALPHA && ISALPHA (c)) + | (class_bits & BIT_ALNUM && ISALNUM (c))) not = !not; else CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);