src/character.c (alphabeticp, decimalnump): New functions.
src/character.h (alphabeticp, decimalnump): Add prototypes.
src/regex.c (ISALNUM, ISALPHA): Check Unicode character properties
for multibyte characters by calling alphabeticp and decimalnump.
(BIT_ALPHA, BIT_ALNUM): New bit masks.
(re_wctype_to_bit): Return them when the class is RECC_ALPHA or
RECC_ALNUM.
(re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.
doc/lispref/searching.texi (Char Classes): Update the documentation of
[:alpha:] and [:alnum:].
etc/NEWS: Mention the changes in [:alpha:] and [:alnum:].
+2015-02-28 Eli Zaretskii <eliz@gnu.org>
+
+ * searching.texi (Char Classes): Update the documentation of
+ [:alpha:] and [:alnum:]. (Bug#19878)
+
2015-02-27 Eli Zaretskii <eliz@gnu.org>
* os.texi (Startup Summary):
@item [:ascii:]
This matches any @acronym{ASCII} character (codes 0--127).
@item [:alnum:]
-This matches any letter or digit. (At present, for multibyte
-characters, it matches anything that has word syntax.)
+This matches any letter or digit. For multibyte characters, it
+matches characters whose Unicode @samp{general-category} property
+(@pxref{Character Properties}) indicates they are alphabetic or
+decimal number characters.
@item [:alpha:]
-This matches any letter. (At present, for multibyte characters, it
-matches anything that has word syntax.)
+This matches any letter. For multibyte characters, it matches
+characters whose Unicode @samp{general-category} property
+(@pxref{Character Properties}) indicates they are alphabetic
+characters.
@item [:blank:]
This matches space and tab only.
@item [:cntrl:]
denied" instead of "permission denied". The old behavior was problematic
in languages like German where downcasing rules depend on grammar.
++++
+** The character classes [:alpha:] and [:alnum:] in regular expressions
+now match multibyte characters using Unicode character properties.
+If you want the old behavior where they matched any character with
+word syntax, use `\sw' instead.
+
\f
* Lisp Changes in Emacs 25.1
+2015-02-28 Eli Zaretskii <eliz@gnu.org>
+
+ * character.c (alphabeticp, decimalnump): New functions.
+ * character.h (alphabeticp, decimalnump): Add prototypes.
+
+ * regex.c (ISALNUM, ISALPHA): Check Unicode character properties
+ for multibyte characters by calling alphabeticp and decimalnump.
+ (BIT_ALPHA, BIT_ALNUM): New bit masks.
+ (re_wctype_to_bit): Return them when the class is RECC_ALPHA or
+ RECC_ALNUM.
+ (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.
+ (Bug#19878)
+
2015-02-27 Jan Djärv <jan.h.d@swipnet.se>
* xterm.h (x_real_pos_and_offsets): Take outer_border as arg also.
#ifdef emacs
+/* Return 'true' if C is an alphabetic character as defined by its
+   Unicode properties.  The decision is based on the general-category
+   value recorded for C in Vunicode_category_table.  If that entry is
+   not an integer, C is treated as non-alphabetic and 'false' is
+   returned.  */
+bool
+alphabeticp (int c)
+{
+  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+
+  if (INTEGERP (category))
+    {
+      unicode_category_t gen_cat = XINT (category);
+
+      /* See UTS #18.  There are additional characters that should be
+	 here, those designated as Other_uppercase, Other_lowercase,
+	 and Other_alphabetic; FIXME.  */
+      return (gen_cat == UNICODE_CATEGORY_Lu
+	      || gen_cat == UNICODE_CATEGORY_Ll
+	      || gen_cat == UNICODE_CATEGORY_Lt
+	      || gen_cat == UNICODE_CATEGORY_Lm
+	      || gen_cat == UNICODE_CATEGORY_Lo
+	      || gen_cat == UNICODE_CATEGORY_Mn
+	      || gen_cat == UNICODE_CATEGORY_Mc
+	      || gen_cat == UNICODE_CATEGORY_Me
+	      || gen_cat == UNICODE_CATEGORY_Nl);
+    }
+
+  /* No integer category is available for C.  Return explicitly:
+     reaching the end of a non-void function whose value the caller
+     uses is undefined behavior.  */
+  return false;
+}
+
+/* Return 'true' if C is a decimal-number character as defined by its
+   Unicode properties.  The decision is based on the general-category
+   value recorded for C in Vunicode_category_table.  If that entry is
+   not an integer, C is treated as not being a decimal number and
+   'false' is returned.  */
+bool
+decimalnump (int c)
+{
+  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+
+  if (INTEGERP (category))
+    {
+      unicode_category_t gen_cat = XINT (category);
+
+      /* See UTS #18.  */
+      return gen_cat == UNICODE_CATEGORY_Nd;
+    }
+
+  /* No integer category is available for C.  Return explicitly:
+     reaching the end of a non-void function whose value the caller
+     uses is undefined behavior.  */
+  return false;
+}
+
void
syms_of_character (void)
{
extern Lisp_Object Vchar_unify_table;
extern Lisp_Object string_escape_byte8 (Lisp_Object);
+extern bool alphabeticp (int);
+extern bool decimalnump (int);
+
/* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \
(XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))
? (((c) >= 'a' && (c) <= 'z') \
|| ((c) >= 'A' && (c) <= 'Z') \
|| ((c) >= '0' && (c) <= '9')) \
- : SYNTAX (c) == Sword)
+ : (alphabeticp (c) || decimalnump (c)))
# define ISALPHA(c) (IS_REAL_ASCII (c) \
? (((c) >= 'a' && (c) <= 'z') \
|| ((c) >= 'A' && (c) <= 'Z')) \
- : SYNTAX (c) == Sword)
+ : alphabeticp (c))
# define ISLOWER(c) lowercasep (c)
#define BIT_SPACE 0x8
#define BIT_UPPER 0x10
#define BIT_MULTIBYTE 0x20
+#define BIT_ALPHA 0x40
+#define BIT_ALNUM 0x80
\f
/* Set the bit for character C in a list. */
{
case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
case RECC_MULTIBYTE: return BIT_MULTIBYTE;
- case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
+ case RECC_ALPHA: return BIT_ALPHA;
+ case RECC_ALNUM: return BIT_ALNUM;
+ case RECC_WORD: return BIT_WORD;
case RECC_LOWER: return BIT_LOWER;
case RECC_UPPER: return BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT;
#endif /* emacs */
/* In most cases the matching rule for char classes
only uses the syntax table for multibyte chars,
- so that the content of the syntax-table it is not
+ so that the content of the syntax-table is not
hardcoded in the range_table. SPACE and WORD are
the two exceptions. */
if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
p = class_beg;
SET_LIST_BIT ('[');
- /* Because the `:' may starts the range, we
+ /* Because the `:' may start the range, we
can't simply set bit and repeat the loop.
Instead, just set it to C and handle below. */
c = ':';
| (class_bits & BIT_PUNCT && ISPUNCT (c))
| (class_bits & BIT_SPACE && ISSPACE (c))
| (class_bits & BIT_UPPER && ISUPPER (c))
- | (class_bits & BIT_WORD && ISWORD (c)))
+ | (class_bits & BIT_WORD && ISWORD (c))
+ | (class_bits & BIT_ALPHA && ISALPHA (c))
+ | (class_bits & BIT_ALNUM && ISALNUM (c)))
not = !not;
else
CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);