From: Eli Zaretskii Date: Tue, 14 Apr 2015 15:47:04 +0000 (+0300) Subject: Make [:print:] support non-ASCII characters correctly X-Git-Tag: emacs-25.0.90~2426 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=6c284c6b5828bc4407f7201499e0507ce0e5a0a0;p=emacs.git Make [:print:] support non-ASCII characters correctly * src/regex.c (ISPRINT): Call 'printablep' for multibyte characters. (BIT_PRINT): New bit mask. (re_wctype_to_bit): Return BIT_PRINT for RECC_PRINT. * src/character.c (printablep): New function. * src/character.h (printablep): Add prototype. * lisp/emacs-lisp/rx.el (rx): Doc fix: document the new behavior of 'print', 'alnum', and 'alphabetic'. * doc/lispref/searching.texi (Char Classes): Document the new behavior of [:print:]. * etc/NEWS: Mention the new behavior of [:print:]. --- diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 87513e8f9ce..238d814a9dc 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -569,8 +569,11 @@ This matches any multibyte character (@pxref{Text Representations}). @item [:nonascii:] This matches any non-@acronym{ASCII} character. @item [:print:] -This matches printing characters---everything except @acronym{ASCII} control -characters and the delete character. +This matches printing characters---everything except @acronym{ASCII} +and non-@acronym{ASCII} control characters (including the delete +character), surrogates, and codepoints unassigned by Unicode, as +indicated by the Unicode @samp{general-category} property +(@pxref{Character Properties}). @item [:punct:] This matches any punctuation character. (At present, for multibyte characters, it matches anything that has non-word syntax.) diff --git a/etc/NEWS b/etc/NEWS index 6d8b4c6faf8..907787a1f3e 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -628,6 +628,14 @@ notifications, if Emacs is compiled with file notification support. 
--- *** gulp.el ++++ +** The character class [:print:] in regular expressions +no longer matches every multibyte character. Instead, Emacs now +consults the Unicode character properties to determine which +characters are printable. In particular, surrogates and unassigned +codepoints are now rejected by this class. If you want the old +behavior, use [:multibyte:] instead. + * New Modes and Packages in Emacs 25.1 diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 20af59f2abf..a5a228e5876 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -969,16 +969,16 @@ CHAR space, and DEL. `printing', `print' - matches printing characters--everything except ASCII control chars - and DEL. + matches printing characters--everything except ASCII and non-ASCII + control characters, surrogates, and codepoints unassigned by Unicode. `alphanumeric', `alnum' - matches letters and digits. (But at present, for multibyte characters, - it matches anything that has word syntax.) + matches alphabetic characters and digits. (For multibyte characters, + it matches according to Unicode character properties.) `letter', `alphabetic', `alpha' - matches letters. (But at present, for multibyte characters, - it matches anything that has word syntax.) + matches alphabetic characters. (For multibyte characters, + it matches according to Unicode character properties.) `ascii' matches ASCII (unibyte) characters. diff --git a/src/character.c b/src/character.c index ad78f512f43..b357dd5a334 100644 --- a/src/character.c +++ b/src/character.c @@ -1022,6 +1022,22 @@ decimalnump (int c) return gen_cat == UNICODE_CATEGORY_Nd; } +/* Return 'true' if C is a printable character as defined by its + Unicode properties. */ +bool +printablep (int c) +{ + Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); + if (! INTEGERP (category)) + return false; + EMACS_INT gen_cat = XINT (category); + + /* See UTS #18. 
*/ + return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */ + || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */ + || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */ +} + void syms_of_character (void) { diff --git a/src/character.h b/src/character.h index 7d902952db6..1a5d2c8a670 100644 --- a/src/character.h +++ b/src/character.h @@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object); extern bool alphabeticp (int); extern bool decimalnump (int); +extern bool printablep (int); /* Return a translation table of id number ID. */ #define GET_TRANSLATION_TABLE(id) \ diff --git a/src/regex.c b/src/regex.c index 1afc5037594..b9d09d02c22 100644 --- a/src/regex.c +++ b/src/regex.c @@ -318,7 +318,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ - : 1) + : printablep (c)) # define ISALNUM(c) (IS_REAL_ASCII (c) \ ? (((c) >= 'a' && (c) <= 'z') \ @@ -1865,7 +1865,8 @@ struct range_table_work_area #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) /* Bits used to implement the multibyte-part of the various character classes - such as [:alnum:] in a charset's range table. */ + such as [:alnum:] in a charset's range table. The code currently assumes + that only the low 16 bits are used. */ #define BIT_WORD 0x1 #define BIT_LOWER 0x2 #define BIT_PUNCT 0x4 @@ -1874,6 +1875,7 @@ struct range_table_work_area #define BIT_MULTIBYTE 0x20 #define BIT_ALPHA 0x40 #define BIT_ALNUM 0x80 +#define BIT_PRINT 0x100 /* Set the bit for character C in a list. 
*/ @@ -2072,7 +2074,7 @@ re_wctype_to_bit (re_wctype_t cc) { switch (cc) { - case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: + case RECC_NONASCII: case RECC_GRAPH: case RECC_MULTIBYTE: return BIT_MULTIBYTE; case RECC_ALPHA: return BIT_ALPHA; case RECC_ALNUM: return BIT_ALNUM; @@ -2081,6 +2083,7 @@ re_wctype_to_bit (re_wctype_t cc) case RECC_UPPER: return BIT_UPPER; case RECC_PUNCT: return BIT_PUNCT; case RECC_SPACE: return BIT_SPACE; + case RECC_PRINT: return BIT_PRINT; case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; default: