* src/regex.c (ISPRINT): Call 'printablep' for multibyte characters.
(BIT_PRINT): New bit mask.
(re_wctype_to_bit): Return BIT_PRINT for RECC_PRINT.
* src/character.c (printablep): New function.
* src/character.h (printablep): Add prototype.
* lisp/emacs-lisp/rx.el (rx): Doc fix: document the new behavior
of 'print', 'alnum', and 'alphabetic'.
* doc/lispref/searching.texi (Char Classes): Document the new
behavior of [:print:].
* etc/NEWS: Mention the new behavior of [:print:].
@item [:nonascii:]
This matches any non-@acronym{ASCII} character.
@item [:print:]
-This matches printing characters---everything except @acronym{ASCII} control
-characters and the delete character.
+This matches printing characters---everything except @acronym{ASCII}
+and non-@acronym{ASCII} control characters (including the delete
+character), surrogates, and codepoints unassigned by Unicode, as
+indicated by the Unicode @samp{general-category} property
+(@pxref{Character Properties}).
@item [:punct:]
This matches any punctuation character. (At present, for multibyte
characters, it matches anything that has non-word syntax.)
---
*** gulp.el
++++
+** The character class [:print:] in regular expressions
+no longer matches every multibyte character. Instead, Emacs now
+consults the Unicode character properties to determine which
+characters are printable. In particular, surrogates and unassigned
+codepoints are now rejected by this class. If you want the old
+behavior, use [:multibyte:] instead.
+
\f
* New Modes and Packages in Emacs 25.1
space, and DEL.
`printing', `print'
- matches printing characters--everything except ASCII control chars
- and DEL.
+ matches printing characters--everything except ASCII and non-ASCII
+ control characters, surrogates, and codepoints unassigned by Unicode.
`alphanumeric', `alnum'
- matches letters and digits. (But at present, for multibyte characters,
- it matches anything that has word syntax.)
+ matches alphabetic characters and digits. (For multibyte characters,
+ it matches according to Unicode character properties.)
`letter', `alphabetic', `alpha'
- matches letters. (But at present, for multibyte characters,
- it matches anything that has word syntax.)
+ matches alphabetic characters. (For multibyte characters,
+ it matches according to Unicode character properties.)
`ascii'
matches ASCII (unibyte) characters.
return gen_cat == UNICODE_CATEGORY_Nd;
}
+/* Return 'true' if C is a printable character as defined by its
+ Unicode properties: the general category recorded for C in
+ Vunicode_category_table must be something other than Cc (control),
+ Cs (surrogate), or Cn (unassigned). Return 'false' when the table
+ entry for C is not an integer. */
+bool
+printablep (int c)
+{
+ Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+ if (! INTEGERP (category)) /* no usable category: not printable */
+ return false;
+ EMACS_INT gen_cat = XINT (category);
+
+ /* See UTS #18 (Unicode Regular Expressions). Every general category
+ other than the three tested below counts as printable. */
+ return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
+ || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
+ || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
+}
+
void
syms_of_character (void)
{
extern bool alphabeticp (int);
extern bool decimalnump (int);
+extern bool printablep (int);
/* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
- : 1)
+ : printablep (c))
# define ISALNUM(c) (IS_REAL_ASCII (c) \
? (((c) >= 'a' && (c) <= 'z') \
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
/* Bits used to implement the multibyte-part of the various character classes
- such as [:alnum:] in a charset's range table. */
+ such as [:alnum:] in a charset's range table. The code currently assumes
+ that only the low 16 bits are used. */
#define BIT_WORD 0x1
#define BIT_LOWER 0x2
#define BIT_PUNCT 0x4
#define BIT_MULTIBYTE 0x20
#define BIT_ALPHA 0x40
#define BIT_ALNUM 0x80
+#define BIT_PRINT 0x100
\f
/* Set the bit for character C in a list. */
{
switch (cc)
{
- case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
+ case RECC_NONASCII: case RECC_GRAPH:
case RECC_MULTIBYTE: return BIT_MULTIBYTE;
case RECC_ALPHA: return BIT_ALPHA;
case RECC_ALNUM: return BIT_ALNUM;
case RECC_UPPER: return BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT;
case RECC_SPACE: return BIT_SPACE;
+ case RECC_PRINT: return BIT_PRINT;
case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
default: