From a122a0276bddbda8ca84f9b94250a5a5f4e0582a Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Wed, 15 Apr 2015 00:26:32 -0700 Subject: [PATCH] Make [:graph:] act like [:print:] sans space In POSIX [[:print:]] is equivalent to [ [:graph:]], so change [:graph:] so that it matches everything that [:print:] does, except for space. * doc/lispref/searching.texi (Char Classes): * etc/NEWS: * lisp/emacs-lisp/rx.el (rx): Document [:graph:] to be [:print:] sans ' '. * src/character.c, src/character.h (graphicp): New function. * src/regex.c (ISGRAPH) [emacs]: Use it. (BIT_GRAPH): New macro. (BIT_PRINT): Increase to 0x200, to make room for BIT_GRAPH. (re_wctype_to_bit) [! WIDE_CHAR_SUPPORT]: Return BIT_GRAPH for RECC_GRAPH. (re_match_2_internal) [emacs]: Use ISGRAPH if BIT_GRAPH, and ISPRINT if BIT_PRINT. --- doc/lispref/searching.texi | 14 +++++++------- etc/NEWS | 10 +++++----- lisp/emacs-lisp/rx.el | 8 ++++---- src/character.c | 8 ++++++++ src/character.h | 1 + src/regex.c | 12 ++++++++---- 6 files changed, 33 insertions(+), 20 deletions(-) diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 238d814a9dc..10ea411d436 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -558,8 +558,11 @@ This matches any @acronym{ASCII} control character. This matches @samp{0} through @samp{9}. Thus, @samp{[-+[:digit:]]} matches any digit, as well as @samp{+} and @samp{-}. @item [:graph:] -This matches graphic characters---everything except @acronym{ASCII} control -characters, space, and the delete character. +This matches graphic characters---everything except space, +@acronym{ASCII} and non-@acronym{ASCII} control characters, +surrogates, and codepoints unassigned by Unicode, as indicated by the +Unicode @samp{general-category} property (@pxref{Character +Properties}). @item [:lower:] This matches any lower-case letter, as determined by the current case table (@pxref{Case Tables}). 
If @code{case-fold-search} is @@ -569,11 +572,8 @@ This matches any multibyte character (@pxref{Text Representations}). @item [:nonascii:] This matches any non-@acronym{ASCII} character. @item [:print:] -This matches printing characters---everything except @acronym{ASCII} -and non-@acronym{ASCII} control characters (including the delete -character), surrogates, and codepoints unassigned by Unicode, as -indicated by the Unicode @samp{general-category} property -(@pxref{Character Properties}). +This matches any printing character---either space, or a graphic +character matched by @samp{[:graph:]}. @item [:punct:] This matches any punctuation character. (At present, for multibyte characters, it matches anything that has non-word syntax.) diff --git a/etc/NEWS b/etc/NEWS index 907787a1f3e..d97e80a7171 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -629,12 +629,12 @@ notifications, if Emacs is compiled with file notification support. *** gulp.el +++ -** The character class [:print:] in regular expressions -no longer matches any multibyte character. Instead, Emacs now +** The character classes [:graph:] and [:print:] in regular expressions +no longer match every multibyte character. Instead, Emacs now consults the Unicode character properties to determine which -characters are printable. In particular, surrogates and unassigned -codepoints are now rejected by this class. If you want the old -behavior, use [:multibyte:] instead. +characters are graphic or printable. In particular, surrogates and +unassigned codepoints are now rejected. If you want the old behavior, +use [:multibyte:] instead. * New Modes and Packages in Emacs 25.1 diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index a5a228e5876..ab9beb60928 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -965,12 +965,12 @@ CHAR matches space and tab only. `graphic', `graph' - matches graphic characters--everything except ASCII control chars, - space, and DEL. 
+ matches graphic characters--everything except space, ASCII + and non-ASCII control characters, surrogates, and codepoints + unassigned by Unicode. `printing', `print' - matches printing characters--everything except ASCII and non-ASCII - control characters, surrogates, and codepoints unassigned by Unicode. + matches space and graphic characters. `alphanumeric', `alnum' matches alphabetic characters and digits. (For multibyte characters, diff --git a/src/character.c b/src/character.c index b357dd5a334..ea98cf68e6c 100644 --- a/src/character.c +++ b/src/character.c @@ -1022,6 +1022,14 @@ decimalnump (int c) return gen_cat == UNICODE_CATEGORY_Nd; } +/* Return 'true' if C is a graphic character as defined by its + Unicode properties, i.e. a printable character other than space. */ +bool +graphicp (int c) +{ + return c != ' ' && printablep (c); +} + /* Return 'true' if C is a printable character as defined by its Unicode properties. */ bool diff --git a/src/character.h b/src/character.h index 1a5d2c8a670..859d717a0ba 100644 --- a/src/character.h +++ b/src/character.h @@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object); extern bool alphabeticp (int); extern bool decimalnump (int); +extern bool graphicp (int); extern bool printablep (int); /* Return a translation table of id number ID. */ diff --git a/src/regex.c b/src/regex.c index b9d09d02c22..4af70c62cf5 100644 --- a/src/regex.c +++ b/src/regex.c @@ -314,7 +314,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \ - : 1) + : graphicp (c)) # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ @@ -1875,7 +1875,8 @@ struct range_table_work_area #define BIT_MULTIBYTE 0x20 #define BIT_ALPHA 0x40 #define BIT_ALNUM 0x80 -#define BIT_PRINT 0x100 +#define BIT_GRAPH 0x100 +#define BIT_PRINT 0x200 /* Set the bit for character C in a list. 
*/ @@ -2074,7 +2075,7 @@ re_wctype_to_bit (re_wctype_t cc) { switch (cc) { - case RECC_NONASCII: case RECC_GRAPH: + case RECC_NONASCII: case RECC_MULTIBYTE: return BIT_MULTIBYTE; case RECC_ALPHA: return BIT_ALPHA; case RECC_ALNUM: return BIT_ALNUM; @@ -2083,6 +2084,7 @@ re_wctype_to_bit (re_wctype_t cc) case RECC_UPPER: return BIT_UPPER; case RECC_PUNCT: return BIT_PUNCT; case RECC_SPACE: return BIT_SPACE; + case RECC_GRAPH: return BIT_GRAPH; case RECC_PRINT: return BIT_PRINT; case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; @@ -5522,7 +5524,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, | (class_bits & BIT_UPPER && ISUPPER (c)) | (class_bits & BIT_WORD && ISWORD (c)) | (class_bits & BIT_ALPHA && ISALPHA (c)) - | (class_bits & BIT_ALNUM && ISALNUM (c))) + | (class_bits & BIT_ALNUM && ISALNUM (c)) + | (class_bits & BIT_GRAPH && ISGRAPH (c)) + | (class_bits & BIT_PRINT && ISPRINT (c))) not = !not; else CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); -- 2.39.2