Make [:print:] support non-ASCII characters correctly

author Eli Zaretskii <eliz@gnu.org>

Tue, 14 Apr 2015 15:47:04 +0000 (18:47 +0300)

committer Eli Zaretskii <eliz@gnu.org>

Tue, 14 Apr 2015 15:47:04 +0000 (18:47 +0300)
author Eli Zaretskii <eliz@gnu.org>
Tue, 14 Apr 2015 15:47:04 +0000 (18:47 +0300)
committer Eli Zaretskii <eliz@gnu.org>
Tue, 14 Apr 2015 15:47:04 +0000 (18:47 +0300)
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi

index 87513e8f9ce252bc2effd5ef03694f7c4345eb72..238d814a9dcccb7797993daa8ec9216e0ff8f8e9 100644 (file)
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -569,8 +569,11 @@ This matches any multibyte character (@pxref{Text Representations}).
  @item [:nonascii:]
  This matches any non-@acronym{ASCII} character.
  @item [:print:]
-This matches printing characters---everything except @acronym{ASCII} control
-characters and the delete character.
+This matches printing characters---everything except @acronym{ASCII}
+and non-@acronym{ASCII} control characters (including the delete
+character), surrogates, and codepoints unassigned by Unicode, as
+indicated by the Unicode @samp{general-category} property
+(@pxref{Character Properties}).
  @item [:punct:]
  This matches any punctuation character.  (At present, for multibyte
  characters, it matches anything that has non-word syntax.)
diff --git a/etc/NEWS b/etc/NEWS

index 6d8b4c6faf8bcd21f880d8c4041bbd4dfda8d815..907787a1f3e43602fabf5feadaf47890dafcb800 100644 (file)
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -628,6 +628,14 @@ notifications, if Emacs is compiled with file notification support.
  ---
  *** gulp.el
  
++++
+** The character class [:print:] in regular expressions
+no longer matches every multibyte character.  Instead, Emacs now
+consults the Unicode character properties to determine which
+characters are printable.  In particular, surrogates and unassigned
+codepoints are now rejected by this class.  If you want the old
+behavior, use [:multibyte:] instead.
+
  \f
  * New Modes and Packages in Emacs 25.1
  
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el

index 20af59f2abfdc83021c2b22ec034dc8f947f03d7..a5a228e58765464608cd92028aeb943be4ec3ef2 100644 (file)
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -969,16 +969,16 @@ CHAR
       space, and DEL.
  
  `printing', `print'
-     matches printing characters--everything except ASCII control chars
-     and DEL.
+     matches printing characters--everything except ASCII and non-ASCII
+     control characters, surrogates, and codepoints unassigned by Unicode.
  
  `alphanumeric', `alnum'
-     matches letters and digits.  (But at present, for multibyte characters,
-     it matches anything that has word syntax.)
+     matches alphabetic characters and digits.  (For multibyte characters,
+     it matches according to Unicode character properties.)
  
  `letter', `alphabetic', `alpha'
-     matches letters.  (But at present, for multibyte characters,
-     it matches anything that has word syntax.)
+     matches alphabetic characters.  (For multibyte characters,
+     it matches according to Unicode character properties.)
  
  `ascii'
       matches ASCII (unibyte) characters.
diff --git a/src/character.c b/src/character.c

index ad78f512f43b57355143ede329bcbd4c808a0686..b357dd5a334d1a63a860396680d73b191b350b61 100644 (file)
--- a/src/character.c
+++ b/src/character.c
@@ -1022,6 +1022,22 @@ decimalnump (int c)
    return gen_cat == UNICODE_CATEGORY_Nd;
  }
  
+/* Return true if C is printable per its Unicode general category,
+   i.e. C is not a control, surrogate, or unassigned codepoint.  */
+bool
+printablep (int c)
+{
+  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+  if (! INTEGERP (category))
+    return false;  /* Table entry is not an integer: not printable.  */
+  EMACS_INT gen_cat = XINT (category);
+
+  /* See UTS #18 (Unicode Regular Expressions).  */
+  return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
+           || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
+           || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
+}
+
  void
  syms_of_character (void)
  {
diff --git a/src/character.h b/src/character.h

index 7d902952db6c3450e54adb6e6405f2b9fbecfab0..1a5d2c8a670c512508975da953b3089749c6bb00 100644 (file)
--- a/src/character.h
+++ b/src/character.h
@@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object);
  
  extern bool alphabeticp (int);
  extern bool decimalnump (int);
+extern bool printablep (int);
  
  /* Return a translation table of id number ID.  */
  #define GET_TRANSLATION_TABLE(id) \
diff --git a/src/regex.c b/src/regex.c

index 1afc5037594096ad0b1b8cdcda7158fb1dc48afe..b9d09d02c22cf111b256ed1c998d9dc3a630c289 100644 (file)
--- a/src/regex.c
+++ b/src/regex.c
@@ -318,7 +318,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
  
  # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)                            \
                     ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)       \
-                   : 1)
+                    : printablep (c))
  
  # define ISALNUM(c) (IS_REAL_ASCII (c)                 \
                     ? (((c) >= 'a' && (c) <= 'z')       \
@@ -1865,7 +1865,8 @@ struct range_table_work_area
  #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
  
  /* Bits used to implement the multibyte-part of the various character classes
-   such as [:alnum:] in a charset's range table.  */
+   such as [:alnum:] in a charset's range table.  The code currently assumes
+   that only the low 16 bits are used.  */
  #define BIT_WORD       0x1
  #define BIT_LOWER      0x2
  #define BIT_PUNCT      0x4
@@ -1874,6 +1875,7 @@ struct range_table_work_area
  #define BIT_MULTIBYTE  0x20
  #define BIT_ALPHA      0x40
  #define BIT_ALNUM      0x80
+#define BIT_PRINT      0x100
  \f
  
  /* Set the bit for character C in a list.  */
@@ -2072,7 +2074,7 @@ re_wctype_to_bit (re_wctype_t cc)
  {
    switch (cc)
      {
-    case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
+    case RECC_NONASCII: case RECC_GRAPH:
      case RECC_MULTIBYTE: return BIT_MULTIBYTE;
      case RECC_ALPHA: return BIT_ALPHA;
      case RECC_ALNUM: return BIT_ALNUM;
@@ -2081,6 +2083,7 @@ re_wctype_to_bit (re_wctype_t cc)
      case RECC_UPPER: return BIT_UPPER;
      case RECC_PUNCT: return BIT_PUNCT;
      case RECC_SPACE: return BIT_SPACE;
+    case RECC_PRINT: return BIT_PRINT;
      case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
      case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
      default:
author	Eli Zaretskii <eliz@gnu.org>
	Tue, 14 Apr 2015 15:47:04 +0000 (18:47 +0300)
committer	Eli Zaretskii <eliz@gnu.org>
	Tue, 14 Apr 2015 15:47:04 +0000 (18:47 +0300)
doc/lispref/searching.texi		patch \| blob \| history
etc/NEWS		patch \| blob \| history
lisp/emacs-lisp/rx.el		patch \| blob \| history
src/character.c		patch \| blob \| history
src/character.h		patch \| blob \| history
src/regex.c		patch \| blob \| history