See Bug#25366.
* src/character.c (blankp): New function for checking Unicode
horizontal whitespace.
* src/regex.c (ISBLANK): Use 'blankp' for non-ASCII horizontal
whitespace.
(BIT_BLANK): New bit for range table.
(re_wctype_to_bit, execute_charset): Use it.
* test/lisp/subr-tests.el (subr-tests--string-match-p--blank): Add
unit test for [:blank:] character class.
* test/src/regex-tests.el (test): Adapt unit test.
* doc/lispref/searching.texi (Char Classes): Document new Unicode
behavior for [:blank:].
(@pxref{Character Properties}) indicates they are alphabetic
characters.
@item [:blank:]
-This matches space and tab only.
+This matches horizontal whitespace, as defined by Annex C of the
+Unicode Technical Standard #18. In particular, it matches spaces,
+tabs, and other characters whose Unicode @samp{general-category}
+property (@pxref{Character Properties}) indicates they are spacing
+separators.
@item [:cntrl:]
This matches any @acronym{ASCII} control character.
@item [:digit:]
now generate less chatter and more-compact diagnostics. The auxiliary
function 'check-declare-errmsg' has been removed.
++++
+** The regular expression character class [:blank:] now matches
+Unicode horizontal whitespace as defined in the Unicode Technical
+Standard #18. If you only want to match space and tab, use [ \t]
+instead.
+
\f
* Lisp Changes in Emacs 26.1
|| gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
}
+/* Return true if C is a horizontal whitespace character, as defined
+ by http://www.unicode.org/reports/tr18/tr18-19.html#blank.
+
+ C is a character code.  Space and tab are accepted directly;
+ any other character counts as blank only when its entry in
+ Vunicode_category_table is the integer UNICODE_CATEGORY_Zs.
+ If that entry is not an integer, the result is false.  */
+bool
+blankp (int c)
+{
+ /* Fast path for ASCII characters that are always assumed to
+ constitute horizontal whitespace.
+ NOTE(review): tab is a control character, so it presumably does
+ not carry category Zs; this test therefore looks necessary for
+ correctness as well as speed -- confirm before removing it.  */
+ if (c == ' ' || c == '\t')
+ return true;
+
+ Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+ if (! INTEGERP (category))
+ return false; /* no integer category recorded for C */
+
+ return XINT (category) == UNICODE_CATEGORY_Zs; /* separator, space */
+}
+
void
syms_of_character (void)
{
extern bool alphanumericp (int);
extern bool graphicp (int);
extern bool printablep (int);
+extern bool blankp (int);
/* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \
|| ((c) >= 'a' && (c) <= 'f') \
|| ((c) >= 'A' && (c) <= 'F'))
-/* This is only used for single-byte characters. */
-# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
-
/* The rest must handle multibyte characters. */
+# define ISBLANK(c) (IS_REAL_ASCII (c) \
+ ? ((c) == ' ' || (c) == '\t') \
+ : blankp (c)) /* non-ASCII: defer to blankp's Unicode category test */
+
# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \
: graphicp (c))
#define BIT_ALNUM 0x80
#define BIT_GRAPH 0x100
#define BIT_PRINT 0x200
+#define BIT_BLANK 0x400
\f
/* Set the bit for character C in a list. */
case RECC_SPACE: return BIT_SPACE;
case RECC_GRAPH: return BIT_GRAPH;
case RECC_PRINT: return BIT_PRINT;
+ case RECC_BLANK: return BIT_BLANK;
case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
- case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
+ case RECC_UNIBYTE: case RECC_ERROR: return 0;
default:
abort ();
}
(class_bits & BIT_ALNUM && ISALNUM (c)) ||
(class_bits & BIT_ALPHA && ISALPHA (c)) ||
(class_bits & BIT_SPACE && ISSPACE (c)) ||
+ (class_bits & BIT_BLANK && ISBLANK (c)) ||
(class_bits & BIT_WORD && ISWORD (c)) ||
((class_bits & BIT_UPPER) &&
(ISUPPER (c) || (corig != c &&
(let ((frame-lists (subr-test--frames-1 'subr-test--frames-2)))
(should (equal (car frame-lists) (cdr frame-lists)))))
+(ert-deftest subr-tests--string-match-p--blank ()
+ "Test that [:blank:] matches horizontal whitespace, cf. Bug#25366."
+ (should (equal (string-match-p "\\`[[:blank:]]\\'" " ") 0)) ; ASCII space
+ (should (equal (string-match-p "\\`[[:blank:]]\\'" "\t") 0)) ; ASCII tab
+ (should-not (string-match-p "\\`[[:blank:]]\\'" "\n")) ; newline is not horizontal whitespace
+ (should-not (string-match-p "\\`[[:blank:]]\\'" "a")) ; ordinary letter
+ (should (equal (string-match-p "\\`[[:blank:]]\\'" "\N{HAIR SPACE}") 0)) ; U+200A, non-ASCII space separator
+ (should (equal (string-match-p "\\`[[:blank:]]\\'" "\u3000") 0)) ; U+3000 IDEOGRAPHIC SPACE
+ (should-not (string-match-p "\\`[[:blank:]]\\'" "\N{LINE SEPARATOR}"))) ; U+2028 is a line separator, not a space separator
+
(provide 'subr-tests)
;;; subr-tests.el ends here
("print" "abcłąka\u2620-, " "\t\n\1")
("space" " \t\n\u2001" "abcABCł0123")
- ("blank" " \t" "\n\u2001")
+ ("blank" " \t\u2001" "\n")
("ascii" "abcABC012 \t\n\1" "łą\u2620")
("nonascii" "łą\u2622" "abcABC012 \t\n\1")