and end. */
#define CHARSET_RANGE_TABLE_END(range_table, count) \
((range_table) + (count) * 2 * 3)
-
-/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
- COUNT is number of ranges in RANGE_TABLE. */
-#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
- do \
- { \
- re_wchar_t range_start, range_end; \
- re_char *rtp; \
- re_char *range_table_end \
- = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
- \
- for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
- { \
- EXTRACT_CHARACTER (range_start, rtp); \
- EXTRACT_CHARACTER (range_end, rtp + 3); \
- \
- if (range_start <= (c) && (c) <= range_end) \
- { \
- (not) = !(not); \
- break; \
- } \
- } \
- } \
- while (0)
-
-/* Test if C is in range table of CHARSET. The flag NOT is negated if
- C is listed in it. */
-#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
- do \
- { \
- /* Number of ranges in range table. */ \
- int count; \
- re_char *range_table = CHARSET_RANGE_TABLE (charset); \
- \
- EXTRACT_NUMBER_AND_INCR (count, range_table); \
- CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
- } \
- while (0)
\f
/* If DEBUG is defined, Regex prints many voluminous messages about what
it is doing (if the variable `debug' is nonzero). If linked with the
return p;
}
+/* Test if C matches charset op. *PP points to the charset or chraset_not
+ opcode. When the function finishes, *PP will be advanced past that opcode.
+ C is character to test (possibly after translations) and CORIG is original
+ character (i.e. without any translations). UNIBYTE denotes whether c is
+ unibyte or multibyte character. */
+static bool
+execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
+{
+ re_char *p = *pp, *rtp = NULL;
+ bool not = (re_opcode_t) *p == charset_not;
+
+ if (CHARSET_RANGE_TABLE_EXISTS_P (p))
+ {
+ int count;
+ rtp = CHARSET_RANGE_TABLE (p);
+ EXTRACT_NUMBER_AND_INCR (count, rtp);
+ *pp = CHARSET_RANGE_TABLE_END ((rtp), (count));
+ }
+ else
+ *pp += 2 + CHARSET_BITMAP_SIZE (p);
+
+ if (unibyte && c < (1 << BYTEWIDTH))
+ { /* Lookup bitmap. */
+ /* Cast to `unsigned' instead of `unsigned char' in
+ case the bit list is a full 32 bytes long. */
+ if (c < (unsigned) (CHARSET_BITMAP_SIZE (p) * BYTEWIDTH)
+ && p[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
+ return !not;
+ }
+#ifdef emacs
+ else if (rtp)
+ {
+ int class_bits = CHARSET_RANGE_TABLE_BITS (p);
+ re_wchar_t range_start, range_end;
+
+ /* Sort tests by the most commonly used classes with some adjustment to which
+ tests are easiest to perform. Frequencies of character class names used in
+ Emacs sources as of 2016-07-15:
+
+ $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + |
+ sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr
+ 213 [:alnum:]
+ 104 [:alpha:]
+ 62 [:space:]
+ 39 [:digit:]
+ 36 [:blank:]
+ 26 [:upper:]
+ 24 [:word:]
+ 21 [:lower:]
+ 10 [:punct:]
+ 10 [:ascii:]
+ 9 [:xdigit:]
+ 4 [:nonascii:]
+ 4 [:graph:]
+ 2 [:print:]
+ 2 [:cntrl:]
+ 1 [:ff:]
+ */
+
+ if ((class_bits & BIT_MULTIBYTE) ||
+ (class_bits & BIT_ALNUM && ISALNUM (c)) ||
+ (class_bits & BIT_ALPHA && ISALPHA (c)) ||
+ (class_bits & BIT_SPACE && ISSPACE (c)) ||
+ (class_bits & BIT_WORD && ISWORD (c)) ||
+ ((class_bits & BIT_UPPER) &&
+ (ISUPPER (c) || (corig != c &&
+ c == downcase (corig) && ISLOWER (c)))) ||
+ ((class_bits & BIT_LOWER) &&
+ (ISLOWER (c) || (corig != c &&
+ c == upcase (corig) && ISUPPER(c)))) ||
+ (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
+ (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
+ (class_bits & BIT_PRINT && ISPRINT (c)))
+ return !not;
+
+ for (p = *pp; rtp < p; rtp += 2 * 3)
+ {
+ EXTRACT_CHARACTER (range_start, rtp);
+ EXTRACT_CHARACTER (range_end, rtp + 3);
+ if (range_start <= c && c <= range_end)
+ return !not;
+ }
+ }
+#endif /* emacs */
+ return not;
+}
+
/* Non-zero if "p1 matches something" implies "p2 fails". */
static int
mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
else if ((re_opcode_t) *p1 == charset
|| (re_opcode_t) *p1 == charset_not)
{
- int not = (re_opcode_t) *p1 == charset_not;
-
- /* Test if C is listed in charset (or charset_not)
- at `p1'. */
- if (! multibyte || IS_REAL_ASCII (c))
- {
- if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
- && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
- not = !not;
- }
- else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
- CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
-
- /* `not' is equal to 1 if c would match, which means
- that we can't change to pop_failure_jump. */
- if (!not)
+ if (!execute_charset (&p1, c, c, !multibyte))
{
DEBUG_PRINT (" No match => fast loop.\n");
return 1;
case charset_not:
{
register unsigned int c, corig;
- boolean not = (re_opcode_t) *(p - 1) == charset_not;
int len;
- /* Start of actual range_table, or end of bitmap if there is no
- range table. */
- re_char *range_table UNINIT;
-
- /* Nonzero if there is a range table. */
- int range_table_exists;
-
- /* Number of ranges of range table. This is not included
- in the initial byte-length of the command. */
- int count = 0;
-
/* Whether matching against a unibyte character. */
boolean unibyte_char = false;
- DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : "");
-
- range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
-
- if (range_table_exists)
- {
- range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
- EXTRACT_NUMBER_AND_INCR (count, range_table);
- }
+ DEBUG_PRINT ("EXECUTING charset%s.\n",
+ (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
PREFETCH ();
corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
unibyte_char = true;
}
- if (unibyte_char && c < (1 << BYTEWIDTH))
- { /* Lookup bitmap. */
- /* Cast to `unsigned' instead of `unsigned char' in
- case the bit list is a full 32 bytes long. */
- if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
- && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
- not = !not;
- }
-#ifdef emacs
- else if (range_table_exists)
- {
- int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
-
- if ( (class_bits & BIT_LOWER
- && (ISLOWER (c)
- || (corig != c
- && c == upcase (corig) && ISUPPER(c))))
- | (class_bits & BIT_MULTIBYTE)
- | (class_bits & BIT_PUNCT && ISPUNCT (c))
- | (class_bits & BIT_SPACE && ISSPACE (c))
- | (class_bits & BIT_UPPER
- && (ISUPPER (c)
- || (corig != c
- && c == downcase (corig) && ISLOWER (c))))
- | (class_bits & BIT_WORD && ISWORD (c))
- | (class_bits & BIT_ALPHA && ISALPHA (c))
- | (class_bits & BIT_ALNUM && ISALNUM (c))
- | (class_bits & BIT_GRAPH && ISGRAPH (c))
- | (class_bits & BIT_PRINT && ISPRINT (c)))
- not = !not;
- else
- CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
- }
-#endif /* emacs */
-
- if (range_table_exists)
- p = CHARSET_RANGE_TABLE_END (range_table, count);
- else
- p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
-
- if (!not) goto fail;
+ p -= 1;
+ if (!execute_charset (&p, c, corig, unibyte_char))
+ goto fail;
d += len;
}
--- /dev/null
+;;; regex-tests.el --- tests for regex.c functions -*- lexical-binding: t -*-
+
+;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
+
+;;; Code:
+
+(require 'ert)
+
+(ert-deftest regex-word-cc-fallback-test ()
+ "Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020).
+
+Test that a regex of the form \"[[:cc:]]*x\" where CC is
+a character class which matches a multibyte character X, matches
+string \"x\".
+
+For example, ‘[[:word:]]*\u2620’ regex (note: \u2620 is a word
+character) must match a string \"\u2420\"."
+ (dolist (class '("[[:word:]]" "\\sw"))
+ (dolist (repeat '("*" "+"))
+ (dolist (suffix '("" "b" "bar" "\u2620"))
+ (dolist (string '("" "foo"))
+ (when (not (and (string-equal repeat "+")
+ (string-equal string "")))
+ (should (string-match (concat "^" class repeat suffix "$")
+ (concat string suffix)))))))))
+
+(defun regex--test-cc (name matching not-matching)
+ (should (string-match-p (concat "^[[:" name ":]]*$") matching))
+ (should (string-match-p (concat "^[[:" name ":]]*?\u2622$")
+ (concat matching "\u2622")))
+ (should (string-match-p (concat "^[^[:" name ":]]*$") not-matching))
+ (should (string-match-p (concat "^[^[:" name ":]]*\u2622$")
+ (concat not-matching "\u2622")))
+ (with-temp-buffer
+ (insert matching)
+ (let ((p (point)))
+ (insert not-matching)
+ (goto-char (point-min))
+ (skip-chars-forward (concat "[:" name ":]"))
+ (should (equal (point) p))
+ (skip-chars-forward (concat "^[:" name ":]"))
+ (should (equal (point) (point-max)))
+ (goto-char (point-min))
+ (skip-chars-forward (concat "[:" name ":]\u2622"))
+ (should (or (equal (point) p) (equal (point) (1+ p)))))))
+
+(ert-deftest regex-character-classes ()
+ "Perform sanity test of regexes using character classes.
+
+Go over all the supported character classes and test whether the
+classes and their inversions match what they are supposed to
+match. The test is done using `string-match-p' as well as
+`skip-chars-forward'."
+ (let (case-fold-search)
+ (regex--test-cc "alnum" "abcABC012łąka" "-, \t\n")
+ (regex--test-cc "alpha" "abcABCłąka" "-,012 \t\n")
+ (regex--test-cc "digit" "012" "abcABCłąka-, \t\n")
+ (regex--test-cc "xdigit" "0123aBc" "łąk-, \t\n")
+ (regex--test-cc "upper" "ABCŁĄKA" "abc012-, \t\n")
+ (regex--test-cc "lower" "abcłąka" "ABC012-, \t\n")
+
+ (regex--test-cc "word" "abcABC012\u2620" "-, \t\n")
+
+ (regex--test-cc "punct" ".,-" "abcABC012\u2620 \t\n")
+ (regex--test-cc "cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ")
+ (regex--test-cc "graph" "abcłąka\u2620-," " \t\n\1")
+ (regex--test-cc "print" "abcłąka\u2620-, " "\t\n\1")
+
+ (regex--test-cc "space" " \t\n\u2001" "abcABCł0123")
+ (regex--test-cc "blank" " \t" "\n\u2001")
+
+ (regex--test-cc "ascii" "abcABC012 \t\n\1" "łą\u2620")
+ (regex--test-cc "nonascii" "łą\u2622" "abcABC012 \t\n\1")
+ (regex--test-cc "unibyte" "abcABC012 \t\n\1" "łą\u2622")
+ (regex--test-cc "multibyte" "łą\u2622" "abcABC012 \t\n\1")))
+
+;;; regex-tests.el ends here