analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
{
int j, k;
+ int nbits;
bool not;
/* If all elements for base leading-codes in fastmap is set, this
each byte is a character. Thus, this works in both
cases. */
fastmap[p[1]] = 1;
- if (! multibyte)
+ if (multibyte)
+ {
+ /* Cover the case of matching a raw char in a
+ multibyte regexp against unibyte. */
+ if (CHAR_BYTE8_HEAD_P (p[1]))
+ fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
+ }
+ else
{
/* For the case of matching this unibyte regex
against multibyte, we must set a leading code of
case charset:
if (!fastmap) break;
not = (re_opcode_t) *(p - 1) == charset_not;
- for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
- j >= 0; j--)
+ nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
+ p++;
+ for (j = 0; j < nbits; j++)
if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
fastmap[j] = 1;
+ /* To match raw bytes (in the 80..ff range) against multibyte
+ strings, add their leading bytes to the fastmap. */
+ for (j = 0x80; j < nbits; j++)
+ if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
+ fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
+
if (/* Any leading code can possibly start a character
which doesn't match the specified set of characters. */
not
}
p += pat_charlen;
d++;
+ mcnt -= pat_charlen;
}
- while (--mcnt);
+ while (mcnt > 0);
break;
(should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x)))
(should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp))
+(ert-deftest regexp-unibyte-unibyte ()
+ "Test matching a unibyte regexp against a unibyte string."
+ ;; Sanity check: the kinds of literal used below (pure ASCII, and a
+ ;; \x escape in the 80..ff range) must not be multibyte strings,
+ ;; otherwise this test would not exercise the unibyte/unibyte case.
+ (should-not (multibyte-string-p "ab"))
+ (should-not (multibyte-string-p "\xff"))
+ ;; ASCII: a literal character followed by a one-element character
+ ;; alternative.
+ (should (string-match "a[b]" "ab"))
+ ;; Raw byte as a literal: it matches the same single byte, and does
+ ;; not match the two-byte string \xc1\xb1.
+ ;; NOTE(review): \xc1\xb1 is presumably the internal multibyte
+ ;; encoding of raw byte \xf1 -- confirm.
+ (should (string-match "\xf1" "\xf1"))
+ (should-not (string-match "\xf1" "\xc1\xb1"))
+ ;; Raw byte inside a character alternative: same expectations as
+ ;; for the literal form above.
+ (should (string-match "[\xf1]" "\xf1"))
+ (should-not (string-match "[\xf1]" "\xc1\xb1"))
+ ;; Raw range: a byte inside the range matches; ASCII and the bytes
+ ;; immediately below (\x81) and above (\xd4) the bounds do not.
+ (should (string-match "[\x82-\xd3]" "\xbb"))
+ (should-not (string-match "[\x82-\xd3]" "a"))
+ (should-not (string-match "[\x82-\xd3]" "\x81"))
+ (should-not (string-match "[\x82-\xd3]" "\xd4"))
+ ;; ASCII-raw range: the range starts at an ASCII character and ends
+ ;; at a raw byte; both an ASCII member and a raw member match, and
+ ;; the neighbours just outside each bound ("e", \xd4) do not.
+ (should (string-match "[f-\xd3]" "q"))
+ (should (string-match "[f-\xd3]" "\xbb"))
+ (should-not (string-match "[f-\xd3]" "e"))
+ (should-not (string-match "[f-\xd3]" "\xd4")))
+
+(ert-deftest regexp-multibyte-multibyte ()
+ "Test matching a multibyte regexp against a multibyte string."
+ ;; Sanity check: a literal containing non-ASCII characters must be a
+ ;; multibyte string, since several cases below rely on such literals
+ ;; without wrapping them in `string-to-multibyte'.
+ (should (multibyte-string-p "åü"))
+ ;; ASCII: both pattern and subject are converted explicitly with
+ ;; `string-to-multibyte'.
+ (should (string-match (string-to-multibyte "a[b]")
+ (string-to-multibyte "ab")))
+ ;; Unicode: non-ASCII characters match themselves, and "ü" does not
+ ;; match the converted byte pair \xc3\xbc.
+ ;; NOTE(review): \xc3\xbc is presumably the UTF-8 encoding of "ü" --
+ ;; confirm.
+ (should (string-match "å[ü]z" "åüz"))
+ (should-not (string-match "ü" (string-to-multibyte "\xc3\xbc")))
+ ;; Raw byte as a literal: \xf1 matches itself only; it and the byte
+ ;; pair \xc1\xb1 do not match each other in either direction.
+ ;; NOTE(review): \xc1\xb1 is presumably the internal multibyte
+ ;; encoding of raw byte \xf1 -- confirm.
+ (should (string-match (string-to-multibyte "\xf1")
+ (string-to-multibyte "\xf1")))
+ (should-not (string-match (string-to-multibyte "\xf1")
+ (string-to-multibyte "\xc1\xb1")))
+ (should-not (string-match (string-to-multibyte "\xc1\xb1")
+ (string-to-multibyte "\xf1")))
+ ;; Raw byte inside a character alternative.
+ (should (string-match (string-to-multibyte "[\xf1]")
+ (string-to-multibyte "\xf1")))
+ ;; Raw range: a raw byte inside the range matches; ASCII, non-ASCII
+ ;; characters, and the bytes just outside the bounds do not.
+ ;; NOTE(review): the subjects "a", "\x81" and "\xd4" below are not
+ ;; wrapped in `string-to-multibyte', unlike the positive case --
+ ;; confirm whether multibyte subjects were intended.
+ (should (string-match (string-to-multibyte "[\x82-\xd3]")
+ (string-to-multibyte "\xbb")))
+ (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "a"))
+ (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "Å"))
+ (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "ü"))
+ (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\x81"))
+ (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\xd4"))
+ ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+ ;; NOTE(review): the subjects "e" and "\xd4" below are not wrapped
+ ;; in `string-to-multibyte' -- confirm whether that is intended.
+ (should (string-match (string-to-multibyte "[f-\xd3]")
+ (string-to-multibyte "q")))
+ (should (string-match (string-to-multibyte "[f-\xd3]")
+ (string-to-multibyte "\xbb")))
+ (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
+ (should-not (string-match (string-to-multibyte "[f-\xd3]") "Å"))
+ (should-not (string-match (string-to-multibyte "[f-\xd3]") "ü"))
+ (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
+ ;; Unicode-raw range: should be empty, so neither endpoint nor
+ ;; anything in between may match.
+ (should-not (string-match "[å-\xd3]" "å"))
+ (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xd3")))
+ (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xbb")))
+ (should-not (string-match "[å-\xd3]" "ü"))
+ ;; No equivalence between raw bytes and latin-1: "å" and the byte
+ ;; \xe5 must not match each other, as literal or as alternative.
+ (should-not (string-match "å" (string-to-multibyte "\xe5")))
+ (should-not (string-match "[å]" (string-to-multibyte "\xe5")))
+ (should-not (string-match "\xe5" "å"))
+ (should-not (string-match "[\xe5]" "å")))
+
+(ert-deftest regexp-unibyte-multibyte ()
+ "Test matching a unibyte regexp against a multibyte string."
+ ;; Patterns here are plain literals built from ASCII and \x escapes;
+ ;; subjects are either converted with `string-to-multibyte' or are
+ ;; literals containing non-ASCII characters.
+ ;; ASCII
+ (should (string-match "a[b]" (string-to-multibyte "ab")))
+ ;; Unicode: "." and a negated alternative each consume one non-ASCII
+ ;; character of the subject.
+ (should (string-match "a.[^b]c" (string-to-multibyte "aåüc")))
+ ;; Raw byte as a literal: \xf1 matches the converted raw byte, while
+ ;; the two-byte pattern \xc1\xb1 does not.
+ ;; NOTE(review): \xc1\xb1 is presumably the internal multibyte
+ ;; encoding of raw byte \xf1 -- confirm.
+ (should (string-match "\xf1" (string-to-multibyte "\xf1")))
+ (should-not (string-match "\xc1\xb1" (string-to-multibyte "\xf1")))
+ ;; Raw byte inside a character alternative: same expectations, with
+ ;; each byte of the pair in its own alternative.
+ (should (string-match "[\xf1]" (string-to-multibyte "\xf1")))
+ (should-not (string-match "[\xc1][\xb1]" (string-to-multibyte "\xf1")))
+ ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+ ;; NOTE(review): the subjects "e" and "\xd4" below are not wrapped
+ ;; in `string-to-multibyte', unlike the positive cases -- confirm
+ ;; whether multibyte subjects were intended.
+ (should (string-match "[f-\xd3]" (string-to-multibyte "q")))
+ (should (string-match "[f-\xd3]" (string-to-multibyte "\xbb")))
+ (should-not (string-match "[f-\xd3]" "e"))
+ (should-not (string-match "[f-\xd3]" "Å"))
+ (should-not (string-match "[f-\xd3]" "ü"))
+ (should-not (string-match "[f-\xd3]" "\xd4"))
+ ;; No equivalence between raw bytes and latin-1: the byte \xe5 must
+ ;; not match the character "å", as literal or as alternative.
+ (should-not (string-match "\xe5" "å"))
+ (should-not (string-match "[\xe5]" "å")))
+
+(ert-deftest regexp-multibyte-unibyte ()
+ "Test matching a multibyte regexp against a unibyte string."
+ ;; Patterns are either converted with `string-to-multibyte' or are
+ ;; literals containing non-ASCII characters; subjects are plain
+ ;; literals built from ASCII and \x escapes.
+ ;; NOTE(review): this presumes that a literal containing a non-ASCII
+ ;; character reads as a multibyte string -- confirm.
+ ;; ASCII
+ (should (string-match (string-to-multibyte "a[b]") "ab"))
+ ;; Unicode: a negated non-ASCII alternative accepts an ASCII
+ ;; character, and "ü" does not match the byte pair \xc3\xbc.
+ ;; NOTE(review): \xc3\xbc is presumably the UTF-8 encoding of "ü" --
+ ;; confirm.
+ (should (string-match "a[^ü]c" "abc"))
+ (should-not (string-match "ü" "\xc3\xbc"))
+ ;; Raw byte as a literal: the converted \xf1 matches the single byte
+ ;; \xf1 and does not match the two-byte string \xc1\xb1.
+ ;; NOTE(review): \xc1\xb1 is presumably the internal multibyte
+ ;; encoding of raw byte \xf1 -- confirm.
+ (should (string-match (string-to-multibyte "\xf1") "\xf1"))
+ (should-not (string-match (string-to-multibyte "\xf1") "\xc1\xb1"))
+ ;; Raw byte inside a character alternative: same expectations as
+ ;; for the literal form above.
+ (should (string-match (string-to-multibyte "[\xf1]") "\xf1"))
+ (should-not (string-match (string-to-multibyte "[\xf1]") "\xc1\xb1"))
+ ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+ (should (string-match (string-to-multibyte "[f-\xd3]") "q"))
+ (should (string-match (string-to-multibyte "[f-\xd3]") "\xbb"))
+ (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
+ (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
+ ;; Unicode-raw range: should be empty, so neither the raw endpoint
+ ;; nor a byte below it may match.
+ (should-not (string-match "[å-\xd3]" "\xd3"))
+ (should-not (string-match "[å-\xd3]" "\xbb"))
+ ;; No equivalence between raw bytes and latin-1: the character "å"
+ ;; must not match the byte \xe5, as literal or as alternative.
+ (should-not (string-match "å" "\xe5"))
+ (should-not (string-match "[å]" "\xe5")))
+
;;; regex-emacs-tests.el ends here