From: Lars Ingebrigtsen Date: Sun, 27 Sep 2020 00:01:03 +0000 (+0200) Subject: Fix searching for multibyte needles in unibyte haystacks X-Git-Tag: emacs-28.0.90~5858 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=8c569683f2ee5d14040f5605fd0570b2eb009c05;p=emacs.git Fix searching for multibyte needles in unibyte haystacks * src/fns.c (Fstring_search): Make this work better when searching unibyte haystacks for multibyte needles (bug#43598). --- diff --git a/src/fns.c b/src/fns.c index 2fcc282dcb3..0f768711544 100644 --- a/src/fns.c +++ b/src/fns.c @@ -5454,6 +5454,21 @@ It should not be used for anything security-related. See return make_digest_string (digest, SHA1_DIGEST_SIZE); } +static bool +string_ascii_p (Lisp_Object string) +{ + if (STRING_MULTIBYTE (string)) + return SBYTES (string) == SCHARS (string); + else + { + ptrdiff_t nbytes = SBYTES (string); + for (ptrdiff_t i = 0; i < nbytes; i++) + if (SREF (string, i) > 127) + return false; + return true; + } +} + DEFUN ("string-search", Fstring_search, Sstring_search, 2, 3, 0, doc: /* Search for the string NEEDLE in the string HAYSTACK. The return value is the position of the first occurrence of NEEDLE in @@ -5490,7 +5505,9 @@ Case is always significant and text properties are ignored. */) haystart = SSDATA (haystack) + start_byte; haybytes = SBYTES (haystack) - start_byte; - if (STRING_MULTIBYTE (haystack) == STRING_MULTIBYTE (needle)) + if (STRING_MULTIBYTE (haystack) == STRING_MULTIBYTE (needle) + || string_ascii_p (needle) + || string_ascii_p (haystack)) res = memmem (haystart, haybytes, SSDATA (needle), SBYTES (needle)); else if (STRING_MULTIBYTE (haystack)) /* unibyte needle */ @@ -5501,9 +5518,29 @@ Case is always significant and text properties are ignored. */) } else /* unibyte haystack, multibyte needle */ { - Lisp_Object uni_needle = Fstring_as_unibyte (needle); - res = memmem (haystart, haybytes, - SSDATA (uni_needle), SBYTES (uni_needle)); + /* The only possible way we can find the multibyte needle in the + unibyte stack (since we know that neither are pure-ASCII) is + if they contain "raw bytes" (and no other non-ASCII chars.) */ + ptrdiff_t chars = SCHARS (needle); + const unsigned char *src = SDATA (needle); + + for (ptrdiff_t i = 0; i < chars; i++) + { + int c = string_char_advance (&src); + + if (!CHAR_BYTE8_P (c) + && !ASCII_CHAR_P (c)) + /* Found a char that can't be in the haystack. */ + return Qnil; + } + + { + /* "Raw bytes" (aka eighth-bit) are represented differently in + multibyte and unibyte strings. */ + Lisp_Object uni_needle = Fstring_to_unibyte (needle); + res = memmem (haystart, haybytes, + SSDATA (uni_needle), SBYTES (uni_needle)); + } } if (! res) diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el index f2e1a268b08..41969f2af2c 100644 --- a/test/src/fns-tests.el +++ b/test/src/fns-tests.el @@ -953,4 +953,8 @@ (should (equal (string-search "" "abc" 3) 3)) (should-error (string-search "" "abc" 4)) (should-error (string-search "" "abc" -1)) - ) + + (should-not (string-search "ø" "foo\303\270")) + (should (equal (string-search (string-to-multibyte "o\303\270") "foo\303\270") + 2)) + (should (equal (string-search "\303\270" "foo\303\270") 3)))