From: Mattias Engdegård Date: Sun, 27 Sep 2020 12:28:07 +0000 (+0200) Subject: Minor string-search optimisations (bug#43598) X-Git-Tag: emacs-28.0.90~5853 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=8bd233a7eb6bc4709d0adf0577d30aaf167e75bf;p=emacs.git Minor string-search optimisations (bug#43598) * src/fns.c (Fstring_search): Perform cheap all-ASCII checks before more expensive ones. Use a faster loop when searching for non-ASCII non-raw bytes. * test/src/fns-tests.el (string-search): Add more test cases. --- diff --git a/src/fns.c b/src/fns.c index 0f768711544..f626fe11b20 100644 --- a/src/fns.c +++ b/src/fns.c @@ -5457,16 +5457,11 @@ It should not be used for anything security-related. See static bool string_ascii_p (Lisp_Object string) { - if (STRING_MULTIBYTE (string)) - return SBYTES (string) == SCHARS (string); - else - { - ptrdiff_t nbytes = SBYTES (string); - for (ptrdiff_t i = 0; i < nbytes; i++) - if (SREF (string, i) > 127) - return false; - return true; - } + ptrdiff_t nbytes = SBYTES (string); + for (ptrdiff_t i = 0; i < nbytes; i++) + if (SREF (string, i) > 127) + return false; + return true; } DEFUN ("string-search", Fstring_search, Sstring_search, 2, 3, 0, @@ -5505,9 +5500,14 @@ Case is always significant and text properties are ignored. */) haystart = SSDATA (haystack) + start_byte; haybytes = SBYTES (haystack) - start_byte; - if (STRING_MULTIBYTE (haystack) == STRING_MULTIBYTE (needle) - || string_ascii_p (needle) - || string_ascii_p (haystack)) + /* We can do a direct byte-string search if both strings have the + same multibyteness, or if at least one of them consists of ASCII + characters only. */ + if (STRING_MULTIBYTE (haystack) + ? (STRING_MULTIBYTE (needle) + || SCHARS (haystack) == SBYTES (haystack) || string_ascii_p (needle)) + : (!STRING_MULTIBYTE (needle) + || SCHARS (needle) == SBYTES (needle) || string_ascii_p (haystack))) res = memmem (haystart, haybytes, SSDATA (needle), SBYTES (needle)); else if (STRING_MULTIBYTE (haystack)) /* unibyte needle */ @@ -5521,26 +5521,21 @@ Case is always significant and text properties are ignored. */) /* The only possible way we can find the multibyte needle in the unibyte stack (since we know that neither are pure-ASCII) is if they contain "raw bytes" (and no other non-ASCII chars.) */ - ptrdiff_t chars = SCHARS (needle); - const unsigned char *src = SDATA (needle); - - for (ptrdiff_t i = 0; i < chars; i++) - { - int c = string_char_advance (&src); - - if (!CHAR_BYTE8_P (c) - && !ASCII_CHAR_P (c)) - /* Found a char that can't be in the haystack. */ - return Qnil; - } + ptrdiff_t nbytes = SBYTES (needle); + for (ptrdiff_t i = 0; i < nbytes; i++) + { + int c = SREF (needle, i); + if (CHAR_BYTE8_HEAD_P (c)) + i++; /* Skip raw byte. */ + else if (!ASCII_CHAR_P (c)) + return Qnil; /* Found a char that can't be in the haystack. */ + } - { - /* "Raw bytes" (aka eighth-bit) are represented differently in - multibyte and unibyte strings. */ - Lisp_Object uni_needle = Fstring_to_unibyte (needle); - res = memmem (haystart, haybytes, - SSDATA (uni_needle), SBYTES (uni_needle)); - } + /* "Raw bytes" (aka eighth-bit) are represented differently in + multibyte and unibyte strings. */ + Lisp_Object uni_needle = Fstring_to_unibyte (needle); + res = memmem (haystart, haybytes, + SSDATA (uni_needle), SBYTES (uni_needle)); } if (! res) diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el index 41969f2af2c..d3c22f966e6 100644 --- a/test/src/fns-tests.el +++ b/test/src/fns-tests.el @@ -913,6 +913,7 @@ (should (equal (string-search "ab\0" "ab") nil)) (should (equal (string-search "ab" "abababab" 3) 4)) (should (equal (string-search "ab" "ababac" 3) nil)) + (should (equal (string-search "aaa" "aa") nil)) (let ((case-fold-search t)) (should (equal (string-search "ab" "AB") nil))) @@ -936,14 +937,16 @@ (should (equal (string-search (string-to-multibyte "\377") "ab\377c") 2)) (should (equal (string-search "\303" "aøb") nil)) (should (equal (string-search "\270" "aøb") nil)) - ;; This test currently fails, but it shouldn't! - ;;(should (equal (string-search "ø" "\303\270") nil)) + (should (equal (string-search "ø" "\303\270") nil)) + + (should (equal (string-search "a\U00010f98z" "a\U00010f98a\U00010f98z") 2)) (should-error (string-search "a" "abc" -1)) (should-error (string-search "a" "abc" 4)) (should-error (string-search "a" "abc" 100000000000)) (should (equal (string-search "a" "aaa" 3) nil)) + (should (equal (string-search "aa" "aa" 1) nil)) (should (equal (string-search "\0" "") nil)) (should (equal (string-search "" "") 0)) @@ -955,6 +958,21 @@ (should-error (string-search "" "abc" -1)) (should-not (string-search "ø" "foo\303\270")) + (should-not (string-search "\303\270" "ø")) + (should-not (string-search "\370" "ø")) + (should-not (string-search (string-to-multibyte "\370") "ø")) + (should-not (string-search "ø" "\370")) + (should-not (string-search "ø" (string-to-multibyte "\370"))) + (should-not (string-search "\303\270" "\370")) + (should-not (string-search (string-to-multibyte "\303\270") "\370")) + (should-not (string-search "\303\270" (string-to-multibyte "\370"))) + (should-not (string-search (string-to-multibyte "\303\270") + (string-to-multibyte "\370"))) + (should-not (string-search "\370" "\303\270")) + (should-not (string-search (string-to-multibyte "\370") "\303\270")) + (should-not (string-search "\370" (string-to-multibyte "\303\270"))) + (should-not (string-search (string-to-multibyte "\370") + (string-to-multibyte "\303\270"))) (should (equal (string-search (string-to-multibyte "o\303\270") "foo\303\270") 2)) (should (equal (string-search "\303\270" "foo\303\270") 3)))