More string-search optimisations

author Mattias Engdegård <mattiase@acm.org>

Thu, 19 Nov 2020 13:24:24 +0000 (14:24 +0100)

committer Mattias Engdegård <mattiase@acm.org>

Thu, 19 Nov 2020 13:58:48 +0000 (14:58 +0100)
author Mattias Engdegård <mattiase@acm.org>
Thu, 19 Nov 2020 13:24:24 +0000 (14:24 +0100)
committer Mattias Engdegård <mattiase@acm.org>
Thu, 19 Nov 2020 13:58:48 +0000 (14:58 +0100)
diff --git a/src/fns.c b/src/fns.c

index f50bf8ecb77dba6bdac3465ba671854015fd42d6..e4c9acc3163f9c5ced754df94b80c2797aa71ac5 100644 (file)
--- a/src/fns.c
+++ b/src/fns.c
@@ -5502,25 +5502,32 @@ Case is always significant and text properties are ignored. */)
    haybytes = SBYTES (haystack) - start_byte;
  
    /* We can do a direct byte-string search if both strings have the
-     same multibyteness, or if at least one of them consists of ASCII
-     characters only.  */
+     same multibyteness, or if the needle consists of ASCII characters only.  */
    if (STRING_MULTIBYTE (haystack)
        ? (STRING_MULTIBYTE (needle)
           || SCHARS (haystack) == SBYTES (haystack) || string_ascii_p (needle))
        : (!STRING_MULTIBYTE (needle)
-         || SCHARS (needle) == SBYTES (needle) || string_ascii_p (haystack)))
-    res = memmem (haystart, haybytes,
-                 SSDATA (needle), SBYTES (needle));
-  else if (STRING_MULTIBYTE (haystack))  /* unibyte needle */
+         || SCHARS (needle) == SBYTES (needle)))
+    {
+      if (STRING_MULTIBYTE (haystack) && STRING_MULTIBYTE (needle)
+          && SCHARS (haystack) == SBYTES (haystack)
+          && SCHARS (needle) != SBYTES (needle))
+        /* Multibyte non-ASCII needle, multibyte ASCII haystack: impossible.  */
+        return Qnil;
+      else
+        res = memmem (haystart, haybytes,
+                      SSDATA (needle), SBYTES (needle));
+    }
+  else if (STRING_MULTIBYTE (haystack))  /* unibyte non-ASCII needle */
      {
        Lisp_Object multi_needle = string_to_multibyte (needle);
        res = memmem (haystart, haybytes,
                     SSDATA (multi_needle), SBYTES (multi_needle));
      }
-  else                        /* unibyte haystack, multibyte needle */
+  else              /* unibyte haystack, multibyte non-ASCII needle */
      {
        /* The only possible way we can find the multibyte needle in the
-        unibyte stack (since we know that neither are pure-ASCII) is
+        unibyte stack (since we know that the needle is non-ASCII) is
          if they contain "raw bytes" (and no other non-ASCII chars.)  */
        ptrdiff_t nbytes = SBYTES (needle);
        for (ptrdiff_t i = 0; i < nbytes; i++)
diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el

index d3c22f966e6383c2732754a9c17f94cff46053eb..86b8d655d2654450fed167c8f457676300ba5e1d 100644 (file)
--- a/test/src/fns-tests.el
+++ b/test/src/fns-tests.el
@@ -938,6 +938,13 @@
    (should (equal (string-search "\303" "aøb") nil))
    (should (equal (string-search "\270" "aøb") nil))
    (should (equal (string-search "ø" "\303\270") nil))
+  (should (equal (string-search "ø" (make-string 32 ?a)) nil))
+  (should (equal (string-search "ø" (string-to-multibyte (make-string 32 ?a)))
+                 nil))
+  (should (equal (string-search "o" (string-to-multibyte
+                                     (apply #'string
+                                            (number-sequence ?a ?z))))
+                 14))
  
    (should (equal (string-search "a\U00010f98z" "a\U00010f98a\U00010f98z") 2))
author	Mattias Engdegård <mattiase@acm.org>
	Thu, 19 Nov 2020 13:24:24 +0000 (14:24 +0100)
committer	Mattias Engdegård <mattiase@acm.org>
	Thu, 19 Nov 2020 13:58:48 +0000 (14:58 +0100)
src/fns.c		patch | blob | history
test/src/fns-tests.el		patch | blob | history