From: Mattias Engdegård Date: Thu, 6 Oct 2022 15:46:02 +0000 (+0200) Subject: Speed up string-lessp for multibyte strings X-Git-Tag: emacs-29.0.90~1616^2~717 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=def6fa4246502befa174aa6409166b0967621f7b;p=emacs.git Speed up string-lessp for multibyte strings Improve comparison speed when both arguments are multibyte strings, at least one of them containing a non-ASCII character. (All-ASCII multibyte strings are already fast.) The speed-up is about 2× for strings of 10 chars, 10× for strings of 100 chars. * src/fns.c (Fstring_lessp): Quickly skip the common prefix by comparing words. --- diff --git a/src/fns.c b/src/fns.c index 22e66d3653d..bc4915eb25b 100644 --- a/src/fns.c +++ b/src/fns.c @@ -454,23 +454,50 @@ Symbols are also allowed; their print names are used instead. */) && (!STRING_MULTIBYTE (string2) || SCHARS (string2) == SBYTES (string2))) { /* Each argument is either unibyte or all-ASCII multibyte: - we can compare bytewise. - (Arbitrary multibyte strings cannot be compared bytewise because - that would give a different order for raw bytes 80..FF.) */ + we can compare bytewise. */ int d = memcmp (SSDATA (string1), SSDATA (string2), n); return d < 0 || (d == 0 && n < SCHARS (string2)) ? Qt : Qnil; } else if (STRING_MULTIBYTE (string1) && STRING_MULTIBYTE (string2)) { - ptrdiff_t i1 = 0, i1_byte = 0, i2 = 0, i2_byte = 0; - while (i1 < n) - { - int c1 = fetch_string_char_advance_no_check (string1, &i1, &i1_byte); - int c2 = fetch_string_char_advance_no_check (string2, &i2, &i2_byte); - if (c1 != c2) - return c1 < c2 ? Qt : Qnil; - } - return i1 < SCHARS (string2) ? Qt : Qnil; + /* Two arbitrary multibyte strings: we cannot use memcmp because + the encoding for raw bytes would sort those between U+007F and U+0080 + which isn't where we want them. + Instead, we skip the longest common prefix and look at + what follows. */ + ptrdiff_t nb1 = SBYTES (string1); + ptrdiff_t nb2 = SBYTES (string2); + ptrdiff_t nb = min (nb1, nb2); + + /* First compare entire machine words. (String data is allocated + with word alignment.) */ + typedef size_t word_t; + int ws = sizeof (word_t); + const word_t *w1 = (const word_t *) SDATA (string1); + const word_t *w2 = (const word_t *) SDATA (string2); + ptrdiff_t b = 0; + while (b < nb - ws + 1 && w1[b / ws] == w2[b / ws]) + b += ws; + + /* Scan forward to the differing byte (at most ws-1 bytes). */ + while (b < nb && SREF (string1, b) == SREF (string2, b)) + b++; + + if (b >= nb) + /* One string is a prefix of the other. */ + return b < nb2 ? Qt : Qnil; + + /* Now back up to the start of the differing characters: + it's the last byte not having the bit pattern 10xxxxxx. */ + while ((SREF (string1, b) & 0xc0) == 0x80) + b--; + + /* Compare the differing characters. */ + ptrdiff_t i1 = 0, i2 = 0; + ptrdiff_t i1_byte = b, i2_byte = b; + int c1 = fetch_string_char_advance_no_check (string1, &i1, &i1_byte); + int c2 = fetch_string_char_advance_no_check (string2, &i2, &i2_byte); + return c1 < c2 ? Qt : Qnil; } else if (STRING_MULTIBYTE (string1)) {