From: Mattias EngdegÄrd Date: Tue, 13 Feb 2024 13:52:39 +0000 (+0100) Subject: String hashing improvements (spread and performance) X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=88bc2ca183c74fedd1291d6a18b75ab7fd7c4125;p=emacs.git String hashing improvements (spread and performance) Fix gaps in hashing coverage in the middle and end of even fairly short strings. E.g., `outline-1`, `outline-2` etc all hashed to the exact same value but with the patch, there are no collisions among the ~160000 symbols in the Emacs tree. This change also improves average hashing speed by using fewer mixing operations. * src/fns.c (hash_string): Use unit stride for fairly short strings, while retaining the cap of 8 samples for long ones. Always hash the last word to ensure that the end of the string is covered. For strings shorter than a word, use fewer loads and a single reduction step. (cherry picked from commit 3a93e301ddc913758abe05c876aa3016e8b23af8) --- diff --git a/src/fns.c b/src/fns.c index 918ba0370e8..f94e8519957 100644 --- a/src/fns.c +++ b/src/fns.c @@ -5069,24 +5069,49 @@ hash_string (char const *ptr, ptrdiff_t len) EMACS_UINT hash = len; /* At most 8 steps. We could reuse SXHASH_MAX_LEN, of course, * but dividing by 8 is cheaper. */ - ptrdiff_t step = sizeof hash + ((end - p) >> 3); + ptrdiff_t step = max (sizeof hash, ((end - p) >> 3)); - while (p + sizeof hash <= end) + if (p + sizeof hash <= end) { + do + { + EMACS_UINT c; + /* We presume that the compiler will replace this `memcpy` with + a single load/move instruction when applicable. */ + memcpy (&c, p, sizeof hash); + p += step; + hash = sxhash_combine (hash, c); + } + while (p + sizeof hash <= end); + /* Hash the last wordful of bytes in the string, because that is + is often the part where strings differ. This may cause some + bytes to be hashed twice but we assume that's not a big problem. */ EMACS_UINT c; - /* We presume that the compiler will replace this `memcpy` with - a single load/move instruction when applicable. */ - memcpy (&c, p, sizeof hash); - p += step; + memcpy (&c, end - sizeof c, sizeof c); hash = sxhash_combine (hash, c); } - /* A few last bytes may remain (smaller than an EMACS_UINT). */ - /* FIXME: We could do this without a loop, but it'd require - endian-dependent code :-( */ - while (p < end) + else { - unsigned char c = *p++; - hash = sxhash_combine (hash, c); + /* String is shorter than an EMACS_UINT. Use smaller loads. */ + eassume (p <= end && end - p < sizeof (EMACS_UINT)); + EMACS_UINT tail = 0; + if (end - p >= 4) + { + uint32_t c; + memcpy (&c, p, sizeof c); + tail = (tail << (8 * sizeof c)) + c; + p += sizeof c; + } + if (end - p >= 2) + { + uint16_t c; + memcpy (&c, p, sizeof c); + tail = (tail << (8 * sizeof c)) + c; + p += sizeof c; + } + if (p < end) + tail = (tail << 8) + (unsigned char)*p; + hash = sxhash_combine (hash, tail); } return hash;