From ed2def7d5e423388ca75c6e10fd7b42e0c4789c7 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sun, 26 Apr 2020 15:18:49 -0700 Subject: [PATCH] Improve string_char_and_length speed MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This tweak improved the CPU time performance of ‘make compile-always’ by about 1.7% on my platform. * src/character.c (string_char): Remove; no longer used. * src/character.h (string_char_and_length): Redo so that it needn’t call string_char. This helps the caller, which can now become a leaf function. --- src/character.c | 45 --------------------------------------------- src/character.h | 47 +++++++++++++++++++++++++++-------------------- 2 files changed, 27 insertions(+), 65 deletions(-) diff --git a/src/character.c b/src/character.c index edcec5f1c79..4902e564b1d 100644 --- a/src/character.c +++ b/src/character.c @@ -141,51 +141,6 @@ char_string (unsigned int c, unsigned char *p) } -/* Return a character whose multibyte form is at P. Set *LEN to the - byte length of the multibyte form. */ - -int -string_char (const unsigned char *p, int *len) -{ - int c; - const unsigned char *saved_p = p; - - if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10)) - { - /* 1-, 2-, and 3-byte sequences can be handled by the macro. */ - c = string_char_advance (&p); - } - else if (! (*p & 0x08)) - { - /* A 4-byte sequence of this form: - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ - c = ((((p)[0] & 0x7) << 18) - | (((p)[1] & 0x3F) << 12) - | (((p)[2] & 0x3F) << 6) - | ((p)[3] & 0x3F)); - p += 4; - } - else - { - /* A 5-byte sequence of this form: - - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - - Note that the top 4 `x's are always 0, so shifting p[1] can - never exceed the maximum valid character codepoint. */ - c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */ - (((p)[1] & 0x3F) << 18) - | (((p)[2] & 0x3F) << 12) - | (((p)[3] & 0x3F) << 6) - | ((p)[4] & 0x3F)); - p += 5; - } - - *len = p - saved_p; - return c; -} - - /* Translate character C by translation table TABLE. If no translation is found in TABLE, return the untranslated character. If TABLE is a list, elements are char tables. In that case, recursively translate C by all the diff --git a/src/character.h b/src/character.h index 4887473b27e..d4d77504426 100644 --- a/src/character.h +++ b/src/character.h @@ -85,7 +85,6 @@ enum }; extern int char_string (unsigned, unsigned char *); -extern int string_char (const unsigned char *, int *); /* UTF-8 encodings. Use \x escapes, so they are portable to pre-C11 compilers and can be concatenated with ordinary string literals. */ @@ -371,33 +370,41 @@ raw_prev_char_len (unsigned char const *p) INLINE int string_char_and_length (unsigned char const *p, int *length) { - int c, len; + int c = p[0]; + if (! (c & 0x80)) + { + *length = 1; + return c; + } + eassume (0xC0 <= c); - if (! (p[0] & 0x80)) + int d = (c << 6) + p[1] - ((0xC0 << 6) + 0x80); + if (! (c & 0x20)) { - len = 1; - c = p[0]; + *length = 2; + return d + (c < 0xC2 ? 0x3FFF80 : 0); } - else if (! (p[0] & 0x20)) + + d = (d << 6) + p[2] - ((0x20 << 12) + 0x80); + if (! (c & 0x10)) { - len = 2; - c = ((((p[0] & 0x1F) << 6) - | (p[1] & 0x3F)) - + (p[0] < 0xC2 ? 0x3FFF80 : 0)); + *length = 3; + eassume (MAX_2_BYTE_CHAR < d && d <= MAX_3_BYTE_CHAR); + return d; } - else if (! (p[0] & 0x10)) + + d = (d << 6) + p[3] - ((0x10 << 18) + 0x80); + if (! (c & 0x08)) { - len = 3; - c = (((p[0] & 0x0F) << 12) - | ((p[1] & 0x3F) << 6) - | (p[2] & 0x3F)); + *length = 4; + eassume (MAX_3_BYTE_CHAR < d && d <= MAX_4_BYTE_CHAR); + return d; } - else - c = string_char (p, &len); - eassume (0 < len && len <= MAX_MULTIBYTE_LENGTH); - *length = len; - return c; + d = (d << 6) + p[4] - ((0x08 << 24) + 0x80); + *length = 5; + eassume (MAX_4_BYTE_CHAR < d && d <= MAX_5_BYTE_CHAR); + return d; } /* Return the character code of character whose multibyte form is at P. */ -- 2.39.2