From c88a3be8087ad0165415aa87c01f868a7433cb21 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 20 Apr 2020 22:26:30 -0700 Subject: [PATCH] Fix string-to-multibyte overlong sequence bug * src/character.h (MULTIBYTE_LENGTH, MULTIBYTE_LENGTH_NO_CHECK): Remove, replacing with ... (multibyte_length): ... this new function. All callers changed. The new function rejects overlong multibyte forms. * test/src/buffer-tests.el (buffer-multibyte-overlong-sequences): New test. --- src/buffer.c | 3 +- src/character.c | 42 ++++++++++-------- src/character.h | 96 +++++++++++++++++++++++----------------- src/coding.c | 14 +++--- test/src/buffer-tests.el | 14 ++++++ 5 files changed, 101 insertions(+), 68 deletions(-) diff --git a/src/buffer.c b/src/buffer.c index 5398414e6eb..53b3bd960c4 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -2634,8 +2634,7 @@ current buffer is cleared. */) if (ASCII_CHAR_P (*p)) p++, pos++; else if (EQ (flag, Qt) - && ! CHAR_BYTE8_HEAD_P (*p) - && (bytes = MULTIBYTE_LENGTH (p, pend)) > 0) + && 0 < (bytes = multibyte_length (p, pend, true, false))) p += bytes, pos += bytes; else { diff --git a/src/character.c b/src/character.c index 303c83ccec3..da09e77e131 100644 --- a/src/character.c +++ b/src/character.c @@ -486,7 +486,7 @@ multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes) while (ptr < endp) { - int len = MULTIBYTE_LENGTH (ptr, endp); + int len = multibyte_length (ptr, endp, true, true); if (len == 0) emacs_abort (); @@ -508,7 +508,6 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len, ptrdiff_t *nchars, ptrdiff_t *nbytes) { const unsigned char *endp = str + len; - int n; ptrdiff_t chars = 0, bytes = 0; if (len >= MAX_MULTIBYTE_LENGTH) @@ -516,8 +515,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len, const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; while (str < adjusted_endp) { - if (! CHAR_BYTE8_HEAD_P (*str) - && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0) + int n = multibyte_length (str, NULL, false, false); + if (0 < n) str += n, bytes += n; else str++, bytes += 2; @@ -526,8 +525,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len, } while (str < endp) { - if (! CHAR_BYTE8_HEAD_P (*str) - && (n = MULTIBYTE_LENGTH (str, endp)) > 0) + int n = multibyte_length (str, endp, true, false); + if (0 < n) str += n, bytes += n; else str++, bytes += 2; @@ -554,20 +553,25 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes, unsigned char *p = str, *endp = str + nbytes; unsigned char *to; ptrdiff_t chars = 0; - int n; if (nbytes >= MAX_MULTIBYTE_LENGTH) { unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; - while (p < adjusted_endp - && ! CHAR_BYTE8_HEAD_P (*p) - && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) - p += n, chars++; + while (p < adjusted_endp) + { + int n = multibyte_length (p, NULL, false, false); + if (n <= 0) + break; + p += n, chars++; + } + } + while (true) + { + int n = multibyte_length (p, endp, true, false); + if (n <= 0) + break; + p += n, chars++; } - while (p < endp - && ! CHAR_BYTE8_HEAD_P (*p) - && (n = MULTIBYTE_LENGTH (p, endp)) > 0) - p += n, chars++; if (nchars) *nchars = chars; if (p == endp) @@ -584,8 +588,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes, unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; while (p < adjusted_endp) { - if (! CHAR_BYTE8_HEAD_P (*p) - && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) + int n = multibyte_length (p, NULL, false, false); + if (0 < n) { while (n--) *to++ = *p++; @@ -601,8 +605,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes, } while (p < endp) { - if (! CHAR_BYTE8_HEAD_P (*p) - && (n = MULTIBYTE_LENGTH (p, endp)) > 0) + int n = multibyte_length (p, endp, true, false); + if (0 < n) { while (n--) *to++ = *p++; diff --git a/src/character.h b/src/character.h index 81320dedd17..4887473b27e 100644 --- a/src/character.h +++ b/src/character.h @@ -31,15 +31,19 @@ INLINE_HEADER_BEGIN /* character code 1st byte byte sequence -------------- -------- ------------- 0-7F 00..7F 0xxxxxxx - 80-7FF C2..DF 110xxxxx 10xxxxxx - 800-FFFF E0..EF 1110xxxx 10xxxxxx 10xxxxxx - 10000-1FFFFF F0..F7 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 200000-3FFF7F F8 11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx + 80-7FF C2..DF 110yyyyx 10xxxxxx + 800-FFFF E0..EF 1110yyyy 10yxxxxx 10xxxxxx + 10000-1FFFFF F0..F7 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx + 200000-3FFF7F F8 11111000 1000yxxx 10xxxxxx 10xxxxxx 10xxxxxx 3FFF80-3FFFFF C0..C1 1100000x 10xxxxxx (for eight-bit-char) 400000-... invalid invalid 1st byte 80..BF 10xxxxxx - F9..FF 11111xxx (xxx != 000) + F9..FF 11111yyy + + In each bit pattern, 'x' and 'y' each represent a single bit of the + character code payload, and least one 'y' must be a 1 bit. + In the 5-byte sequence, the 22-bit payload cannot exceed 3FFF7F. */ /* Maximum character code ((1 << CHARACTERBITS) - 1). */ @@ -284,7 +288,7 @@ CHAR_HEAD_P (int byte) } /* How many bytes a character that starts with BYTE occupies in a - multibyte form. Unlike MULTIBYTE_LENGTH below, this function does not + multibyte form. Unlike multibyte_length, this function does not validate the multibyte form, but looks only at its first byte. */ INLINE int BYTES_BY_CHAR_HEAD (int byte) @@ -297,44 +301,54 @@ BYTES_BY_CHAR_HEAD (int byte) } -/* The byte length of multibyte form at unibyte string P ending at - PEND. If the string doesn't point to a valid multibyte form, - return 0. Unlike BYTES_BY_CHAR_HEAD, this macro validates the - multibyte form. */ +/* The byte length of the multibyte form at the unibyte string P, + ending at PEND if CHECK, and without a length check if !CHECK. + If ALLOW_8BIT, allow multibyte forms of eight-bit characters. + If the string doesn't point to a valid multibyte form, return 0. + Unlike BYTES_BY_CHAR_HEAD, this function validates the multibyte form. */ INLINE int -MULTIBYTE_LENGTH (unsigned char const *p, unsigned char const *pend) -{ - return (! (p < pend) ? 0 - : ! (p[0] & 0x80) ? 1 - : ! (p + 1 < pend && (p[1] & 0xC0) == 0x80) ? 0 - : (p[0] & 0xE0) == 0xC0 ? 2 - : ! (p + 2 < pend && (p[2] & 0xC0) == 0x80) ? 0 - : (p[0] & 0xF0) == 0xE0 ? 3 - : ! (p + 3 < pend && (p[3] & 0xC0) == 0x80) ? 0 - : (p[0] & 0xF8) == 0xF0 ? 4 - : ! (p + 4 < pend && (p[4] & 0xC0) == 0x80) ? 0 - : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5 - : 0); -} - - -/* Like MULTIBYTE_LENGTH, but don't check the ending address. The - multibyte form is still validated, unlike BYTES_BY_CHAR_HEAD. */ +multibyte_length (unsigned char const *p, unsigned char const *pend, + bool check, bool allow_8bit) +{ + if (!check || p < pend) + { + unsigned char c = p[0]; + if (c < 0x80) + return 1; + if (!check || p + 1 < pend) + { + /* The 'unsigned int' avoids int overflow in the 5-byte case. */ + unsigned int d = p[1]; + + if (TRAILING_CODE_P (d)) + { + if (allow_8bit ? (c & 0xE0) == 0xC0 : 0xC2 <= c && c <= 0xDF) + return 2; + if ((!check || p + 2 < pend) + && TRAILING_CODE_P (p[2])) + { + if ((c & 0xF0) == 0xE0 && ((c & 0x0F) | (d & 0x20))) + return 3; + if ((!check || p + 3 < pend) && TRAILING_CODE_P (p[3])) + { + if ((c & 0xF8) == 0xF0 && ((c & 0x07) | (d & 0x30))) + return 4; + if (c == 0xF8 && (!check || p + 4 < pend) + && TRAILING_CODE_P (p[4])) + { + unsigned int w = ((d << 24) + (p[2] << 16) + + (p[3] << 8) + p[4]); + if (0x88808080 <= w && w <= 0x8FBFBDBF) + return 5; + } + } + } + } + } + } -INLINE int -MULTIBYTE_LENGTH_NO_CHECK (unsigned char const *p) -{ - return (!(p[0] & 0x80) ? 1 - : (p[1] & 0xC0) != 0x80 ? 0 - : (p[0] & 0xE0) == 0xC0 ? 2 - : (p[2] & 0xC0) != 0x80 ? 0 - : (p[0] & 0xF0) == 0xE0 ? 3 - : (p[3] & 0xC0) != 0x80 ? 0 - : (p[0] & 0xF8) == 0xF0 ? 4 - : (p[4] & 0xC0) != 0x80 ? 0 - : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5 - : 0); + return 0; } diff --git a/src/coding.c b/src/coding.c index 716b0d99792..34f36d5a86a 100644 --- a/src/coding.c +++ b/src/coding.c @@ -7670,15 +7670,17 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table, if (! multibytep) { - int bytes; - if (coding->encoder == encode_coding_raw_text || coding->encoder == encode_coding_ccl) c = *src++, pos++; - else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0) - c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes; else - c = BYTE8_TO_CHAR (*src), src++, pos++; + { + int bytes = multibyte_length (src, src_end, true, true); + if (0 < bytes) + c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes; + else + c = BYTE8_TO_CHAR (*src), src++, pos++; + } } else c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++; @@ -7727,7 +7729,7 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table, for (i = 1; i < to_nchars; i++) *buf++ = XFIXNUM (AREF (trans, i)); for (i = 1; i < from_nchars; i++, pos++) - src += MULTIBYTE_LENGTH_NO_CHECK (src); + src += multibyte_length (src, NULL, false, true); } } diff --git a/test/src/buffer-tests.el b/test/src/buffer-tests.el index 1c356698f66..6e87cb94897 100644 --- a/test/src/buffer-tests.el +++ b/test/src/buffer-tests.el @@ -1313,4 +1313,18 @@ with parameters from the *Messages* buffer modification." (ovshould nonempty-eob-end 4 5) (ovshould empty-eob 5 5))))) +(ert-deftest buffer-multibyte-overlong-sequences () + (dolist (uni '("\xE0\x80\x80" + "\xF0\x80\x80\x80" + "\xF8\x8F\xBF\xBF\x80")) + (let ((multi (string-to-multibyte uni))) + (should + (string-equal + multi + (with-temp-buffer + (set-buffer-multibyte nil) + (insert uni) + (set-buffer-multibyte t) + (buffer-string))))))) + ;;; buffer-tests.el ends here -- 2.39.2