From c88a3be8087ad0165415aa87c01f868a7433cb21 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Mon, 20 Apr 2020 22:26:30 -0700
Subject: [PATCH] Fix string-to-multibyte overlong sequence bug

* src/character.h (MULTIBYTE_LENGTH, MULTIBYTE_LENGTH_NO_CHECK):
Remove, replacing with ...
(multibyte_length): ... this new function.  All callers changed.
The new function rejects overlong multibyte forms.
* test/src/buffer-tests.el (buffer-multibyte-overlong-sequences):
New test.
---
 src/buffer.c             |  3 +-
 src/character.c          | 42 ++++++++++--------
 src/character.h          | 96 +++++++++++++++++++++++-----------------
 src/coding.c             | 14 +++---
 test/src/buffer-tests.el | 14 ++++++
 5 files changed, 101 insertions(+), 68 deletions(-)

diff --git a/src/buffer.c b/src/buffer.c
index 5398414e6eb..53b3bd960c4 100644
--- a/src/buffer.c
+++ b/src/buffer.c
@@ -2634,8 +2634,7 @@ current buffer is cleared.  */)
 	  if (ASCII_CHAR_P (*p))
 	    p++, pos++;
 	  else if (EQ (flag, Qt)
-		   && ! CHAR_BYTE8_HEAD_P (*p)
-		   && (bytes = MULTIBYTE_LENGTH (p, pend)) > 0)
+		   && 0 < (bytes = multibyte_length (p, pend, true, false)))
 	    p += bytes, pos += bytes;
 	  else
 	    {
diff --git a/src/character.c b/src/character.c
index 303c83ccec3..da09e77e131 100644
--- a/src/character.c
+++ b/src/character.c
@@ -486,7 +486,7 @@ multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 
   while (ptr < endp)
     {
-      int len = MULTIBYTE_LENGTH (ptr, endp);
+      int len = multibyte_length (ptr, endp, true, true);
 
       if (len == 0)
 	emacs_abort ();
@@ -508,7 +508,6 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 			ptrdiff_t *nchars, ptrdiff_t *nbytes)
 {
   const unsigned char *endp = str + len;
-  int n;
   ptrdiff_t chars = 0, bytes = 0;
 
   if (len >= MAX_MULTIBYTE_LENGTH)
@@ -516,8 +515,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
       while (str < adjusted_endp)
 	{
-	  if (! CHAR_BYTE8_HEAD_P (*str)
-	      && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
+	  int n = multibyte_length (str, NULL, false, false);
+	  if (0 < n)
 	    str += n, bytes += n;
 	  else
 	    str++, bytes += 2;
@@ -526,8 +525,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
     }
   while (str < endp)
     {
-      if (! CHAR_BYTE8_HEAD_P (*str)
-	  && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
+      int n = multibyte_length (str, endp, true, false);
+      if (0 < n)
 	str += n, bytes += n;
       else
 	str++, bytes += 2;
@@ -554,20 +553,25 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
   unsigned char *p = str, *endp = str + nbytes;
   unsigned char *to;
   ptrdiff_t chars = 0;
-  int n;
 
   if (nbytes >= MAX_MULTIBYTE_LENGTH)
     {
       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
-      while (p < adjusted_endp
-	     && ! CHAR_BYTE8_HEAD_P (*p)
-	     && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
-	p += n, chars++;
+      while (p < adjusted_endp)
+	{
+	  int n = multibyte_length (p, NULL, false, false);
+	  if (n <= 0)
+	    break;
+	  p += n, chars++;
+	}
+    }
+  while (true)
+    {
+      int n = multibyte_length (p, endp, true, false);
+      if (n <= 0)
+	break;
+      p += n, chars++;
     }
-  while (p < endp
-	 && ! CHAR_BYTE8_HEAD_P (*p)
-	 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
-    p += n, chars++;
   if (nchars)
     *nchars = chars;
   if (p == endp)
@@ -584,8 +588,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
       while (p < adjusted_endp)
 	{
-	  if (! CHAR_BYTE8_HEAD_P (*p)
-	      && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
+	  int n = multibyte_length (p, NULL, false, false);
+	  if (0 < n)
 	    {
 	      while (n--)
 		*to++ = *p++;
@@ -601,8 +605,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
     }
   while (p < endp)
     {
-      if (! CHAR_BYTE8_HEAD_P (*p)
-	  && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
+      int n = multibyte_length (p, endp, true, false);
+      if (0 < n)
 	{
 	  while (n--)
 	    *to++ = *p++;
diff --git a/src/character.h b/src/character.h
index 81320dedd17..4887473b27e 100644
--- a/src/character.h
+++ b/src/character.h
@@ -31,15 +31,19 @@ INLINE_HEADER_BEGIN
 /* character code	1st byte   byte sequence
    --------------	--------   -------------
         0-7F		00..7F	   0xxxxxxx
-       80-7FF		C2..DF	   110xxxxx 10xxxxxx
-      800-FFFF		E0..EF	   1110xxxx 10xxxxxx 10xxxxxx
-    10000-1FFFFF	F0..F7	   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-   200000-3FFF7F	F8	   11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx
+       80-7FF		C2..DF	   110yyyyx 10xxxxxx
+      800-FFFF		E0..EF	   1110yyyy 10yxxxxx 10xxxxxx
+    10000-1FFFFF	F0..F7	   11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+   200000-3FFF7F	F8	   11111000 1000yxxx 10xxxxxx 10xxxxxx 10xxxxxx
    3FFF80-3FFFFF	C0..C1	   1100000x 10xxxxxx (for eight-bit-char)
    400000-...		invalid
 
    invalid 1st byte	80..BF	   10xxxxxx
-			F9..FF	   11111xxx (xxx != 000)
+			F9..FF	   11111yyy
+
+   In each bit pattern, 'x' and 'y' each represent a single bit of the
+   character code payload, and least one 'y' must be a 1 bit.
+   In the 5-byte sequence, the 22-bit payload cannot exceed 3FFF7F.
 */
 
 /* Maximum character code ((1 << CHARACTERBITS) - 1).  */
@@ -284,7 +288,7 @@ CHAR_HEAD_P (int byte)
 }
 
 /* How many bytes a character that starts with BYTE occupies in a
-   multibyte form.  Unlike MULTIBYTE_LENGTH below, this function does not
+   multibyte form.  Unlike multibyte_length, this function does not
    validate the multibyte form, but looks only at its first byte.  */
 INLINE int
 BYTES_BY_CHAR_HEAD (int byte)
@@ -297,44 +301,54 @@ BYTES_BY_CHAR_HEAD (int byte)
 }
 
 
-/* The byte length of multibyte form at unibyte string P ending at
-   PEND.  If the string doesn't point to a valid multibyte form,
-   return 0.  Unlike BYTES_BY_CHAR_HEAD, this macro validates the
-   multibyte form.  */
+/* The byte length of the multibyte form at the unibyte string P,
+   ending at PEND if CHECK, and without a length check if !CHECK.
+   If ALLOW_8BIT, allow multibyte forms of eight-bit characters.
+   If the string doesn't point to a valid multibyte form, return 0.
+   Unlike BYTES_BY_CHAR_HEAD, this function validates the multibyte form.  */
 
 INLINE int
-MULTIBYTE_LENGTH (unsigned char const *p, unsigned char const *pend)
-{
-  return (! (p < pend) ? 0
-	  : ! (p[0] & 0x80) ? 1
-	  : ! (p + 1 < pend && (p[1] & 0xC0) == 0x80) ? 0
-	  : (p[0] & 0xE0) == 0xC0 ? 2
-	  : ! (p + 2 < pend && (p[2] & 0xC0) == 0x80) ? 0
-	  : (p[0] & 0xF0) == 0xE0 ? 3
-	  : ! (p + 3 < pend && (p[3] & 0xC0) == 0x80) ? 0
-	  : (p[0] & 0xF8) == 0xF0 ? 4
-	  : ! (p + 4 < pend && (p[4] & 0xC0) == 0x80) ? 0
-	  : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
-	  : 0);
-}
-
-
-/* Like MULTIBYTE_LENGTH, but don't check the ending address.  The
-   multibyte form is still validated, unlike BYTES_BY_CHAR_HEAD.  */
+multibyte_length (unsigned char const *p, unsigned char const *pend,
+		  bool check, bool allow_8bit)
+{
+  if (!check || p < pend)
+    {
+      unsigned char c = p[0];
+      if (c < 0x80)
+	return 1;
+      if (!check || p + 1 < pend)
+	{
+	  /* The 'unsigned int' avoids int overflow in the 5-byte case.  */
+	  unsigned int d = p[1];
+
+	  if (TRAILING_CODE_P (d))
+	    {
+	      if (allow_8bit ? (c & 0xE0) == 0xC0 : 0xC2 <= c && c <= 0xDF)
+		return 2;
+	      if ((!check || p + 2 < pend)
+		  && TRAILING_CODE_P (p[2]))
+		{
+		  if ((c & 0xF0) == 0xE0 && ((c & 0x0F) | (d & 0x20)))
+		    return 3;
+		  if ((!check || p + 3 < pend) && TRAILING_CODE_P (p[3]))
+		    {
+		      if ((c & 0xF8) == 0xF0 && ((c & 0x07) | (d & 0x30)))
+			return 4;
+		      if (c == 0xF8 && (!check || p + 4 < pend)
+			  && TRAILING_CODE_P (p[4]))
+			{
+			  unsigned int w = ((d << 24) + (p[2] << 16)
+					    + (p[3] << 8) + p[4]);
+			  if (0x88808080 <= w && w <= 0x8FBFBDBF)
+			    return 5;
+			}
+		    }
+		}
+	    }
+	}
+    }
 
-INLINE int
-MULTIBYTE_LENGTH_NO_CHECK (unsigned char const *p)
-{
-  return (!(p[0] & 0x80) ? 1
-	  : (p[1] & 0xC0) != 0x80 ? 0
-	  : (p[0] & 0xE0) == 0xC0 ? 2
-	  : (p[2] & 0xC0) != 0x80 ? 0
-	  : (p[0] & 0xF0) == 0xE0 ? 3
-	  : (p[3] & 0xC0) != 0x80 ? 0
-	  : (p[0] & 0xF8) == 0xF0 ? 4
-	  : (p[4] & 0xC0) != 0x80 ? 0
-	  : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
-	  : 0);
+  return 0;
 }
 
 
diff --git a/src/coding.c b/src/coding.c
index 716b0d99792..34f36d5a86a 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -7670,15 +7670,17 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table,
 
       if (! multibytep)
 	{
-	  int bytes;
-
 	  if (coding->encoder == encode_coding_raw_text
 	      || coding->encoder == encode_coding_ccl)
 	    c = *src++, pos++;
-	  else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
-	    c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
 	  else
-	    c = BYTE8_TO_CHAR (*src), src++, pos++;
+	    {
+	      int bytes = multibyte_length (src, src_end, true, true);
+	      if (0 < bytes)
+		c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
+	      else
+		c = BYTE8_TO_CHAR (*src), src++, pos++;
+	    }
 	}
       else
 	c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
@@ -7727,7 +7729,7 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table,
 	  for (i = 1; i < to_nchars; i++)
 	    *buf++ = XFIXNUM (AREF (trans, i));
 	  for (i = 1; i < from_nchars; i++, pos++)
-	    src += MULTIBYTE_LENGTH_NO_CHECK (src);
+	    src += multibyte_length (src, NULL, false, true);
 	}
     }
 
diff --git a/test/src/buffer-tests.el b/test/src/buffer-tests.el
index 1c356698f66..6e87cb94897 100644
--- a/test/src/buffer-tests.el
+++ b/test/src/buffer-tests.el
@@ -1313,4 +1313,18 @@ with parameters from the *Messages* buffer modification."
         (ovshould nonempty-eob-end 4 5)
         (ovshould empty-eob        5 5)))))
 
+(ert-deftest buffer-multibyte-overlong-sequences ()
+  (dolist (uni '("\xE0\x80\x80"
+                 "\xF0\x80\x80\x80"
+                 "\xF8\x8F\xBF\xBF\x80"))
+    (let ((multi (string-to-multibyte uni)))
+      (should
+       (string-equal
+        multi
+        (with-temp-buffer
+          (set-buffer-multibyte nil)
+          (insert uni)
+          (set-buffer-multibyte t)
+          (buffer-string)))))))
+
 ;;; buffer-tests.el ends here
-- 
2.39.5