From: K. Handa <handa@gnu.org>
Date: Sun, 4 Aug 2019 12:14:26 +0000 (+0900)
Subject: Add Unicode-safe UTF-8 converter
X-Git-Tag: emacs-27.0.90~1713
X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=a8026dfde9734a03ad03a9872ec801871dd1d81a;p=emacs.git

Add Unicode-safe UTF-8 converter

* src/coding.c (encode_string_utf_8, decode_string_utf_8): New functions.
* src/coding.h (encode_string_utf_8, decode_string_utf_8): Extern them.
---

diff --git a/src/coding.c b/src/coding.c
index 189a4b39d15..ab0e15119f3 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -9515,6 +9515,732 @@ code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
 }
 
+
+/* Move the gap of BUFFER to point, ensure it can hold at least NBYTES
+   bytes (enlarging it if necessary), and return the gap's address.  */
+
+static unsigned char *
+get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes)
+{
+  struct buffer *buf = XBUFFER (buffer);
+
+  if (BUF_GPT (buf) != BUF_PT (buf))
+    {
+      struct buffer *oldb = current_buffer;
+      /* PT and PT_BYTE below are evaluated with BUF made current.  */
+      current_buffer = buf;
+      move_gap_both (PT, PT_BYTE);
+      current_buffer = oldb;
+    }
+  if (BUF_GAP_SIZE (buf) < nbytes)
+    make_gap_1 (buf, nbytes);
+  return BUF_GPT_ADDR (buf);
+}
+
+/* Return a pointer to the byte sequence for C, and set the length in
+   LEN.  The result points into a static two-entry cache, so it may be
+   overwritten by later calls with other characters.  This is used to
+   get the bytes for HANDLE_8_BIT and HANDLE_OVER_UNI arguments of
+   encode_string_utf_8 and decode_string_utf_8 given as characters.  */
+
+static unsigned char *
+get_char_bytes (int c, int *len)
+{
+  /* We use two caches considering the situation that
+     encode/decode_string_utf_8 are called repeatedly with the same
+     values for HANDLE_8_BIT and HANDLE_OVER_UNI arguments.  */
+  static int chars[2] = { -1, -1 };	/* -1 marks an empty slot.  */
+  static unsigned char bytes[2][6];
+  static int nbytes[2];
+  static int last_index;
+
+  if (chars[last_index] == c)
+    {
+      *len = nbytes[last_index];
+      return bytes[last_index];
+    }
+  if (chars[1 - last_index] == c)
+    {
+      *len = nbytes[1 - last_index];
+      return bytes[1 - last_index];
+    }
+  last_index = 1 - last_index;
+  chars[last_index] = c;
+  *len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]);
+  return bytes[last_index];
+}
+
+/* Encode STRING by the coding system utf-8-unix.
+
+   Even if :pre-write-conversion and :encode-translation-table
+   properties are put to that coding system, they are ignored.
+
+   As a special case, if BUFFER is Qnil and STRING contains only ASCII
+   characters, STRING itself is returned regardless of NOCOPY.
+
+   This function assumes that arguments have values as described
+   below.  The validity must be assured by callers.
+
+   STRING is a multibyte string or an ASCII-only unibyte string.
+
+   BUFFER is a unibyte buffer or Qnil.
+
+   If BUFFER is a unibyte buffer, the encoding result of UTF-8
+   sequence is inserted after point of the buffer, and the number of
+   inserted characters is returned.  Note that a caller should have
+   made BUFFER ready for modifying in advance (e.g. by calling
+   invalidate_buffer_caches).
+
+   If BUFFER is Qnil, a unibyte string is made from the encoding
+   result of UTF-8 sequence, and it is returned.  If NOCOPY and STRING
+   contains only Unicode characters (i.e. the encoding does not change
+   the byte sequence), STRING is returned even if it is multibyte.
+
+   HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode
+   character.  The former is for an eight-bit character (represented
+   by 2-byte overlong sequence in multibyte STRING).  The latter is
+   for an over-unicode character (a character whose code is greater
+   than the maximum Unicode character 0x10FFFF, and is represented by
+   4 or 5-byte sequence in multibyte STRING).
+
+   If they are unibyte strings (typically "\357\277\275"; UTF-8
+   sequence for the Unicode REPLACEMENT CHARACTER #xFFFD), a
+   non-Unicode character is encoded into that sequence.
+
+   If they are characters, a non-Unicode character is encoded into the
+   corresponding UTF-8 sequences.
+
+   If they are Qignored, a non-Unicode character is skipped on
+   encoding.
+
+   If HANDLE-8-BIT is Qt, an eight-bit character is encoded into one
+   byte of the same value.
+
+   If HANDLE-OVER-UNI is Qt, an over-unicode character is encoded
+   into the same 4 or 5-byte sequence.
+
+   If they are Qnil, Qnil is returned if STRING has a non-Unicode
+   character. */
+
+Lisp_Object
+encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
+		     bool nocopy, Lisp_Object handle_8_bit,
+		     Lisp_Object handle_over_uni)
+{
+  ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
+  if (NILP (buffer) && nchars == nbytes)
+    /* STRING contains only ASCII characters. */
+    return string;
+
+  ptrdiff_t num_8_bit = 0;   /* number of eight-bit chars in STRING */
+  /* The following two vars are counted only if handle_over_uni is not Qt */
+  ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
+  ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
+  ptrdiff_t outbytes;	     /* number of bytes of encoding result. */
+  unsigned char *p = SDATA (string);
+  unsigned char *pend = p + nbytes;
+  unsigned char *src = NULL, *dst = NULL;
+  unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
+  int replace_8_bit_len = 0, replace_over_uni_len = 0;
+  Lisp_Object val;		/* the return value */
+
+  /* Scan bytes in STRING twice.  The first scan is to count non-Unicode
+     characters, and the second scan is to encode STRING.  If the
+     encoding is trivial (no need of changing the byte sequence),
+     the second scan is avoided.  */
+  for (int scan_count = 0; scan_count < 2; scan_count++)
+    {
+      while (p < pend)
+	{
+	  if (nchars == pend - p)
+	    /* There is no multibyte character remaining.  */
+	    break;
+
+	  int c = *p;
+	  int len = BYTES_BY_CHAR_HEAD (c);
+
+	  nchars--;
+	  if (len == 1
+	      || len == 3
+	      || (len == 2 ? ! CHAR_BYTE8_HEAD_P (c)
+		  : (EQ (handle_over_uni, Qt)
+		     || (len == 4
+			 && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR))))
+	    {
+	      p += len;
+	      continue;
+	    }
+
+	  /* A character to change the byte sequence on encoding was
+	     found.  A rare case. */
+	  if (len == 2)
+	    {
+	      /* Handle an eight-bit character by handle_8_bit. */
+	      if (scan_count == 0)
+		{
+		  if (NILP (handle_8_bit))
+		    return Qnil;
+		  num_8_bit++;
+		}
+	      else
+		{
+		  if (src < p)
+		    {
+		      memcpy (dst, src, p - src);
+		      dst += p - src;
+		    }
+		  if (replace_8_bit_len > 0)
+		    {
+		      memcpy (dst, replace_8_bit, replace_8_bit_len);
+		      dst += replace_8_bit_len;
+		    }
+		  else if (EQ (handle_8_bit, Qt))
+		    {
+		      int char8 = STRING_CHAR (p);
+		      *dst++ = CHAR_TO_BYTE8 (char8);
+		    }
+		}
+	    }
+	  else			/* len == 4 or 5 */
+	    {
+	      /* Handle an over-unicode character by handle_over_uni. */
+	      if (scan_count == 0)
+		{
+		  if (NILP (handle_over_uni))
+		    return Qnil;
+		  if (len == 4)
+		    num_over_4++;
+		  else
+		    num_over_5++;
+		}
+	      else
+		{
+		  if (src < p)
+		    {
+		      memcpy (dst, src, p - src);
+		      dst += p - src;
+		    }
+		  if (replace_over_uni_len > 0)
+		    {
+		      memcpy (dst, replace_over_uni, replace_over_uni_len);
+		      dst += replace_over_uni_len;
+		    }
+		}
+	    }
+	  p += len;
+	  src = p;
+	}
+
+      if (scan_count == 0)
+	{
+	  /* End of the first scan */
+	  outbytes = nbytes;
+	  if (num_8_bit == 0
+	      && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
+	    {
+	      /* We can break the loop because there is no need of
+		 changing the byte sequence.  This is the typical
+		 case. */
+	      scan_count = 1;
+	    }
+	  else
+	    {
+	      /* Prepare for the next scan to handle non-Unicode characters. */
+	      if (num_8_bit > 0)
+		{
+		  if (CHARACTERP (handle_8_bit))
+		    replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
+						    &replace_8_bit_len);
+		  else if (STRINGP (handle_8_bit))
+		    {
+		      replace_8_bit = SDATA (handle_8_bit);
+		      replace_8_bit_len = SBYTES (handle_8_bit);
+		    }
+		  if (replace_8_bit)
+		    outbytes += (replace_8_bit_len - 2) * num_8_bit;
+		  else if (EQ (handle_8_bit, Qignored))
+		    outbytes -= 2 * num_8_bit;
+		  else if (EQ (handle_8_bit, Qt))
+		    outbytes -= num_8_bit;
+		  else
+		    return Qnil;
+		}
+	      if (num_over_4 + num_over_5 > 0)
+		{
+		  if (CHARACTERP (handle_over_uni))
+		    replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
+						       &replace_over_uni_len);
+		  else if (STRINGP (handle_over_uni))
+		    {
+		      replace_over_uni = SDATA (handle_over_uni);
+		      replace_over_uni_len = SBYTES (handle_over_uni);
+		    }
+		  if (num_over_4 > 0)
+		    {
+		      if (replace_over_uni)
+			outbytes += (replace_over_uni_len - 4) * num_over_4;
+		      else if (EQ (handle_over_uni, Qignored))
+			outbytes -= 4 * num_over_4;
+		      else if (! EQ (handle_over_uni, Qt))
+			return Qnil;
+		    }
+		  if (num_over_5 > 0)
+		    {
+		      if (replace_over_uni)
+			outbytes += (replace_over_uni_len - 5) * num_over_5;
+		      else if (EQ (handle_over_uni, Qignored))
+			outbytes -= 5 * num_over_5;
+		      else if (! EQ (handle_over_uni, Qt))
+			return Qnil;
+		    }
+		}
+	    }
+
+	  /* Prepare the return value and room for OUTBYTES (maybe > NBYTES).  */
+	  if (BUFFERP (buffer))
+	    {
+	      val = make_fixnum (outbytes);
+	      dst = get_buffer_gap_address (buffer, outbytes);
+	    }
+	  else
+	    {
+	      if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
+		return string;
+	      val = make_uninit_string (outbytes);
+	      dst = SDATA (val);
+	    }
+	  p = src = SDATA (string);
+	}
+    }
+
+  if (src < pend)
+    memcpy (dst, src, pend - src);
+  if (BUFFERP (buffer))
+    {
+      struct buffer *oldb = current_buffer;
+
+      current_buffer = XBUFFER (buffer);
+      insert_from_gap (outbytes, outbytes, false);
+      current_buffer = oldb;
+    }
+  return val;
+}
+
+/* Decode STRING by the coding system utf-8-unix.
+
+   Even if :post-read-conversion and :decode-translation-table
+   properties are put to that coding system, they are ignored.
+
+   This function assumes that arguments have values as described
+   below.  The validity must be assured by callers.
+
+   STRING is a unibyte string or an ASCII-only multibyte string.
+
+   BUFFER is a multibyte buffer or Qnil.
+
+   If BUFFER is a multibyte buffer, the decoding result of Unicode
+   characters are inserted after point of the buffer, and the number
+   of inserted characters is returned.  Note that a caller should have
+   made BUFFER ready for modifying in advance (e.g. by calling
+   invalidate_buffer_caches).
+
+   If BUFFER is Qnil, a multibyte string is made from the decoding
+   result of Unicode characters, and it is returned.  As a special
+   case, STRING itself is returned in the following cases:
+   1. STRING contains only ASCII characters.
+   2. NOCOPY, and STRING contains only valid UTF-8 sequences.
+
+   HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle an invalid
+   byte sequence.  The former is for a 1-byte invalid sequence that
+   violates the fundamental UTF-8 encoding rule (including the lead
+   byte of a truncated sequence).  The latter is for a complete 4 or
+   5-byte sequence that Emacs internally uses to represent an
+   over-unicode character (code greater than #x10FFFF).  Note that
+   this function does not treat an overlong UTF-8 sequence as invalid.
+
+   If they are strings (typically 1-char string of the Unicode
+   REPLACEMENT CHARACTER #xFFFD), an invalid sequence is decoded into
+   that string.  They must be multibyte strings if they contain a
+   non-ASCII character.
+
+   If they are characters, an invalid sequence is decoded into the
+   corresponding multibyte representation of the characters.
+
+   If they are Qignored, an invalid sequence is skipped on decoding.
+
+   If HANDLE-8-BIT is Qt, a 1-byte invalid sequence is decoded into
+   the corresponding eight-bit character.
+
+   If HANDLE-OVER-UNI is Qt, a 4 or 5-byte invalid sequence that
+   follows Emacs' representation for an over-unicode character is
+   decoded into the corresponding character.
+
+   If they are Qnil, Qnil is returned if STRING has an invalid sequence.  */
+
+Lisp_Object
+decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
+		     bool nocopy, Lisp_Object handle_8_bit,
+		     Lisp_Object handle_over_uni)
+{
+  /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
+     and it returns 0 for invalid sequence.  */
+#define UTF_8_SEQUENCE_LENGTH(c)	\
+  ((c) < 0xC2 ? 0			\
+   : (c) < 0xE0 ? 2			\
+   : (c) < 0xF0 ? 3			\
+   : (c) < 0xF8 ? 4			\
+   : (c) == 0xF8 ? 5			\
+   : 0)
+
+  ptrdiff_t nbytes = SBYTES (string);
+  unsigned char *p = SDATA (string), *pend = p + nbytes;
+  ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences. */
+  ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences. */
+  ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences. */
+  ptrdiff_t outbytes = nbytes;	/* number of decoded bytes. */
+  ptrdiff_t outchars = 0;    /* number of decoded characters. */
+  unsigned char *src = NULL, *dst = NULL;
+  bool change_byte_sequence = false;
+
+  /* Scan bytes in STRING twice.  The first scan is to count invalid
+     sequences, and the second scan is to decode STRING.  If the
+     decoding is trivial (no need of changing the byte sequence),
+     the second scan is avoided.  */
+  while (p < pend)
+    {
+      src = p;
+      /* Try short cut for an ASCII-only case. */
+      while (p < pend && *p < 0x80) p++;
+      outchars += (p - src);
+      if (p == pend)
+	break;
+      int c = *p;
+      outchars++;
+      int len = UTF_8_SEQUENCE_LENGTH (c);
+      /* len == 0, 2, 3, 4, 5; MLEN is how many of those bytes exist.  */
+      int mlen = 1;
+      while (mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]))
+	mlen++;
+      if (mlen == len
+	  && (len <= 3
+	      || (len == 4
+		  && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)))
+	{
+	  p += len;
+	  continue;
+	}
+      /* A sequence to change on decoding was found (a rare case): either
+	 a stray or truncated lead byte, or an over-unicode sequence.  */
+      if (mlen != len)
+	{
+	  if (NILP (handle_8_bit))
+	    return Qnil;
+	  num_8_bit++;
+	  len = 1;
+	}
+      else			/* len == 4 or 5 */
+	{
+	  if (NILP (handle_over_uni))
+	    return Qnil;
+	  if (len == 4)
+	    num_over_4++;
+	  else
+	    num_over_5++;
+	}
+      change_byte_sequence = true;
+      p += len;
+    }
+
+  Lisp_Object val;	     /* the return value. */
+
+  if (! change_byte_sequence
+      && NILP (buffer))
+    {
+      if (nocopy)
+	return string;
+      val = make_uninit_multibyte_string (outchars, outbytes);
+      memcpy (SDATA (val), SDATA (string), pend - SDATA (string));
+      return val;
+    }
+
+  /* Count the number of resulting chars and bytes. */
+  unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
+  int replace_8_bit_len = 0, replace_over_uni_len = 0;
+
+  if (change_byte_sequence)
+    {
+      if (num_8_bit > 0)
+	{
+	  if (CHARACTERP (handle_8_bit))
+	    replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
+					    &replace_8_bit_len);
+	  else if (STRINGP (handle_8_bit))
+	    {
+	      replace_8_bit = SDATA (handle_8_bit);
+	      replace_8_bit_len = SBYTES (handle_8_bit);
+	    }
+	  if (replace_8_bit)
+	    outbytes += (replace_8_bit_len - 1) * num_8_bit;
+	  else if (EQ (handle_8_bit, Qignored))
+	    {
+	      outbytes -= num_8_bit;
+	      outchars -= num_8_bit;
+	    }
+	  else /* EQ (handle_8_bit, Qt)) */
+	    outbytes += num_8_bit;
+	}
+      if (num_over_4 + num_over_5 > 0)	/* Both kinds can coexist.  */
+	{
+	  if (CHARACTERP (handle_over_uni))
+	    replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
+					       &replace_over_uni_len);
+	  else if (STRINGP (handle_over_uni))
+	    {
+	      replace_over_uni = SDATA (handle_over_uni);
+	      replace_over_uni_len = SBYTES (handle_over_uni);
+	    }
+	  if (num_over_4 > 0)
+	    {
+	      if (replace_over_uni)
+		outbytes += (replace_over_uni_len - 4) * num_over_4;
+	      else if (EQ (handle_over_uni, Qignored))
+		{
+		  outbytes -= 4 * num_over_4;
+		  outchars -= num_over_4;
+		}
+	    }
+	  if (num_over_5 > 0)
+	    {
+	      if (replace_over_uni)
+		outbytes += (replace_over_uni_len - 5) * num_over_5;
+	      else if (EQ (handle_over_uni, Qignored))
+		{
+		  outbytes -= 5 * num_over_5;
+		  outchars -= num_over_5;
+		}
+	    }
+	}
+    }
+
+  /* Prepare a return value and a space to store the decoded bytes. */
+  if (BUFFERP (buffer))
+    {
+      val = make_fixnum (outchars);
+      dst = get_buffer_gap_address (buffer, outbytes);
+    }
+  else
+    {
+      if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
+	return string;
+      val = make_uninit_multibyte_string (outchars, outbytes);
+      dst = SDATA (val);
+    }
+
+  src = SDATA (string);
+  if (change_byte_sequence)
+    {
+      p = src;
+      while (p < pend)
+	{
+	  /* Try short cut for an ASCII-only case. */
+	  /* while (p < pend && *p < 0x80) p++; */
+	  /* if (p == pend) */
+	  /*   break; */
+	  int c = *p;
+	  if (c < 0x80)
+	    {
+	      p++;
+	      continue;
+	    }
+	  int len = UTF_8_SEQUENCE_LENGTH (c);
+	  if (len > 1)
+	    {
+	      int mlen;
+	      for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]);
+		   mlen++);
+	      if (mlen != len)
+		len = 0;	/* Truncated: handle C as an invalid byte.  */
+	      else if (len <= 3 || EQ (handle_over_uni, Qt)
+		       || (len == 4
+			   && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR))
+		{
+		  p += len;
+		  continue;
+		}
+	    }
+
+	  if (src < p)
+	    {
+	      memcpy (dst, src, p - src);
+	      dst += p - src;
+	    }
+	  if (len == 0)
+	    {
+	      if (replace_8_bit)
+		{
+		  memcpy (dst, replace_8_bit, replace_8_bit_len);
+		  dst += replace_8_bit_len;
+		}
+	      else if (EQ (handle_8_bit, Qt))
+		{
+		  dst += BYTE8_STRING (c, dst);
+		}
+	      len = 1;
+	    }
+	  else			/* len == 4 or 5 */
+	    {
+	      /* Handle p[0]... by handle_over_uni */
+	      if (replace_over_uni)
+		{
+		  memcpy (dst, replace_over_uni, replace_over_uni_len);
+		  dst += replace_over_uni_len;
+		}
+	    }
+	  p += len;
+	  src = p;
+	}
+    }
+
+  if (src < pend)
+    memcpy (dst, src, pend - src);
+  if (BUFFERP (buffer))
+    {
+      struct buffer *oldb = current_buffer;
+
+      current_buffer = XBUFFER (buffer);
+      insert_from_gap (outchars, outbytes, false);
+      current_buffer = oldb;
+    }
+  return val;
+}
+
+/* #define ENABLE_UTF_8_CONVERTER_TEST */
+
+#ifdef ENABLE_UTF_8_CONVERTER_TEST
+
+/* These functions are useful for testing and benchmarking
+   encode_string_utf_8 and decode_string_utf_8.  */
+
+/* ENCODE_METHOD specifies which internal encoder to use.
+   If it is Qnil, use encode_string_utf_8.
+   Otherwise, use code_convert_string.
+
+   COUNT, if integer, specifies how many times to call those functions
+   with the same arguments (for benchmarking). */
+
+DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8,
+       Sinternal_encode_string_utf_8, 7, 7, 0,
+       doc: /* Internal use only.*/)
+  (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
+   Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
+   Lisp_Object encode_method, Lisp_Object count)
+{
+  int repeat_count;
+  Lisp_Object val;
+
+  /* Check arguments.  Return Qnil when an argument is invalid.  */
+  if (! STRINGP (string))
+    return Qnil;
+  if (! NILP (buffer)
+      && (! BUFFERP (buffer)
+	  || ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
+    return Qnil;
+  if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
+      && ! EQ (handle_8_bit, Qignored)
+      && ! CHARACTERP (handle_8_bit)
+      && (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit)))
+    return Qnil;
+  if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
+      && ! EQ (handle_over_uni, Qignored)
+      && ! CHARACTERP (handle_over_uni)
+      && (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni)))
+    return Qnil;
+
+  CHECK_FIXNUM (count);
+  repeat_count = XFIXNUM (count);
+
+  val = Qnil;			/* Returned as is if COUNT is not positive.  */
+  /* Run an encoder according to ENCODE_METHOD.  */
+  if (NILP (encode_method))
+    {
+      for (int i = 0; i < repeat_count; i++)
+	val = encode_string_utf_8 (string, buffer, ! NILP (nocopy),
+				   handle_8_bit, handle_over_uni);
+    }
+  else
+    {
+      for (int i = 0; i < repeat_count; i++)
+	val = code_convert_string (string, Qutf_8_unix, Qnil, true,
+				   ! NILP (nocopy), true);
+    }
+  return val;
+}
+
+/* DECODE_METHOD specifies which internal decoder to use.
+   If it is Qnil, use decode_string_utf_8.
+   If it is Qt, use code_convert_string.
+   Otherwise, use make_string_from_utf8.
+
+   COUNT, if integer, specifies how many times to call those functions
+   with the same arguments (for benchmarking).  */
+
+DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8,
+       Sinternal_decode_string_utf_8, 7, 7, 0,
+       doc: /* Internal use only.*/)
+  (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
+   Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
+   Lisp_Object decode_method, Lisp_Object count)
+{
+  int repeat_count;
+  Lisp_Object val;
+
+  /* Check arguments.  Return Qnil when an argument is invalid.  */
+  if (! STRINGP (string))
+    return Qnil;
+  if (! NILP (buffer)
+      && (! BUFFERP (buffer)
+	  || NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
+    return Qnil;
+  if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
+      && ! EQ (handle_8_bit, Qignored)
+      && ! CHARACTERP (handle_8_bit)
+      && (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit)))
+    return Qnil;
+  if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
+      && ! EQ (handle_over_uni, Qignored)
+      && ! CHARACTERP (handle_over_uni)
+      && (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni)))
+    return Qnil;
+
+  CHECK_FIXNUM (count);
+  repeat_count = XFIXNUM (count);
+
+  val = Qnil;			/* Returned as is if COUNT is not positive.  */
+  /* Run a decoder according to DECODE_METHOD.  */
+  if (NILP (decode_method))
+    {
+      for (int i = 0; i < repeat_count; i++)
+	val = decode_string_utf_8 (string, buffer, ! NILP (nocopy),
+				   handle_8_bit, handle_over_uni);
+    }
+  else if (EQ (decode_method, Qt))
+    {
+      if (! BUFFERP (buffer))
+	buffer = Qt;
+      for (int i = 0; i < repeat_count; i++)
+	val = code_convert_string (string, Qutf_8_unix, buffer, false,
+				   ! NILP (nocopy), true);
+    }
+  else if (! NILP (decode_method))
+    {
+      for (int i = 0; i < repeat_count; i++)
+	val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string));
+    }
+  return val;
+}
+
+#endif	/* ENABLE_UTF_8_CONVERTER_TEST */
+
 /* Encode or decode a file name, to or from a unibyte string suitable
    for passing to C library functions.  */
 Lisp_Object
@@ -10974,6 +11700,10 @@ syms_of_coding (void)
   defsubr (&Sencode_coding_region);
   defsubr (&Sdecode_coding_string);
   defsubr (&Sencode_coding_string);
+#ifdef ENABLE_UTF_8_CONVERTER_TEST
+  defsubr (&Sinternal_encode_string_utf_8);
+  defsubr (&Sinternal_decode_string_utf_8);
+#endif	/* ENABLE_UTF_8_CONVERTER_TEST */
   defsubr (&Sdecode_sjis_char);
   defsubr (&Sencode_sjis_char);
   defsubr (&Sdecode_big5_char);
diff --git a/src/coding.h b/src/coding.h
index 70690d42d30..8efddbf55c4 100644
--- a/src/coding.h
+++ b/src/coding.h
@@ -689,6 +689,10 @@ extern Lisp_Object code_convert_string (Lisp_Object, Lisp_Object,
                                         Lisp_Object, bool, bool, bool);
 extern Lisp_Object code_convert_string_norecord (Lisp_Object, Lisp_Object,
                                                  bool);
+extern Lisp_Object encode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
+					Lisp_Object, Lisp_Object);
+extern Lisp_Object decode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
+					Lisp_Object, Lisp_Object);
 extern Lisp_Object encode_file_name (Lisp_Object);
 extern Lisp_Object decode_file_name (Lisp_Object);
 extern Lisp_Object raw_text_coding_system (Lisp_Object);