Add Unicode-safe UTF-8 converter

author K. Handa <handa@gnu.org>

Sun, 4 Aug 2019 12:14:26 +0000 (21:14 +0900)

committer K. Handa <handa@gnu.org>

Sun, 4 Aug 2019 12:14:26 +0000 (21:14 +0900)
author K. Handa <handa@gnu.org>
Sun, 4 Aug 2019 12:14:26 +0000 (21:14 +0900)
committer K. Handa <handa@gnu.org>
Sun, 4 Aug 2019 12:14:26 +0000 (21:14 +0900)
diff --git a/src/coding.c b/src/coding.c

index 189a4b39d152a7e62c32bac5c38608a0c19b7e60..ab0e15119f3d965b93918679d7239b0072385c5b 100644 (file)
--- a/src/coding.c
+++ b/src/coding.c
@@ -9515,6 +9515,732 @@ code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
    return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
  }
  
+
+/* Return the gap address of BUFFER.  If the gap size is less than
+   NBYTES, enlarge the gap in advance.  */
+
+static unsigned char *
+get_buffer_gap_address (Lisp_Object buffer, int nbytes)
+{
+  struct buffer *buf = XBUFFER (buffer);
+
+  if (BUF_GPT (buf) != BUF_PT (buf))
+    {
+      struct buffer *oldb = current_buffer;
+
+      current_buffer = buf;
+      move_gap_both (PT, PT_BYTE);
+      current_buffer = oldb;
+    }
+  if (BUF_GAP_SIZE (buf) < nbytes)
+    make_gap_1 (buf, nbytes);
+  return BUF_GPT_ADDR (buf);
+}
+
+/* Return a pointer to the byte sequence for C, and set the length in
+   LEN.  This function is used to get a byte sequence for HANDLE_8_BIT
+   and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
+   decode_string_utf_8 when those arguments are given by
+   characters.  */
+
+static unsigned char *
+get_char_bytes (int c, int *len)
+{
+  /* We uses two chaches considering the situation that
+     encode/decode_string_utf_8 are called repeatedly with the same
+     values for HANDLE_8_BIT and HANDLE_OVER_UNI arguments.  */
+  static int chars[2];
+  static unsigned char bytes[2][6];
+  static int nbytes[2];
+  static int last_index;
+
+  if (chars[last_index] == c)
+    {
+      *len = nbytes[last_index];
+      return bytes[last_index];
+    }
+  if (chars[1 - last_index] == c)
+    {
+      *len = nbytes[1 - last_index];
+      return bytes[1 - last_index];
+    }
+  last_index = 1 - last_index;
+  chars[last_index] = c;
+  *len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]);
+  return bytes[last_index];
+}
+
+/* Encode STRING by the coding system utf-8-unix.
+
+   Even if :pre-write-conversion and :encode-translation-table
+   properties are put to that coding system, they are ignored.
+
+   It ignores :pre-write-conversion and :encode-translation-table
+   propeties of that coding system.
+
+   This function assumes that arguments have values as described
+   below.  The validity must be assured by callers.
+
+   STRING is a multibyte string or an ASCII-only unibyte string.
+
+   BUFFER is a unibyte buffer or Qnil.
+
+   If BUFFER is a unibyte buffer, the encoding result of UTF-8
+   sequence is inserted after point of the buffer, and the number of
+   inserted characters is returned.  Note that a caller should have
+   made BUFFER ready for modifying in advance (e.g. by calling
+   invalidate_buffer_caches).
+
+   If BUFFER is Qnil, a unibyte string is made from the encodnig
+   result of UTF-8 sequence, and it is returned.  If NOCOPY and STRING
+   contains only Unicode characters (i.e. the encoding does not change
+   the byte sequence), STRING is returned even if it is multibyte.
+
+   HANDLE-8-BIT and HANDE-OVER-UNI specify how to handle a non-Unicode
+   character.  The former is for an eight-bit character (represented
+   by 2-byte overlong sequence in multibyte STRING).  The latter is
+   for an over-unicode character (a character whose code is greater
+   than the maximum Unicode character 0x10FFFF, and is represented by
+   4 or 5-byte sequence in multibyte STRING).
+
+   If they are unibyte strings (typically "\357\277\275"; UTF-8
+   sequence for the Unicode REPLACEMENT CHARACTER #xFFFD), a
+   non-Unicode character is encoded into that sequence.
+
+   If they are characters, a non-Unicode chracters is encoded into the
+   corresponding UTF-8 sequences.
+
+   If they are Qignored, a non-Unicode character is skipped on
+   encoding.
+
+   If HANDLE-8-BIT is Qt, an eight-bit character is encoded into one
+   byte of the same value.
+
+   If HANDLE-OVER-UNI is Qt, an over-unicode character is encoded
+   into the the same 4 or 5-byte sequence.
+
+   If they are Qnil, Qnil is returned if STRING has a non-Unicode
+   character. */
+
+Lisp_Object
+encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
+                    bool nocopy, Lisp_Object handle_8_bit,
+                    Lisp_Object handle_over_uni)
+{
+  ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
+  if (NILP (buffer) && nchars == nbytes)
+    /* STRING contains only ASCII characters. */
+    return string;
+
+  ptrdiff_t num_8_bit = 0;   /* number of eight-bit chars in STRING */
+  /* The following two vars are counted only if handle_over_uni is not Qt */
+  ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
+  ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
+  ptrdiff_t outbytes;       /* number of bytes of decoding result. */
+  unsigned char *p = SDATA (string);
+  unsigned char *pend = p + nbytes;
+  unsigned char *src = NULL, *dst = NULL;
+  unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
+  int replace_8_bit_len = 0, replace_over_uni_len = 0;
+  Lisp_Object val;             /* the return value */
+
+  /* Scan bytes in STRING twice.  The first scan is to count non-Unicode
+     characters, and the second scan is to encode STRING.  If the
+     encoding is trivial (no need of changing the byte sequence),
+     the second scan is avoided.  */
+  for (int scan_count = 0; scan_count < 2; scan_count++)
+    {
+      while (p < pend)
+       {
+         if (nchars == pend - p)
+           /* There is no multibyte character remaining.  */
+           break;
+
+         int c = *p;
+         int len = BYTES_BY_CHAR_HEAD (c);
+
+         nchars--;
+         if (len == 1
+             || len == 3
+             || (len == 2 ? ! CHAR_BYTE8_HEAD_P (c)
+                 : (EQ (handle_over_uni, Qt)
+                    || (len == 4
+                        && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR))))
+           {
+             p += len;
+             continue;
+           }
+
+         /* A character to change the byte sequence on encoding was
+            found.  A rare case. */
+         if (len == 2)
+           {
+             /* Handle an eight-bit character by handle_8_bit. */
+             if (scan_count == 0)
+               {
+                 if (NILP (handle_8_bit))
+                   return Qnil;
+                 num_8_bit++;
+               }
+             else
+               {
+                 if (src < p)
+                   {
+                     memcpy (dst, src, p - src);
+                     dst += p - src;
+                   }
+                 if (replace_8_bit_len > 0)
+                   {
+                     memcpy (dst, replace_8_bit, replace_8_bit_len);
+                     dst += replace_8_bit_len;
+                   }
+                 else if (EQ (handle_8_bit, Qt))
+                   {
+                     int char8 = STRING_CHAR (p);
+                     *dst++ = CHAR_TO_BYTE8 (char8);
+                   }
+               }
+           }
+         else                  /* len == 4 or 5 */
+           {
+             /* Handle an over-unicode character by handle_over_uni. */
+             if (scan_count == 0)
+               {
+                 if (NILP (handle_over_uni))
+                   return Qnil;
+                 if (len == 4)
+                   num_over_4++;
+                 else
+                   num_over_5++;
+               }
+             else
+               {
+                 if (src < p)
+                   {
+                     memcpy (dst, src, p - src);
+                     dst += p - src;
+                   }
+                 if (replace_over_uni_len > 0)
+                   {
+                     memcpy (dst, replace_over_uni, replace_over_uni_len);
+                     dst += replace_over_uni_len;
+                   }
+               }
+           }
+         p += len;
+         src = p;
+       }
+
+      if (scan_count == 0)
+       {
+         /* End of the first scane */
+         outbytes = nbytes;
+         if (num_8_bit == 0
+             && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
+           {
+             /* We can break the loop because there is no need of
+                changing the byte sequence.  This is the typical
+                case. */
+             scan_count = 1;
+           }
+         else
+           {
+             /* Prepare for the next scan to handle non-Unicode characters. */
+             if (num_8_bit > 0)
+               {
+                 if (CHARACTERP (handle_8_bit))
+                   replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
+                                                   &replace_8_bit_len);
+                 else if (STRINGP (handle_8_bit))
+                   {
+                     replace_8_bit = SDATA (handle_8_bit);
+                     replace_8_bit_len = SBYTES (handle_8_bit);
+                   }
+                 if (replace_8_bit)
+                   outbytes += (replace_8_bit_len - 2) * num_8_bit;
+                 else if (EQ (handle_8_bit, Qignored))
+                   outbytes -= 2 * num_8_bit;
+                 else if (EQ (handle_8_bit, Qt))
+                   outbytes -= num_8_bit;
+                 else
+                   return Qnil;
+               }
+             if (num_over_4 + num_over_5 > 0)
+               {
+                 if (CHARACTERP (handle_over_uni))
+                   replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
+                                                      &replace_over_uni_len);
+                 else if (STRINGP (handle_over_uni))
+                   {
+                     replace_over_uni = SDATA (handle_over_uni);
+                     replace_over_uni_len = SBYTES (handle_over_uni);
+                   }
+                 if (num_over_4 > 0)
+                   {
+                     if (replace_over_uni)
+                       outbytes += (replace_over_uni_len - 4) * num_over_4;
+                     else if (EQ (handle_over_uni, Qignored))
+                       outbytes -= 4 * num_over_4;
+                     else if (! EQ (handle_over_uni, Qt))
+                       return Qnil;
+                   }
+                 if (num_over_5 > 0)
+                   {
+                     if (replace_over_uni)
+                       outbytes += (replace_over_uni_len - 5) * num_over_5;
+                     else if (EQ (handle_over_uni, Qignored))
+                       outbytes -= 5 * num_over_5;
+                     else if (! EQ (handle_over_uni, Qt))
+                       return Qnil;
+                   }
+               }
+           }
+
+         /* Prepare a return value and a space to store the encoded bytes. */
+         if (BUFFERP (buffer))
+           {
+             val = make_fixnum (outbytes);
+             dst = get_buffer_gap_address (buffer, nbytes);
+           }
+         else
+           {
+             if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
+               return string;
+             val = make_uninit_string (outbytes);
+             dst = SDATA (val);
+           }
+         p = src = SDATA (string);
+       }
+    }
+
+  if (src < pend)
+    memcpy (dst, src, pend - src);
+  if (BUFFERP (buffer))
+    {
+      struct buffer *oldb = current_buffer;
+
+      current_buffer = XBUFFER (buffer);
+      insert_from_gap (outbytes, outbytes, false);
+      current_buffer = oldb;
+    }
+  return val;
+}
+
+/* Decode STRING by the coding system utf-8-unix.
+
+   Even if :post-read-conversion and :decode-translation-table
+   properties are put to that coding system, they are ignored.
+
+   This function assumes that arguments have values as described
+   below.  The validity must be assured by callers.
+
+   STRING is a unibyte string or an ASCII-only multibyte string.
+
+   BUFFER is a multibyte buffer or Qnil.
+
+   If BUFFER is a multibyte buffer, the decoding result of Unicode
+   characters are inserted after point of the buffer, and the number
+   of inserted characters is returned.  Note that a caller should have
+   made BUFFER ready for modifying in advance (e.g. by calling
+   invalidate_buffer_caches).
+
+   If BUFFER is Qnil, a multibyte string is made from the decoding
+   result of Unicode characters, and it is returned.  As a special
+   case, STRING itself is returned in the following cases:
+   1. STRING contains only ASCII characters.
+   2. NOCOPY, and STRING contains only valid UTF-8 sequences.
+
+   HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a invalid
+   byte sequence.  The former is for an 1-byte invalid sequence that
+   violates the fundamental UTF-8 encoding rule.  The latter is for a
+   4 or 5-byte invalid sequence that Emacs internally uses to
+   represent an over-unicode character (a character of code greater
+   than #x10FFFF).  Note that this function does not treat an overlong
+   UTF-8 sequence as invalid.
+
+   If they are strings (typically 1-char string of the Unicode
+   REPLACEMENT CHARACTER #xFFFD), an invalid sequence is decoded into
+   that string.  They must be multibyte strings if they contain a
+   non-ASCII character.
+
+   If they are characters, an invalid sequence is decoded into the
+   corresponding multibyte representation of the characters.
+
+   If they are Qignored, an invalid sequence is skipped on decoding.
+
+   If HANDLE-8-BIT is Qt, an 1-byte invalid sequence is deoded into
+   the corresponding eight-bit character.
+
+   If HANDLE-OVER-UNI is Qt, a 4 or 5-byte invalid sequence that
+   follows Emacs' representation for an over-unicode character is
+   decoded into the corresponding character.
+
+   If they are Qnil, Qnil is returned if STRING has an invalid sequence.  */
+
+Lisp_Object
+decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
+                    bool nocopy, Lisp_Object handle_8_bit,
+                    Lisp_Object handle_over_uni)
+{
+  /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
+     and it returns 0 for invalid sequence.  */
+#define UTF_8_SEQUENCE_LENGTH(c)       \
+  ((c) < 0xC2 ? 0                      \
+   : (c) < 0xE0 ? 2                    \
+   : (c) < 0xF0 ? 3                    \
+   : (c) < 0xF8 ? 4                    \
+   : (c) == 0xF8 ? 5                   \
+   : 0)
+
+  ptrdiff_t nbytes = SBYTES (string);
+  unsigned char *p = SDATA (string), *pend = p + nbytes;
+  ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences. */
+  ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences. */
+  ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences. */
+  ptrdiff_t outbytes = nbytes; /* number of decoded bytes. */
+  ptrdiff_t outchars = 0;    /* number of decoded characters. */
+  unsigned char *src = NULL, *dst = NULL;
+  bool change_byte_sequence = false;
+
+  /* Scan bytes in STRING twice.  The first scan is to count invalid
+     sequences, and the second scan is to decode STRING.  If the
+     decoding is trivial (no need of changing the byte sequence),
+     the second scan is avoided.  */
+  while (p < pend)
+    {
+      src = p;
+      /* Try short cut for an ASCII-only case. */
+      while (p < pend && *p < 0x80) p++;
+      outchars += (p - src);
+      if (p == pend)
+       break;
+      int c = *p;
+      outchars++;
+      int len = UTF_8_SEQUENCE_LENGTH (c);
+      /* len == 0, 2, 3, 4, 5 */
+      if (UTF_8_EXTRA_OCTET_P (p[1])
+         && (len == 2
+             || (UTF_8_EXTRA_OCTET_P (p[2])
+                 && (len == 3
+                     || (UTF_8_EXTRA_OCTET_P (p[3])
+                         && len == 4
+                         && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)))))
+       {
+         p += len;
+         continue;
+       }
+
+      /* A sequence to change on decoding was found.  A rare case. */
+      if (len == 0)
+       {
+         if (NILP (handle_8_bit))
+           return Qnil;
+         num_8_bit++;
+         len = 1;
+       }
+      else                     /* len == 4 or 5 */
+       {
+         if (NILP (handle_over_uni))
+           return Qnil;
+         if (len == 4)
+           num_over_4++;
+         else
+           num_over_5++;
+       }
+      change_byte_sequence = true;
+      p += len;
+    }
+
+  Lisp_Object val;          /* the return value. */
+
+  if (! change_byte_sequence
+      && NILP (buffer))
+    {
+      if (nocopy)
+       return string;
+      val = make_uninit_multibyte_string (outchars, outbytes);
+      memcpy (SDATA (val), SDATA (string), pend - SDATA (string));
+      return val;
+    }
+
+  /* Count the number of resulting chars and bytes. */
+  unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
+  int replace_8_bit_len = 0, replace_over_uni_len = 0;
+
+  if (change_byte_sequence)
+    {
+      if (num_8_bit > 0)
+       {
+         if (CHARACTERP (handle_8_bit))
+           replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
+                                           &replace_8_bit_len);
+         else if (STRINGP (handle_8_bit))
+           {
+             replace_8_bit = SDATA (handle_8_bit);
+             replace_8_bit_len = SBYTES (handle_8_bit);
+           }
+         if (replace_8_bit)
+           outbytes += (replace_8_bit_len - 1) * num_8_bit;
+         else if (EQ (handle_8_bit, Qignored))
+           {
+             outbytes -= num_8_bit;
+             outchars -= num_8_bit;
+           }
+         else /* EQ (handle_8_bit, Qt)) */
+           outbytes += num_8_bit;
+       }
+      else if (num_over_4 + num_over_5 > 0)
+       {
+         if (CHARACTERP (handle_over_uni))
+           replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
+                                              &replace_over_uni_len);
+         else if (STRINGP (handle_over_uni))
+           {
+             replace_over_uni = SDATA (handle_over_uni);
+             replace_over_uni_len = SBYTES (handle_over_uni);
+           }
+         if (num_over_4 > 0)
+           {
+             if (replace_over_uni)
+               outbytes += (replace_over_uni_len - 4) * num_over_4;
+             else if (EQ (handle_over_uni, Qignored))
+               {
+                 outbytes -= 4 * num_over_4;
+                 outchars -= num_over_4;
+               }
+           }
+         if (num_over_5 > 0)
+           {
+             if (replace_over_uni)
+               outbytes += (replace_over_uni_len - 5) * num_over_5;
+             else if (EQ (handle_over_uni, Qignored))
+               {
+                 outbytes -= 5 * num_over_5;
+                 outchars -= num_over_5;
+               }
+           }
+       }
+    }
+
+  /* Prepare a return value and a space to store the decoded bytes. */
+  if (BUFFERP (buffer))
+    {
+      val = make_fixnum (outchars);
+      dst = get_buffer_gap_address (buffer, outbytes);
+    }
+  else
+    {
+      if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
+       return string;
+      val = make_uninit_multibyte_string (outchars, outbytes);
+      dst = SDATA (val);
+    }
+
+  src = SDATA (string);
+  if (change_byte_sequence)
+    {
+      p = src;
+      while (p < pend)
+       {
+         /* Try short cut for an ASCII-only case. */
+         /* while (p < pend && *p < 0x80) p++; */
+         /* if (p == pend) */
+         /*   break; */
+         int c = *p;
+         if (c < 0x80)
+           {
+             p++;
+             continue;
+           }
+         int len = UTF_8_SEQUENCE_LENGTH (c);
+         if (len > 1)
+           {
+             int mlen;
+             for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]);
+                  mlen++);
+             if (mlen == len
+                 && (len <= 3
+                     || (len == 4
+                         && string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)
+                     || EQ (handle_over_uni, Qt)))
+               {
+                 p += len;
+                 continue;
+               }
+           }
+
+         if (src < p)
+           {
+             memcpy (dst, src, p - src);
+             dst += p - src;
+           }
+         if (len == 0)
+           {
+             if (replace_8_bit)
+               {
+                 memcpy (dst, replace_8_bit, replace_8_bit_len);
+                 dst += replace_8_bit_len;
+               }
+             else if (EQ (handle_8_bit, Qt))
+               {
+                 dst += BYTE8_STRING (c, dst);
+               }
+             len = 1;
+           }
+         else                  /* len == 4 or 5 */
+           {
+             /* Handle p[0]... by handle_over_uni */
+             if (replace_over_uni)
+               {
+                 memcpy (dst, replace_over_uni, replace_over_uni_len);
+                 dst += replace_over_uni_len;
+               }
+           }
+         p += len;
+         src = p;
+       }
+    }
+
+  if (src < pend)
+    memcpy (dst, src, pend - src);
+  if (BUFFERP (buffer))
+    {
+      struct buffer *oldb = current_buffer;
+
+      current_buffer = XBUFFER (buffer);
+      insert_from_gap (outchars, outbytes, false);
+      current_buffer = oldb;
+    }
+  return val;
+}
+
+/* #define ENABLE_UTF_8_CONVERTER_TEST */
+
+#ifdef ENABLE_UTF_8_CONVERTER_TEST
+
+/* These functions are useful for testing and benchmarking
+   encode_string_utf_8 and decode_string_utf_8.  */
+
+/* ENCODE_METHOD specifies which internal decoder to use.
+   If it is Qnil, use encode_string_utf_8.
+   Otherwise, use code_convert_string.
+
+   COUNT, if integer, specifies how many times to call those functions
+   with the same arguments (for benchmarking). */
+
+DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8,
+       Sinternal_encode_string_utf_8, 7, 7, 0,
+       doc: /* Internal use only.*/)
+  (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
+   Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
+   Lisp_Object encode_method, Lisp_Object count)
+{
+  int repeat_count;
+  Lisp_Object val;
+
+  /* Check arguments.  Return Qnil when an argmement is invalid.  */
+  if (! STRINGP (string))
+    return Qnil;
+  if (! NILP (buffer)
+      && (! BUFFERP (buffer)
+         || ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
+    return Qnil;
+  if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
+      && ! EQ (handle_8_bit, Qignored)
+      && ! CHARACTERP (handle_8_bit)
+      && (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit)))
+    return Qnil;
+  if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
+      && ! EQ (handle_over_uni, Qignored)
+      && ! CHARACTERP (handle_over_uni)
+      && (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni)))
+    return Qnil;
+
+  CHECK_FIXNUM (count);
+  repeat_count = XFIXNUM (count);
+
+  val = Qnil;
+  /* Run an encoder according to ENCODE_METHOD.  */
+  if (NILP (encode_method))
+    {
+      for (int i = 0; i < repeat_count; i++)
+       val = encode_string_utf_8 (string, buffer, ! NILP (nocopy),
+                                  handle_8_bit, handle_over_uni);
+    }
+  else
+    {
+      for (int i = 0; i < repeat_count; i++)
+       val = code_convert_string (string, Qutf_8_unix, Qnil, true,
+                                  ! NILP (nocopy), true);
+    }
+  return val;
+}
+
+/* DECODE_METHOD specifies which internal decoder to use.
+   If it is Qnil, use decode_string_utf_8.
+   If it is Qt, use code_convert_string.
+   Otherwise, use make_string_from_utf8.
+
+   COUNT, if integer, specifies how many times to call those functions
+   with the same arguments (for benchmarking).  */
+
+DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8,
+       Sinternal_decode_string_utf_8, 7, 7, 0,
+       doc: /* Internal use only.*/)
+  (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
+   Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
+   Lisp_Object decode_method, Lisp_Object count)
+{
+  int repeat_count;
+  Lisp_Object val;
+
+  /* Check arguments.  Return Qnil when an argmement is invalid.  */
+  if (! STRINGP (string))
+    return Qnil;
+  if (! NILP (buffer)
+      && (! BUFFERP (buffer)
+         || NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
+    return Qnil;
+  if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
+      && ! EQ (handle_8_bit, Qignored)
+      && ! CHARACTERP (handle_8_bit)
+      && (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit)))
+    return Qnil;
+  if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
+      && ! EQ (handle_over_uni, Qignored)
+      && ! CHARACTERP (handle_over_uni)
+      && (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni)))
+    return Qnil;
+
+  CHECK_FIXNUM (count);
+  repeat_count = XFIXNUM (count);
+
+  val = Qnil;
+  /* Run a decoder according to DECODE_METHOD.  */
+  if (NILP (decode_method))
+    {
+      for (int i = 0; i < repeat_count; i++)
+       val = decode_string_utf_8 (string, buffer, ! NILP (nocopy),
+                                  handle_8_bit, handle_over_uni);
+    }
+  else if (EQ (decode_method, Qt))
+    {
+      if (! BUFFERP (buffer))
+       buffer = Qt;
+      for (int i = 0; i < repeat_count; i++)
+       val = code_convert_string (string, Qutf_8_unix, buffer, false,
+                                  ! NILP (nocopy), true);
+    }
+  else if (! NILP (decode_method))
+    {
+      for (int i = 0; i < repeat_count; i++)
+       val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string));
+    }
+  return val;
+}
+
+#endif /* ENABLE_UTF_8_CONVERTER_TEST */
+
  /* Encode or decode a file name, to or from a unibyte string suitable
     for passing to C library functions.  */
  Lisp_Object
@@ -10974,6 +11700,10 @@ syms_of_coding (void)
    defsubr (&Sencode_coding_region);
    defsubr (&Sdecode_coding_string);
    defsubr (&Sencode_coding_string);
+#ifdef ENABLE_UTF_8_CONVERTER_TEST
+  defsubr (&Sinternal_encode_string_utf_8);
+  defsubr (&Sinternal_decode_string_utf_8);
+#endif /* ENABLE_UTF_8_CONVERTER_TEST */
    defsubr (&Sdecode_sjis_char);
    defsubr (&Sencode_sjis_char);
    defsubr (&Sdecode_big5_char);
diff --git a/src/coding.h b/src/coding.h

index 70690d42d30229323786b07eea9756be85e1a868..8efddbf55c40a907477ba036bcb19e94bac79652 100644 (file)
--- a/src/coding.h
+++ b/src/coding.h
@@ -689,6 +689,10 @@ extern Lisp_Object code_convert_string (Lisp_Object, Lisp_Object,
                                          Lisp_Object, bool, bool, bool);
  extern Lisp_Object code_convert_string_norecord (Lisp_Object, Lisp_Object,
                                                   bool);
+extern Lisp_Object encode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
+                                       Lisp_Object, Lisp_Object);
+extern Lisp_Object decode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
+                                       Lisp_Object, Lisp_Object);
  extern Lisp_Object encode_file_name (Lisp_Object);
  extern Lisp_Object decode_file_name (Lisp_Object);
  extern Lisp_Object raw_text_coding_system (Lisp_Object);
author	K. Handa <handa@gnu.org>
	Sun, 4 Aug 2019 12:14:26 +0000 (21:14 +0900)
committer	K. Handa <handa@gnu.org>
	Sun, 4 Aug 2019 12:14:26 +0000 (21:14 +0900)
src/coding.c		patch \| blob \| history
src/coding.h		patch \| blob \| history