From: Eli Zaretskii <eliz@gnu.org>
Date: Sat, 23 Nov 2019 09:27:43 +0000 (+0200)
Subject: Fix and speed up en/decoding of UTF-8 strings
X-Git-Tag: emacs-27.0.90~546
X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=c26556bd18f8ca1e891bd1750c9f95b21ea457b0;p=emacs.git

Fix and speed up en/decoding of UTF-8 strings

* src/coding.c (get_char_bytes, encode_string_utf_8)
(decode_string_utf_8): Fix commentary.
(encode_string_utf_8): Return the original ASCII string only
if NOCOPY is non-zero.
(decode_string_utf_8): Accept 2 additional arguments STR and
STR_LEN, which allow to pass the input text as a C string.
(make_string_from_utf8): Delegate the job to decode_string_utf_8.
* src/coding.h: Update the prototype of decode_string_utf_8.
* src/json.c (json_encode): Call encode_string_utf_8.
---

diff --git a/src/coding.c b/src/coding.c
index 560ec0883ff..5f477cf9473 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -6353,11 +6353,15 @@ utf8_string_p (Lisp_Object string)
 }
 
 /* Like make_string, but always returns a multibyte Lisp string, and
-   avoids decoding if TEXT encoded in UTF-8.  */
-
+   avoids decoding if TEXT is encoded in UTF-8.  */
 Lisp_Object
 make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
 {
+#if 0
+  /* This method is on average 2 times slower than if we use
+     decode_string_utf_8.  However, please leave the slower
+     implementation in the code for now, in case it needs to be reused
+     in some situations.  */
   ptrdiff_t chars, bytes;
   parse_str_as_multibyte ((const unsigned char *) text, nbytes,
 			  &chars, &bytes);
@@ -6374,6 +6378,9 @@ make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
       decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt);
       return coding.dst_object;
     }
+#else
+  return decode_string_utf_8 (Qnil, text, nbytes, Qnil, false, Qt, Qt);
+#endif
 }
 
 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
@@ -9537,7 +9544,7 @@ get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes)
   return BUF_GPT_ADDR (buf);
 }
 
-/* Return a pointer to the byte sequence for C, and set the length in
+/* Return a pointer to the byte sequence for C, and its byte length in
    LEN.  This function is used to get a byte sequence for HANDLE_8_BIT
    and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
    decode_string_utf_8 when those arguments are given by
@@ -9572,11 +9579,16 @@ get_char_bytes (int c, int *len)
 
 /* Encode STRING by the coding system utf-8-unix.
 
+   This function is optimized for speed when the input string is
+   already a valid sequence of Unicode codepoints in the internal
+   representation, i.e. there are neither 8-bit raw bytes nor
+   characters beyond the Unicode range in the string's contents.
+
    Ignore any :pre-write-conversion and :encode-translation-table
-   properties of that coding system.
+   properties.
 
    Assume that arguments have values as described below.
-   The validity must be assured by callers.
+   The validity must be enforced and ensured by the caller.
 
    STRING is a multibyte string or an ASCII-only unibyte string.
 
@@ -9587,17 +9599,24 @@ get_char_bytes (int c, int *len)
    inserted characters.  The caller should have made BUFFER ready for
    modifying in advance (e.g., by calling invalidate_buffer_caches).
 
-   If BUFFER is Qnil, return a unibyte string from the encoded result.
-   If NOCOPY, and if STRING contains only Unicode characters (i.e.,
-   the encoding does not change the byte sequence), return STRING even
-   if it is multibyte.
+   If BUFFER is nil, return a unibyte string from the encoded result.
+
+   If NOCOPY is non-zero, and if STRING contains only Unicode
+   characters (i.e., the encoding does not change the byte sequence),
+   return STRING even if it is multibyte.  WARNING: This will return a
+   _multibyte_ string, something that callers might not expect, especially
+   if STRING is not pure-ASCII; only use NOCOPY non-zero if the caller
+   will only use the byte sequence of the encoded result accessed by
+   SDATA or SSDATA, and the original STRING will _not_ be modified after
+   the encoding.  When in doubt, always pass NOCOPY as zero.  You _have_
+   been warned!
 
    HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode
-   character.  The former is for an eight-bit character (represented
+   character in STRING.  The former is for an eight-bit character (represented
    by a 2-byte overlong sequence in a multibyte STRING).  The latter is
-   for an over-Unicode character (a character whose code is greater
-   than the maximum Unicode character 0x10FFFF, represented by a 4 or
-   5-byte sequence in a multibyte STRING).
+   for a codepoint beyond the end of the Unicode range (a character whose
+   code is greater than the maximum Unicode character 0x10FFFF, represented
+   by a 4 or 5-byte sequence in a multibyte STRING).
 
    If these two arguments are unibyte strings (typically
    "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT
@@ -9605,18 +9624,20 @@ get_char_bytes (int c, int *len)
    unibyte sequence.
 
    If the two arguments are characters, encode a non-Unicode
-   character as if it was the argument.
+   character as the respective argument characters.
 
    If they are Qignored, skip a non-Unicode character.
 
-   If HANDLE-8-BIT is Qt, encode an eight-bit character into one
-   byte of the same value.
+   If HANDLE-8-BIT is Qt, encode eight-bit characters into single bytes
+   of the same value, like the usual Emacs encoding does.
 
-   If HANDLE-OVER-UNI is Qt, encode an over-unicode character
-   into the same 4 or 5-byte sequence.
+   If HANDLE-OVER-UNI is Qt, encode characters beyond the Unicode
+   range into the same 4 or 5-byte sequence as used by Emacs
+   internally, like the usual Emacs encoding does.
 
    If the two arguments are Qnil, return Qnil if STRING has a
-   non-Unicode character.  */
+   non-Unicode character.  This allows the caller to signal an error
+   if such input strings are not allowed.  */
 
 Lisp_Object
 encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
@@ -9624,15 +9645,15 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 		     Lisp_Object handle_over_uni)
 {
   ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
-  if (NILP (buffer) && nchars == nbytes)
-    /* STRING contains only ASCII characters. */
+  if (NILP (buffer) && nchars == nbytes && nocopy)
+    /* STRING contains only ASCII characters.  */
     return string;
 
   ptrdiff_t num_8_bit = 0;   /* number of eight-bit chars in STRING */
   /* The following two vars are counted only if handle_over_uni is not Qt.  */
   ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
   ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
-  ptrdiff_t outbytes;	     /* number of bytes of decoding result. */
+  ptrdiff_t outbytes;	     /* number of bytes of decoding result */
   unsigned char *p = SDATA (string);
   unsigned char *pend = p + nbytes;
   unsigned char *src = NULL, *dst = NULL;
@@ -9668,10 +9689,10 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 	    }
 
 	  /* A character to change the byte sequence on encoding was
-	     found.  A rare case. */
+	     found.  A rare case.  */
 	  if (len == 2)
 	    {
-	      /* Handle an eight-bit character by handle_8_bit. */
+	      /* Handle an eight-bit character by handle_8_bit.  */
 	      if (scan_count == 0)
 		{
 		  if (NILP (handle_8_bit))
@@ -9699,7 +9720,7 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 	    }
 	  else			/* len == 4 or 5 */
 	    {
-	      /* Handle an over-unicode character by handle_over_uni. */
+	      /* Handle an over-unicode character by handle_over_uni.  */
 	      if (scan_count == 0)
 		{
 		  if (NILP (handle_over_uni))
@@ -9729,19 +9750,20 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 
       if (scan_count == 0)
 	{
-	  /* End of the first scane */
+	  /* End of the first scan.  */
 	  outbytes = nbytes;
 	  if (num_8_bit == 0
 	      && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
 	    {
 	      /* We can break the loop because there is no need of
 		 changing the byte sequence.  This is the typical
-		 case. */
+		 case.  */
 	      scan_count = 1;
 	    }
 	  else
 	    {
-	      /* Prepare for the next scan to handle non-Unicode characters. */
+	      /* Prepare for handling non-Unicode characters during
+		 the next scan.  */
 	      if (num_8_bit > 0)
 		{
 		  if (CHARACTERP (handle_8_bit))
@@ -9792,7 +9814,7 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 		}
 	    }
 
-	  /* Prepare a return value and a space to store the encoded bytes. */
+	  /* Prepare return value and space to store the encoded bytes.  */
 	  if (BUFFERP (buffer))
 	    {
 	      val = make_fixnum (outbytes);
@@ -9822,38 +9844,51 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
   return val;
 }
 
-/* Decode STRING by the coding system utf-8-unix.
+/* Decode input string by the coding system utf-8-unix.
 
-   Ignore any :pre-write-conversion and :encode-translation-table
-   properties of that coding system.
+   This function is optimized for speed when the input string is
+   already a valid UTF-8 sequence, i.e. there are neither 8-bit raw
+   bytes nor any UTF-8 sequences longer than 4 bytes in the string's
+   contents.
 
-   Assumes that arguments have values as described below.
-   The validity must be assured by callers.
+   Ignore any :post-read-conversion and :decode-translation-table
+   properties.
 
-   STRING is a unibyte string or an ASCII-only multibyte string.
+   Assume that arguments have values as described below.
+   The validity must be enforced and ensured by the caller.
 
-   BUFFER is a multibyte buffer or Qnil.
+   STRING is a unibyte string, an ASCII-only multibyte string, or Qnil.
+   If STRING is Qnil, the input is a C string pointed by STR whose
+   length in bytes is in STR_LEN.
 
+   BUFFER is a multibyte buffer or Qnil.
    If BUFFER is a multibyte buffer, insert the decoding result of
    Unicode characters after point of the buffer, and return the number
    of inserted characters.  The caller should have made BUFFER ready
    for modifying in advance (e.g., by calling invalidate_buffer_caches).
 
    If BUFFER is Qnil, return a multibyte string from the decoded result.
-   As a special case, return STRING itself in the following cases:
-   1. STRING contains only ASCII characters.
-   2. NOCOPY is true, and STRING contains only valid UTF-8 sequences.
 
-   For maximum speed, always specify NOCOPY true when STRING is
-   guaranteed to contain only valid UTF-8 sequences.
+   NOCOPY non-zero means it is OK to return the input STRING if it
+   contains only ASCII characters or only valid UTF-8 sequences of 2
+   to 4 bytes.  WARNING: This will return a _unibyte_ string, something
+   that callers might not expect, especially if STRING is not
+   pure-ASCII; only use NOCOPY non-zero if the caller will only use
+   the byte sequence of the decoded result accessed via SDATA or
+   SSDATA, and if the original STRING will _not_ be modified after the
+   decoding.  When in dount, always pass NOCOPY as zero.  You _have_
+   been warned!
+
+   If STRING is Qnil, and the original string is passed via STR, NOCOPY
+   is ignored.
 
    HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a invalid
-   byte sequence.  The former is for an 1-byte invalid sequence that
-   violates the fundamental UTF-8 encoding rule.  The latter is for a
-   4 or 5-byte invalid sequence that Emacs internally uses to
-   represent an over-unicode character (a character of code greater
-   than #x10FFFF).  Note that this function does not treat an overlong
-   UTF-8 sequence as invalid.
+   byte sequence.  The former is for a 1-byte invalid sequence that
+   violates the fundamental UTF-8 encoding rules.  The latter is for a
+   4 or 5-byte overlong sequences that Emacs internally uses to
+   represent characters beyond the Unicode range (characters whose
+   codepoints are greater than #x10FFFF).  Note that this function does
+   not in general treat such overlong UTF-8 sequences as invalid.
 
    If these two arguments are strings (typically a 1-char string of
    the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte
@@ -9862,24 +9897,28 @@ encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 
    If the two arguments are characters, decode an invalid byte
    sequence into the corresponding multibyte representation of the
-   characters.
+   respective character.
 
-   If they are Qignored, skip an invalid byte sequence.
+   If they are Qignored, skip an invalid byte sequence without
+   producing anything in the decoded string.
 
-   If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into
-   the corresponding eight-bit character.
+   If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into the
+   corresponding eight-bit multibyte representation, like the usual
+   Emacs decoding does.
 
-   If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte invalid sequence
-   that follows Emacs' representation for an over-unicode character
-   into the corresponding character.
+   If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte overlong sequence
+   that follows Emacs' internal representation for a character beyond
+   Unicode range into the corresponding character, like the usual
+   Emacs decoding does.
 
-   If the two arguments are Qnil, return Qnil if STRING has an invalid
-   sequence.  */
+   If the two arguments are Qnil, return Qnil if the input string has
+   raw bytes or overlong sequences.  This allows the caller to signal
+   an error if such inputs are not allowed.  */
 
 Lisp_Object
-decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
-		     bool nocopy, Lisp_Object handle_8_bit,
-		     Lisp_Object handle_over_uni)
+decode_string_utf_8 (Lisp_Object string, const char *str, ptrdiff_t str_len,
+		     Lisp_Object buffer, bool nocopy,
+		     Lisp_Object handle_8_bit, Lisp_Object handle_over_uni)
 {
   /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
      and it returns 0 for an invalid sequence.  */
@@ -9891,24 +9930,26 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
    : (c) == 0xF8 ? 5			\
    : 0)
 
-  ptrdiff_t nbytes = SBYTES (string);
-  unsigned char *p = SDATA (string), *pend = p + nbytes;
-  ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences. */
-  ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences. */
-  ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences. */
-  ptrdiff_t outbytes = nbytes;	/* number of decoded bytes. */
-  ptrdiff_t outchars = 0;    /* number of decoded characters. */
+  ptrdiff_t nbytes = STRINGP (string) ? SBYTES (string) : str_len;
+  unsigned char *p = STRINGP (string) ? SDATA (string) : (unsigned char *) str;
+  unsigned char *str_orig = p;
+  unsigned char *pend = p + nbytes;
+  ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences */
+  ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences */
+  ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences */
+  ptrdiff_t outbytes = nbytes;	/* number of decoded bytes */
+  ptrdiff_t outchars = 0;    /* number of decoded characters */
   unsigned char *src = NULL, *dst = NULL;
   bool change_byte_sequence = false;
 
-  /* Scan bytes in STRING twice.  The first scan is to count invalid
-     sequences, and the second scan is to decode STRING.  If the
+  /* Scan input bytes twice.  The first scan is to count invalid
+     sequences, and the second scan is to decode input.  If the
      decoding is trivial (no need of changing the byte sequence),
      the second scan is avoided.  */
   while (p < pend)
     {
       src = p;
-      /* Try short cut for an ASCII-only case. */
+      /* Try short cut for an ASCII-only case.  */
       while (p < pend && *p < 0x80) p++;
       outchars += (p - src);
       if (p == pend)
@@ -9916,7 +9957,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
       int c = *p;
       outchars++;
       int len = UTF_8_SEQUENCE_LENGTH (c);
-      /* len == 0, 2, 3, 4, 5 */
+      /* len == 0, 2, 3, 4, 5.  */
       if (UTF_8_EXTRA_OCTET_P (p[1])
 	  && (len == 2
 	      || (UTF_8_EXTRA_OCTET_P (p[2])
@@ -9930,7 +9971,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 	  continue;
 	}
 
-      /* A sequence to change on decoding was found.  A rare case. */
+      /* A sequence to change on decoding was found.  A rare case.  */
       if (len == 0)
 	{
 	  if (NILP (handle_8_bit))
@@ -9951,19 +9992,19 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
       p += len;
     }
 
-  Lisp_Object val;	     /* the return value. */
+  Lisp_Object val;	     /* the return value */
 
   if (! change_byte_sequence
       && NILP (buffer))
     {
-      if (nocopy)
+      if (nocopy && STRINGP (string))
 	return string;
       val = make_uninit_multibyte_string (outchars, outbytes);
-      memcpy (SDATA (val), SDATA (string), pend - SDATA (string));
+      memcpy (SDATA (val), str_orig, pend - str_orig);
       return val;
     }
 
-  /* Count the number of resulting chars and bytes. */
+  /* Count the number of resulting chars and bytes.  */
   unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
   int replace_8_bit_len = 0, replace_over_uni_len = 0;
 
@@ -10022,7 +10063,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 	}
     }
 
-  /* Prepare a return value and a space to store the decoded bytes. */
+  /* Prepare return value and  space to store the decoded bytes.  */
   if (BUFFERP (buffer))
     {
       val = make_fixnum (outchars);
@@ -10030,19 +10071,20 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
     }
   else
     {
-      if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
+      if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0
+	  && STRINGP (string))
 	return string;
       val = make_uninit_multibyte_string (outchars, outbytes);
       dst = SDATA (val);
     }
 
-  src = SDATA (string);
+  src = str_orig;
   if (change_byte_sequence)
     {
       p = src;
       while (p < pend)
 	{
-	  /* Try short cut for an ASCII-only case. */
+	  /* Try short cut for an ASCII-only case.  */
 	  /* while (p < pend && *p < 0x80) p++; */
 	  /* if (p == pend) */
 	  /*   break; */
@@ -10089,7 +10131,7 @@ decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
 	    }
 	  else			/* len == 4 or 5 */
 	    {
-	      /* Handle p[0]... by handle_over_uni */
+	      /* Handle p[0]... by handle_over_uni.  */
 	      if (replace_over_uni)
 		{
 		  memcpy (dst, replace_over_uni, replace_over_uni_len);
diff --git a/src/coding.h b/src/coding.h
index 8efddbf55c4..d552e7b4c8f 100644
--- a/src/coding.h
+++ b/src/coding.h
@@ -691,7 +691,8 @@ extern Lisp_Object code_convert_string_norecord (Lisp_Object, Lisp_Object,
                                                  bool);
 extern Lisp_Object encode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
 					Lisp_Object, Lisp_Object);
-extern Lisp_Object decode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
+extern Lisp_Object decode_string_utf_8 (Lisp_Object, const char *, ptrdiff_t,
+					Lisp_Object, bool,
 					Lisp_Object, Lisp_Object);
 extern Lisp_Object encode_file_name (Lisp_Object);
 extern Lisp_Object decode_file_name (Lisp_Object);
diff --git a/src/json.c b/src/json.c
index 5a3d0012f0a..cd3b9cc0236 100644
--- a/src/json.c
+++ b/src/json.c
@@ -228,7 +228,7 @@ json_encode (Lisp_Object string)
 {
   /* FIXME: Raise an error if STRING is not a scalar value
      sequence.  */
-  return code_convert_string (string, Qutf_8_unix, Qt, true, true, true);
+  return encode_string_utf_8 (string, Qnil, false, Qt, Qt);
 }
 
 static AVOID