Faster JSON parsing

author Mattias Engdegård <mattiase@acm.org>

Sun, 31 Mar 2024 13:00:00 +0000 (15:00 +0200)

committer Eshel Yaron <me@eshelyaron.com>

Tue, 2 Apr 2024 13:24:56 +0000 (15:24 +0200)
author Mattias Engdegård <mattiase@acm.org>
Sun, 31 Mar 2024 13:00:00 +0000 (15:00 +0200)
committer Eshel Yaron <me@eshelyaron.com>
Tue, 2 Apr 2024 13:24:56 +0000 (15:24 +0200)
diff --git a/etc/NEWS b/etc/NEWS

index 35c8dea8a2e607f8af56455be9c483834df96005..b6e7ec815bbc1b6130904915f7ecff2354dbe3db 100644 (file)
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -1838,6 +1838,11 @@ non-interactively.  This special behavior is removed in this version
  of Emacs, for consistency with the common Emacs behavior where
  minibuffer history is reserved for past minibuffer inputs.
  
+---
+** The JSON parser signals different error types for some inputs.
+It now signals 'json-utf8-decode-error' for inputs that are not
+correctly UTF-8 encoded.
+
  \f
  * Lisp Changes in Emacs 30.1
  
diff --git a/src/json.c b/src/json.c

index 908db022c50b4def13c746061e831a0c4974a3f2..8749009a24b92dcdeeac8a8f5bf7c2c6ee0d2caa 100644 (file)
--- a/src/json.c
+++ b/src/json.c
@@ -699,24 +699,6 @@ usage: (json-insert OBJECT &rest ARGS)  */)
  }
  
  
-/* Note that all callers of make_string_from_utf8 and build_string_from_utf8
-   below either pass only value UTF-8 strings or use the function for
-   formatting error messages; in the latter case correctness isn't
-   critical.  */
-
-/* Return a unibyte string containing the sequence of UTF-8 encoding
-   units of the UTF-8 representation of STRING.  If STRING does not
-   represent a sequence of Unicode scalar values, return a string with
-   unspecified contents.  */
-
-static Lisp_Object
-json_encode (Lisp_Object string)
-{
-  /* FIXME: Raise an error if STRING is not a scalar value
-     sequence.  */
-  return encode_string_utf_8 (string, Qnil, false, Qt, Qt);
-}
-
  #define JSON_PARSER_INTERNAL_OBJECT_WORKSPACE_SIZE 64
  #define JSON_PARSER_INTERNAL_BYTE_WORKSPACE_SIZE 512
  
@@ -1081,52 +1063,21 @@ json_parse_unicode (struct json_parser *parser)
    return v[0] << 12 | v[1] << 8 | v[2] << 4 | v[3];
  }
  
-/* Parses an utf-8 code-point encoding (except the first byte), and
-   returns the numeric value of the code-point (without considering
-   the first byte) */
-static int
-json_handle_utf8_tail_bytes (struct json_parser *parser, int n)
+static AVOID
+utf8_error (struct json_parser *parser)
  {
-  int v = 0;
-  for (int i = 0; i < n; i++)
-    {
-      int c = json_input_get (parser);
-      json_byte_workspace_put (parser, c);
-      if ((c & 0xc0) != 0x80)
-       json_signal_error (parser, Qjson_utf8_decode_error);
-      v = (v << 6) | (c & 0x3f);
-    }
-  return v;
+  json_signal_error (parser, Qjson_utf8_decode_error);
  }
  
-/* Reads a JSON string, and puts the result into the byte workspace */
-static void
-json_parse_string (struct json_parser *parser)
-{
-  /* a single_uninteresting byte can be simply copied from the input
-     to output, it doesn't need any extra care.  This means all the
-     characters between [0x20;0x7f], except the double quote and
-     the backslash */
-  static const char is_single_uninteresting[256] = {
-    /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
-    /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* 1 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* 2 */ 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
-    /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    /* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* a */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* b */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* c */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* d */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* e */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    /* f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  };
-
+/* Parse a string literal.  Prepend a ':' if LEADING_COLON.
+   Return an interned symbol if INTERN, otherwise a string.  */
+static Lisp_Object
+json_parse_string (struct json_parser *parser, bool intern, bool leading_colon)
+{
+  json_byte_workspace_reset (parser);
+  if (leading_colon)
+    json_byte_workspace_put (parser, ':');
+  ptrdiff_t chars_delta = 0;   /* nchars - nbytes */
    for (;;)
      {
        /* This if is only here for a possible speedup.  If there are 4
@@ -1138,10 +1089,10 @@ json_parse_string (struct json_parser *parser)
           int c1 = parser->input_current[1];
           int c2 = parser->input_current[2];
           int c3 = parser->input_current[3];
-         bool v0 = is_single_uninteresting[c0];
-         bool v1 = is_single_uninteresting[c1];
-         bool v2 = is_single_uninteresting[c2];
-         bool v3 = is_single_uninteresting[c3];
+         bool v0 = json_plain_char[c0];
+         bool v1 = json_plain_char[c1];
+         bool v2 = json_plain_char[c2];
+         bool v3 = json_plain_char[c3];
           if (v0 && v1 && v2 && v3)
             {
               json_byte_workspace_put (parser, c0);
@@ -1156,43 +1107,62 @@ json_parse_string (struct json_parser *parser)
  
        int c = json_input_get (parser);
        parser->current_column++;
-      if (is_single_uninteresting[c])
+      if (json_plain_char[c])
         {
           json_byte_workspace_put (parser, c);
           continue;
         }
  
        if (c == '"')
-       return;
-      else if (c & 0x80)
         {
-         /* Handle utf-8 encoding */
+         ptrdiff_t nbytes
+           = parser->byte_workspace_current - parser->byte_workspace;
+         ptrdiff_t nchars = nbytes - chars_delta;
+         const char *str = (const char *)parser->byte_workspace;
+         return intern ? intern_c_multibyte (str, nchars, nbytes)
+                       : make_multibyte_string (str, nchars, nbytes);
+       }
+
+      if (c & 0x80)
+       {
+         /* Parse UTF-8, strictly.  This is the correct thing to do
+            whether the input is a unibyte or a multibyte string.  */
           json_byte_workspace_put (parser, c);
-         if (c < 0xc0)
-           json_signal_error (parser, Qjson_utf8_decode_error);
-         else if (c < 0xe0)
+         unsigned char c1 = json_input_get (parser);
+         if ((c1 & 0xc0) != 0x80)
+           utf8_error (parser);
+         json_byte_workspace_put (parser, c1);
+         if (c <= 0xc1)
+           utf8_error (parser);
+         else if (c <= 0xdf)
+           chars_delta += 1;
+         else if (c <= 0xef)
             {
-             int n = ((c & 0x1f) << 6
-                      | json_handle_utf8_tail_bytes (parser, 1));
-             if (n < 0x80)
-               json_signal_error (parser, Qjson_utf8_decode_error);
-           }
-         else if (c < 0xf0)
-           {
-             int n = ((c & 0xf) << 12
-                      | json_handle_utf8_tail_bytes (parser, 2));
-             if (n < 0x800 || (n >= 0xd800 && n < 0xe000))
-               json_signal_error (parser, Qjson_utf8_decode_error);
+             unsigned char c2 = json_input_get (parser);
+             if ((c2 & 0xc0) != 0x80)
+               utf8_error (parser);
+             int v = ((c & 0x0f) << 12) + ((c1 & 0x3f) << 6) + (c2 & 0x3f);
+             if (v < 0x800 || (v >= 0xd800 && v <= 0xdfff))
+               utf8_error (parser);
+             json_byte_workspace_put (parser, c2);
+             chars_delta += 2;
             }
-         else if (c < 0xf8)
+         else if (c <= 0xf7)
             {
-             int n = ((c & 0x7) << 18
-                      | json_handle_utf8_tail_bytes (parser, 3));
-             if (n < 0x10000 || n > 0x10ffff)
-               json_signal_error (parser, Qjson_utf8_decode_error);
+             unsigned char c2 = json_input_get (parser);
+             unsigned char c3 = json_input_get (parser);
+             if ((c2 & 0xc0) != 0x80 || (c3 & 0xc0) != 0x80)
+               utf8_error (parser);
+             int v = (((c & 0x07) << 18) + ((c1 & 0x3f) << 12)
+                      + ((c2 & 0x3f) << 6) + (c3 & 0x3f));
+             if (v < 0x10000 || v > 0x10ffff)
+               utf8_error (parser);
+             json_byte_workspace_put (parser, c2);
+             json_byte_workspace_put (parser, c3);
+             chars_delta += 3;
             }
           else
-           json_signal_error (parser, Qjson_utf8_decode_error);
+           utf8_error (parser);
         }
        else if (c == '\\')
         {
@@ -1249,6 +1219,7 @@ json_parse_string (struct json_parser *parser)
                   json_byte_workspace_put (parser, 0xc0 | num >> 6);
                   json_byte_workspace_put (parser,
                                            0x80 | (num & 0x3f));
+                 chars_delta += 1;
                 }
               else if (num < 0x10000)
                 {
@@ -1258,6 +1229,7 @@ json_parse_string (struct json_parser *parser)
                                             | ((num >> 6) & 0x3f)));
                   json_byte_workspace_put (parser,
                                            0x80 | (num & 0x3f));
+                 chars_delta += 2;
                 }
               else
                 {
@@ -1270,6 +1242,7 @@ json_parse_string (struct json_parser *parser)
                                             | ((num >> 6) & 0x3f)));
                   json_byte_workspace_put (parser,
                                            0x80 | (num & 0x3f));
+                 chars_delta += 3;
                 }
             }
           else
@@ -1566,16 +1539,11 @@ json_parse_object (struct json_parser *parser)
           if (c != '"')
             json_signal_error (parser, Qjson_parse_error);
  
-         json_byte_workspace_reset (parser);
           switch (parser->conf.object_type)
             {
             case json_object_hashtable:
               {
-               json_parse_string (parser);
-               Lisp_Object key
-                 = make_string_from_utf8 ((char *) parser->byte_workspace,
-                                          (parser->byte_workspace_current
-                                           - parser->byte_workspace));
+               Lisp_Object key = json_parse_string (parser, false, false);
                 Lisp_Object value = json_parse_object_member_value (parser);
                 json_make_object_workspace_for (parser, 2);
                 parser->object_workspace[parser->object_workspace_current] = key;
@@ -1586,13 +1554,7 @@ json_parse_object (struct json_parser *parser)
               }
             case json_object_alist:
               {
-               json_parse_string (parser);
-               char *workspace = (char *) parser->byte_workspace;
-               ptrdiff_t nbytes
-                 = parser->byte_workspace_current - parser->byte_workspace;
-               Lisp_Object key = Fintern (make_string_from_utf8 (workspace,
-                                                                 nbytes),
-                                          Qnil);
+               Lisp_Object key = json_parse_string (parser, true, false);
                 Lisp_Object value = json_parse_object_member_value (parser);
                 Lisp_Object nc = Fcons (Fcons (key, value), Qnil);
                 *cdr = nc;
@@ -1601,11 +1563,7 @@ json_parse_object (struct json_parser *parser)
               }
             case json_object_plist:
               {
-               json_byte_workspace_put (parser, ':');
-               json_parse_string (parser);
-               Lisp_Object key = intern_1 ((char *) parser->byte_workspace,
-                                           (parser->byte_workspace_current
-                                            - parser->byte_workspace));
+               Lisp_Object key = json_parse_string (parser, true, true);
                 Lisp_Object value = json_parse_object_member_value (parser);
                 Lisp_Object nc = Fcons (key, Qnil);
                 *cdr = nc;
@@ -1692,15 +1650,7 @@ json_parse_value (struct json_parser *parser, int c)
    else if (c == '[')
      return json_parse_array (parser);
    else if (c == '"')
-    {
-      json_byte_workspace_reset (parser);
-      json_parse_string (parser);
-      Lisp_Object result
-       = make_string_from_utf8 ((const char *) parser->byte_workspace,
-                                (parser->byte_workspace_current
-                                 - parser->byte_workspace));
-      return result;
-    }
+    return json_parse_string (parser, false, false);
    else if ((c >= '0' && c <= '9') || (c == '-'))
      return json_parse_number (parser, c);
    else
@@ -1816,14 +1766,13 @@ usage: (json-parse-string STRING &rest ARGS) */)
  
    Lisp_Object string = args[0];
    CHECK_STRING (string);
-  Lisp_Object encoded = json_encode (string);
    struct json_configuration conf
      = { json_object_hashtable, json_array_array, QCnull, QCfalse };
    json_parse_args (nargs - 1, args + 1, &conf, true);
  
    struct json_parser p;
-  const unsigned char *begin = (const unsigned char *) SSDATA (encoded);
-  json_parser_init (&p, conf, begin, begin + SBYTES (encoded), NULL, NULL);
+  const unsigned char *begin = SDATA (string);
+  json_parser_init (&p, conf, begin, begin + SBYTES (string), NULL, NULL);
    record_unwind_protect_ptr (json_parser_done, &p);
  
    return unbind_to (count,
diff --git a/src/lisp.h b/src/lisp.h

index 43a29489a2554ca5d0247444874cae57a492dbfd..3cb4361e75e70a2d10c5ed6e43e4ba20d2bd6e51 100644 (file)
--- a/src/lisp.h
+++ b/src/lisp.h
@@ -4744,6 +4744,8 @@ extern ptrdiff_t evxprintf (char **, ptrdiff_t *, char *, ptrdiff_t,
  extern Lisp_Object intern_1 (const char *, ptrdiff_t);
  extern Lisp_Object intern_c_string_1 (const char *, ptrdiff_t);
  extern Lisp_Object intern_driver (Lisp_Object, Lisp_Object, Lisp_Object);
+extern Lisp_Object intern_c_multibyte (const char *str,
+                                      ptrdiff_t nchars, ptrdiff_t nbytes);
  extern void init_symbol (Lisp_Object, Lisp_Object);
  extern Lisp_Object oblookup (Lisp_Object, const char *, ptrdiff_t, ptrdiff_t);
  INLINE void
diff --git a/src/lread.c b/src/lread.c

index 1cb941e84fcb0925594388f2ddcfd745b0e37060..09a5589fd0c1ad6b90cc0a89624295910b2e12e1 100644 (file)
--- a/src/lread.c
+++ b/src/lread.c
@@ -4993,6 +4993,18 @@ intern_c_string_1 (const char *str, ptrdiff_t len)
    return tem;
  }
  
+/* Intern STR of NBYTES bytes and NCHARS characters in the default obarray.  */
+Lisp_Object
+intern_c_multibyte (const char *str, ptrdiff_t nchars, ptrdiff_t nbytes)
+{
+  Lisp_Object obarray = check_obarray (Vobarray);
+  Lisp_Object sym = oblookup (obarray, str, nchars, nbytes);
+  if (BARE_SYMBOL_P (sym))
+    return sym;  /* The lookup produced a symbol; reuse it.  */
+  return intern_driver (make_multibyte_string (str, nchars, nbytes),
+                       obarray, sym);  /* SYM is not a symbol here.  */
+}
+
  static void
  define_symbol (Lisp_Object sym, char const *str)
  {
diff --git a/test/src/json-tests.el b/test/src/json-tests.el

index fb2384d4a8d2e4f168737847794623d7493cb10a..a1bafadaa872f690978a60376b1d7a39a5ea2359 100644 (file)
--- a/test/src/json-tests.el
+++ b/test/src/json-tests.el
@@ -25,6 +25,7 @@
  
  (require 'cl-lib)
  (require 'map)
+(require 'subr-x)
  
  (declare-function json-serialize "json.c" (object &rest args))
  (declare-function json-insert "json.c" (object &rest args))
@@ -155,6 +156,9 @@
    )
  
  (ert-deftest json-parse-string/object ()
+  :expected-result :failed
+  ;; FIXME: This currently fails. Should the parser deduplicate keys?
+  ;; Never, always, or for alist and plist only?
    (let ((input
           "{ \"abc\" : [1, 2, true], \"def\" : null, \"abc\" : [9, false] }\n"))
      (let ((actual (json-parse-string input)))
@@ -167,6 +171,15 @@
      (should (equal (json-parse-string input :object-type 'plist)
                     '(:abc [9 :false] :def :null)))))
  
+(ert-deftest json-parse-string/object-unicode-keys ()
+  (let ((input "{\"é\":1,\"☃\":2,\"𐌐\":3}"))  ; keys take 2, 3 and 4 UTF-8 bytes
+    (let ((actual (json-parse-string input)))  ; default: hash table, string keys
+      (should (equal (sort (hash-table-keys actual)) '("é" "☃" "𐌐"))))
+    (should (equal (json-parse-string input :object-type 'alist)
+                   '((é . 1) (☃ . 2) (𐌐 . 3))))  ; alist keys are symbols
+    (should (equal (json-parse-string input :object-type 'plist)
+                   '(:é 1 :☃ 2 :𐌐 3)))))  ; plist keys are keywords
+
  (ert-deftest json-parse-string/array ()
    (let ((input "[\"a\", 1, [\"b\", 2]]"))
      (should (equal (json-parse-string input)
@@ -182,8 +195,8 @@
                   ["\nasdфывfgh\t"]))
    (should (equal (json-parse-string "[\"\\uD834\\uDD1E\"]") ["\U0001D11E"]))
    (should-error (json-parse-string "foo") :type 'json-parse-error)
-  ;; FIXME: Is this the right behavior?
-  (should (equal (json-parse-string "[\"\u00C4\xC3\x84\"]") ["\u00C4\u00C4"])))
+  (should-error (json-parse-string "[\"\u00C4\xC3\x84\"]")
+                :type 'json-utf8-decode-error))
  
  (ert-deftest json-serialize/string ()
    (should (equal (json-serialize ["foo"]) "[\"foo\"]"))
@@ -201,9 +214,23 @@
    (should-error (json-serialize ["u\xCCv"]) :type 'wrong-type-argument)
    (should-error (json-serialize ["u\u00C4\xCCv"]) :type 'wrong-type-argument))
  
+(ert-deftest json-parse-string/short ()
+  :expected-result :failed  ; known failure; see the BUG note below
+  (should-error (json-parse-string "") :type 'json-end-of-file)
+  (should-error (json-parse-string " ") :type 'json-end-of-file)
+  ;; BUG: currently results in `json-end-of-file' for short non-empty inputs.
+  (dolist (s '("a" "ab" "abc" "abcd"  ; text that starts no JSON value
+               "t" "tr" "tru" "truE" "truee"  ; near-misses of "true"
+               "n" "nu" "nul" "nulL" "nulll"  ; near-misses of "null"
+               "f" "fa" "fal" "fals" "falsE" "falsee"))  ; near-misses of "false"
+    (condition-case err
+        (json-parse-string s)
+      (error
+       (should (eq (car err) 'json-parse-error)))  ; any other error type fails
+      (:success (error "parsing %S should fail" s)))))  ; parsing without error fails too
+
  (ert-deftest json-parse-string/null ()
-  (should-error (json-parse-string "\x00") :type 'wrong-type-argument)
-  (should (json-parse-string "[\"a\\u0000b\"]"))
+  (should (equal (json-parse-string "[\"a\\u0000b\"]") ["a\0b"]))
    (let* ((string "{\"foo\":\"this is a string including a literal \\u0000\"}")
           (data (json-parse-string string)))
      (should (hash-table-p data))
@@ -214,30 +241,34 @@
  https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt.
  Test with both unibyte and multibyte strings."
    ;; Invalid UTF-8 code unit sequences.
-  (should-error (json-parse-string "[\"\x80\"]") :type 'json-parse-error)
-  (should-error (json-parse-string "[\"\u00C4\x80\"]") :type 'json-parse-error)
-  (should-error (json-parse-string "[\"\xBF\"]") :type 'json-parse-error)
-  (should-error (json-parse-string "[\"\u00C4\xBF\"]") :type 'json-parse-error)
-  (should-error (json-parse-string "[\"\xFE\"]") :type 'json-parse-error)
-  (should-error (json-parse-string "[\"\u00C4\xFE\"]") :type 'json-parse-error)
-  (should-error (json-parse-string "[\"\xC0\xAF\"]") :type 'json-parse-error)
+  (should-error (json-parse-string "[\"\x80\"]") :type 'json-utf8-decode-error)
+  (should-error (json-parse-string "[\"\u00C4\x80\"]")
+                :type 'json-utf8-decode-error)
+  (should-error (json-parse-string "[\"\xBF\"]") :type 'json-utf8-decode-error)
+  (should-error (json-parse-string "[\"\u00C4\xBF\"]")
+                :type 'json-utf8-decode-error)
+  (should-error (json-parse-string "[\"\xFE\"]") :type 'json-utf8-decode-error)
+  (should-error (json-parse-string "[\"\u00C4\xFE\"]")
+                :type 'json-utf8-decode-error)
+  (should-error (json-parse-string "[\"\xC0\xAF\"]")
+                :type 'json-utf8-decode-error)
    (should-error (json-parse-string "[\"\u00C4\xC0\xAF\"]")
-                :type 'json-parse-error)
+                :type 'json-utf8-decode-error)
    (should-error (json-parse-string "[\"\u00C4\xC0\x80\"]")
-                :type 'json-parse-error)
+                :type 'json-utf8-decode-error)
    ;; Surrogates.
    (should-error (json-parse-string "[\"\uDB7F\"]")
-                :type 'json-parse-error)
+                :type 'json-utf8-decode-error)
    (should-error (json-parse-string "[\"\xED\xAD\xBF\"]")
-                :type 'json-parse-error)
+                :type 'json-utf8-decode-error)
    (should-error (json-parse-string "[\"\u00C4\xED\xAD\xBF\"]")
-                :type 'json-parse-error)
+                :type 'json-utf8-decode-error)
    (should-error (json-parse-string "[\"\uDB7F\uDFFF\"]")
-                :type 'json-parse-error)
+                :type 'json-utf8-decode-error)
    (should-error (json-parse-string "[\"\xED\xAD\xBF\xED\xBF\xBF\"]")
-                :type 'json-parse-error)
+                :type 'json-utf8-decode-error)
    (should-error (json-parse-string "[\"\u00C4\xED\xAD\xBF\xED\xBF\xBF\"]")
-                :type 'json-parse-error))
+                :type 'json-utf8-decode-error))
  
  (ert-deftest json-parse-string/incomplete ()
    (should-error (json-parse-string "[123") :type 'json-end-of-file))
author	Mattias Engdegård <mattiase@acm.org>
	Sun, 31 Mar 2024 13:00:00 +0000 (15:00 +0200)
committer	Eshel Yaron <me@eshelyaron.com>
	Tue, 2 Apr 2024 13:24:56 +0000 (15:24 +0200)
etc/NEWS		patch \| blob \| history
src/json.c		patch \| blob \| history
src/lisp.h		patch \| blob \| history
src/lread.c		patch \| blob \| history
test/src/json-tests.el		patch \| blob \| history