}
-/* Note that all callers of make_string_from_utf8 and build_string_from_utf8
- below either pass only value UTF-8 strings or use the function for
- formatting error messages; in the latter case correctness isn't
- critical. */
-
-/* Return a unibyte string containing the sequence of UTF-8 encoding
- units of the UTF-8 representation of STRING. If STRING does not
- represent a sequence of Unicode scalar values, return a string with
- unspecified contents. */
-
-static Lisp_Object
-json_encode (Lisp_Object string)
-{
- /* FIXME: Raise an error if STRING is not a scalar value
- sequence. */
- return encode_string_utf_8 (string, Qnil, false, Qt, Qt);
-}
-
#define JSON_PARSER_INTERNAL_OBJECT_WORKSPACE_SIZE 64
#define JSON_PARSER_INTERNAL_BYTE_WORKSPACE_SIZE 512
return v[0] << 12 | v[1] << 8 | v[2] << 4 | v[3];
}
-/* Parses an utf-8 code-point encoding (except the first byte), and
- returns the numeric value of the code-point (without considering
- the first byte) */
-static int
-json_handle_utf8_tail_bytes (struct json_parser *parser, int n)
+/* Report invalid UTF-8 in the input of PARSER by signaling a
+   json-utf8-decode-error through json_signal_error.
+   NOTE(review): AVOID is presumably the project's shorthand for a
+   `void' function that does not return -- confirm against its
+   definition.  */
+static AVOID
+utf8_error (struct json_parser *parser)
{
- int v = 0;
- for (int i = 0; i < n; i++)
- {
- int c = json_input_get (parser);
- json_byte_workspace_put (parser, c);
- if ((c & 0xc0) != 0x80)
- json_signal_error (parser, Qjson_utf8_decode_error);
- v = (v << 6) | (c & 0x3f);
- }
- return v;
+ json_signal_error (parser, Qjson_utf8_decode_error);
}
-/* Reads a JSON string, and puts the result into the byte workspace */
-static void
-json_parse_string (struct json_parser *parser)
-{
- /* a single_uninteresting byte can be simply copied from the input
- to output, it doesn't need any extra care. This means all the
- characters between [0x20;0x7f], except the double quote and
- the backslash */
- static const char is_single_uninteresting[256] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 1 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 2 */ 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
- /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* a */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* b */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* c */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* d */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* e */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- };
-
+/* Parse a string literal into the byte workspace.  If LEADING_COLON,
+   prepend a ':' to the result.  Return an interned symbol if INTERN,
+   otherwise a multibyte string.  */
+static Lisp_Object
+json_parse_string (struct json_parser *parser, bool intern, bool leading_colon)
+{
+ json_byte_workspace_reset (parser);
+ if (leading_colon)
+ json_byte_workspace_put (parser, ':');
+ ptrdiff_t chars_delta = 0; /* nchars - nbytes */
for (;;)
{
/* This if is only here for a possible speedup. If there are 4
int c1 = parser->input_current[1];
int c2 = parser->input_current[2];
int c3 = parser->input_current[3];
- bool v0 = is_single_uninteresting[c0];
- bool v1 = is_single_uninteresting[c1];
- bool v2 = is_single_uninteresting[c2];
- bool v3 = is_single_uninteresting[c3];
+ bool v0 = json_plain_char[c0];
+ bool v1 = json_plain_char[c1];
+ bool v2 = json_plain_char[c2];
+ bool v3 = json_plain_char[c3];
if (v0 && v1 && v2 && v3)
{
json_byte_workspace_put (parser, c0);
int c = json_input_get (parser);
parser->current_column++;
- if (is_single_uninteresting[c])
+ if (json_plain_char[c])
{
json_byte_workspace_put (parser, c);
continue;
}
if (c == '"')
- return;
- else if (c & 0x80)
{
- /* Handle utf-8 encoding */
+ ptrdiff_t nbytes
+ = parser->byte_workspace_current - parser->byte_workspace;
+ ptrdiff_t nchars = nbytes - chars_delta;
+ const char *str = (const char *)parser->byte_workspace;
+ return intern ? intern_c_multibyte (str, nchars, nbytes)
+ : make_multibyte_string (str, nchars, nbytes);
+ }
+
+ if (c & 0x80)
+ {
+ /* Parse UTF-8, strictly. This is the correct thing to do
+ regardless of whether the input is a unibyte or a multibyte string. */
json_byte_workspace_put (parser, c);
- if (c < 0xc0)
- json_signal_error (parser, Qjson_utf8_decode_error);
- else if (c < 0xe0)
+ unsigned char c1 = json_input_get (parser);
+ if ((c1 & 0xc0) != 0x80)
+ utf8_error (parser);
+ json_byte_workspace_put (parser, c1);
+ if (c <= 0xc1)
+ utf8_error (parser);
+ else if (c <= 0xdf)
+ chars_delta += 1;
+ else if (c <= 0xef)
{
- int n = ((c & 0x1f) << 6
- | json_handle_utf8_tail_bytes (parser, 1));
- if (n < 0x80)
- json_signal_error (parser, Qjson_utf8_decode_error);
- }
- else if (c < 0xf0)
- {
- int n = ((c & 0xf) << 12
- | json_handle_utf8_tail_bytes (parser, 2));
- if (n < 0x800 || (n >= 0xd800 && n < 0xe000))
- json_signal_error (parser, Qjson_utf8_decode_error);
+ unsigned char c2 = json_input_get (parser);
+ if ((c2 & 0xc0) != 0x80)
+ utf8_error (parser);
+ int v = ((c & 0x0f) << 12) + ((c1 & 0x3f) << 6) + (c2 & 0x3f);
+ if (v < 0x800 || (v >= 0xd800 && v <= 0xdfff))
+ utf8_error (parser);
+ json_byte_workspace_put (parser, c2);
+ chars_delta += 2;
}
- else if (c < 0xf8)
+ else if (c <= 0xf7)
{
- int n = ((c & 0x7) << 18
- | json_handle_utf8_tail_bytes (parser, 3));
- if (n < 0x10000 || n > 0x10ffff)
- json_signal_error (parser, Qjson_utf8_decode_error);
+ unsigned char c2 = json_input_get (parser);
+ unsigned char c3 = json_input_get (parser);
+ if ((c2 & 0xc0) != 0x80 || (c3 & 0xc0) != 0x80)
+ utf8_error (parser);
+ int v = (((c & 0x07) << 18) + ((c1 & 0x3f) << 12)
+ + ((c2 & 0x3f) << 6) + (c3 & 0x3f));
+ if (v < 0x10000 || v > 0x10ffff)
+ utf8_error (parser);
+ json_byte_workspace_put (parser, c2);
+ json_byte_workspace_put (parser, c3);
+ chars_delta += 3;
}
else
- json_signal_error (parser, Qjson_utf8_decode_error);
+ utf8_error (parser);
}
else if (c == '\\')
{
json_byte_workspace_put (parser, 0xc0 | num >> 6);
json_byte_workspace_put (parser,
0x80 | (num & 0x3f));
+ chars_delta += 1;
}
else if (num < 0x10000)
{
| ((num >> 6) & 0x3f)));
json_byte_workspace_put (parser,
0x80 | (num & 0x3f));
+ chars_delta += 2;
}
else
{
| ((num >> 6) & 0x3f)));
json_byte_workspace_put (parser,
0x80 | (num & 0x3f));
+ chars_delta += 3;
}
}
else
if (c != '"')
json_signal_error (parser, Qjson_parse_error);
- json_byte_workspace_reset (parser);
switch (parser->conf.object_type)
{
case json_object_hashtable:
{
- json_parse_string (parser);
- Lisp_Object key
- = make_string_from_utf8 ((char *) parser->byte_workspace,
- (parser->byte_workspace_current
- - parser->byte_workspace));
+ Lisp_Object key = json_parse_string (parser, false, false);
Lisp_Object value = json_parse_object_member_value (parser);
json_make_object_workspace_for (parser, 2);
parser->object_workspace[parser->object_workspace_current] = key;
}
case json_object_alist:
{
- json_parse_string (parser);
- char *workspace = (char *) parser->byte_workspace;
- ptrdiff_t nbytes
- = parser->byte_workspace_current - parser->byte_workspace;
- Lisp_Object key = Fintern (make_string_from_utf8 (workspace,
- nbytes),
- Qnil);
+ Lisp_Object key = json_parse_string (parser, true, false);
Lisp_Object value = json_parse_object_member_value (parser);
Lisp_Object nc = Fcons (Fcons (key, value), Qnil);
*cdr = nc;
}
case json_object_plist:
{
- json_byte_workspace_put (parser, ':');
- json_parse_string (parser);
- Lisp_Object key = intern_1 ((char *) parser->byte_workspace,
- (parser->byte_workspace_current
- - parser->byte_workspace));
+ Lisp_Object key = json_parse_string (parser, true, true);
Lisp_Object value = json_parse_object_member_value (parser);
Lisp_Object nc = Fcons (key, Qnil);
*cdr = nc;
else if (c == '[')
return json_parse_array (parser);
else if (c == '"')
- {
- json_byte_workspace_reset (parser);
- json_parse_string (parser);
- Lisp_Object result
- = make_string_from_utf8 ((const char *) parser->byte_workspace,
- (parser->byte_workspace_current
- - parser->byte_workspace));
- return result;
- }
+ return json_parse_string (parser, false, false);
else if ((c >= '0' && c <= '9') || (c == '-'))
return json_parse_number (parser, c);
else
Lisp_Object string = args[0];
CHECK_STRING (string);
- Lisp_Object encoded = json_encode (string);
struct json_configuration conf
= { json_object_hashtable, json_array_array, QCnull, QCfalse };
json_parse_args (nargs - 1, args + 1, &conf, true);
struct json_parser p;
- const unsigned char *begin = (const unsigned char *) SSDATA (encoded);
- json_parser_init (&p, conf, begin, begin + SBYTES (encoded), NULL, NULL);
+ const unsigned char *begin = SDATA (string);
+ json_parser_init (&p, conf, begin, begin + SBYTES (string), NULL, NULL);
record_unwind_protect_ptr (json_parser_done, &p);
return unbind_to (count,
(require 'cl-lib)
(require 'map)
+(require 'subr-x)
(declare-function json-serialize "json.c" (object &rest args))
(declare-function json-insert "json.c" (object &rest args))
)
(ert-deftest json-parse-string/object ()
+ :expected-result :failed
+ ;; FIXME: This currently fails. Should the parser deduplicate keys?
+ ;; Never, always, or for alist and plist only?
(let ((input
"{ \"abc\" : [1, 2, true], \"def\" : null, \"abc\" : [9, false] }\n"))
(let ((actual (json-parse-string input)))
(should (equal (json-parse-string input :object-type 'plist)
'(:abc [9 :false] :def :null)))))
+(ert-deftest json-parse-string/object-unicode-keys ()
+ ;; Object keys containing non-ASCII characters must survive parsing
+ ;; in every object representation.  The three keys occupy two, three
+ ;; and four bytes in UTF-8 respectively.
+ (let ((input "{\"é\":1,\"☃\":2,\"𐌐\":3}"))
+ ;; Without :object-type the result is inspected as a hash table
+ ;; whose keys are strings; sort them so the comparison does not
+ ;; depend on hash-table iteration order.
+ (let ((actual (json-parse-string input)))
+ (should (equal (sort (hash-table-keys actual)) '("é" "☃" "𐌐"))))
+ ;; alist: each key becomes a symbol.
+ (should (equal (json-parse-string input :object-type 'alist)
+ '((é . 1) (☃ . 2) (𐌐 . 3))))
+ ;; plist: each key becomes a keyword (the name prefixed with a colon).
+ (should (equal (json-parse-string input :object-type 'plist)
+ '(:é 1 :☃ 2 :𐌐 3)))))
+
(ert-deftest json-parse-string/array ()
(let ((input "[\"a\", 1, [\"b\", 2]]"))
(should (equal (json-parse-string input)
["\nasdфывfgh\t"]))
(should (equal (json-parse-string "[\"\\uD834\\uDD1E\"]") ["\U0001D11E"]))
(should-error (json-parse-string "foo") :type 'json-parse-error)
- ;; FIXME: Is this the right behavior?
- (should (equal (json-parse-string "[\"\u00C4\xC3\x84\"]") ["\u00C4\u00C4"])))
+ (should-error (json-parse-string "[\"\u00C4\xC3\x84\"]")
+ :type 'json-utf8-decode-error))
(ert-deftest json-serialize/string ()
(should (equal (json-serialize ["foo"]) "[\"foo\"]"))
(should-error (json-serialize ["u\xCCv"]) :type 'wrong-type-argument)
(should-error (json-serialize ["u\u00C4\xCCv"]) :type 'wrong-type-argument))
+(ert-deftest json-parse-string/short ()
+ ;; Marked as an expected failure because of the BUG described below.
+ :expected-result :failed
+ ;; Empty and whitespace-only inputs contain no value at all, so
+ ;; end-of-file is the error expected for them.
+ (should-error (json-parse-string "") :type 'json-end-of-file)
+ (should-error (json-parse-string " ") :type 'json-end-of-file)
+ ;; BUG: currently results in `json-end-of-file' for short non-empty inputs.
+ ;; Each input below is garbage or a truncated, miscapitalized or
+ ;; over-long spelling of true/null/false; all of them should be
+ ;; rejected with `json-parse-error'.
+ (dolist (s '("a" "ab" "abc" "abcd"
+ "t" "tr" "tru" "truE" "truee"
+ "n" "nu" "nul" "nulL" "nulll"
+ "f" "fa" "fal" "fals" "falsE" "falsee"))
+ ;; Any error other than `json-parse-error' fails the `should';
+ ;; a normal return is turned into a failure by the :success clause.
+ (condition-case err
+ (json-parse-string s)
+ (error
+ (should (eq (car err) 'json-parse-error)))
+ (:success (error "parsing %S should fail" s)))))
+
(ert-deftest json-parse-string/null ()
- (should-error (json-parse-string "\x00") :type 'wrong-type-argument)
- (should (json-parse-string "[\"a\\u0000b\"]"))
+ (should (equal (json-parse-string "[\"a\\u0000b\"]") ["a\0b"]))
(let* ((string "{\"foo\":\"this is a string including a literal \\u0000\"}")
(data (json-parse-string string)))
(should (hash-table-p data))
https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt.
Test with both unibyte and multibyte strings."
;; Invalid UTF-8 code unit sequences.
- (should-error (json-parse-string "[\"\x80\"]") :type 'json-parse-error)
- (should-error (json-parse-string "[\"\u00C4\x80\"]") :type 'json-parse-error)
- (should-error (json-parse-string "[\"\xBF\"]") :type 'json-parse-error)
- (should-error (json-parse-string "[\"\u00C4\xBF\"]") :type 'json-parse-error)
- (should-error (json-parse-string "[\"\xFE\"]") :type 'json-parse-error)
- (should-error (json-parse-string "[\"\u00C4\xFE\"]") :type 'json-parse-error)
- (should-error (json-parse-string "[\"\xC0\xAF\"]") :type 'json-parse-error)
+ (should-error (json-parse-string "[\"\x80\"]") :type 'json-utf8-decode-error)
+ (should-error (json-parse-string "[\"\u00C4\x80\"]")
+ :type 'json-utf8-decode-error)
+ (should-error (json-parse-string "[\"\xBF\"]") :type 'json-utf8-decode-error)
+ (should-error (json-parse-string "[\"\u00C4\xBF\"]")
+ :type 'json-utf8-decode-error)
+ (should-error (json-parse-string "[\"\xFE\"]") :type 'json-utf8-decode-error)
+ (should-error (json-parse-string "[\"\u00C4\xFE\"]")
+ :type 'json-utf8-decode-error)
+ (should-error (json-parse-string "[\"\xC0\xAF\"]")
+ :type 'json-utf8-decode-error)
(should-error (json-parse-string "[\"\u00C4\xC0\xAF\"]")
- :type 'json-parse-error)
+ :type 'json-utf8-decode-error)
(should-error (json-parse-string "[\"\u00C4\xC0\x80\"]")
- :type 'json-parse-error)
+ :type 'json-utf8-decode-error)
;; Surrogates.
(should-error (json-parse-string "[\"\uDB7F\"]")
- :type 'json-parse-error)
+ :type 'json-utf8-decode-error)
(should-error (json-parse-string "[\"\xED\xAD\xBF\"]")
- :type 'json-parse-error)
+ :type 'json-utf8-decode-error)
(should-error (json-parse-string "[\"\u00C4\xED\xAD\xBF\"]")
- :type 'json-parse-error)
+ :type 'json-utf8-decode-error)
(should-error (json-parse-string "[\"\uDB7F\uDFFF\"]")
- :type 'json-parse-error)
+ :type 'json-utf8-decode-error)
(should-error (json-parse-string "[\"\xED\xAD\xBF\xED\xBF\xBF\"]")
- :type 'json-parse-error)
+ :type 'json-utf8-decode-error)
(should-error (json-parse-string "[\"\u00C4\xED\xAD\xBF\xED\xBF\xBF\"]")
- :type 'json-parse-error))
+ :type 'json-utf8-decode-error))
(ert-deftest json-parse-string/incomplete ()
(should-error (json-parse-string "[123") :type 'json-end-of-file))