Improve error reporting when serializing non-Unicode strings to JSON

author Philipp Stephani <phst@google.com>

Sat, 23 Dec 2017 16:56:36 +0000 (17:56 +0100)

committer Philipp Stephani <phst@google.com>

Sat, 30 Dec 2017 22:16:51 +0000 (23:16 +0100)
author Philipp Stephani <phst@google.com>
Sat, 23 Dec 2017 16:56:36 +0000 (17:56 +0100)
committer Philipp Stephani <phst@google.com>
Sat, 30 Dec 2017 22:16:51 +0000 (23:16 +0100)
diff --git a/src/coding.c b/src/coding.c

index 1705838ffad07e747e1f3573df581b42e8034b51..5ea1e395f2070d922c4f5f9e78e15e5051b5c8bf 100644 (file)
--- a/src/coding.c
+++ b/src/coding.c
@@ -6360,6 +6360,27 @@ check_utf_8 (struct coding_system *coding)
  }
  
  
+/* Return true if check_utf_8 accepts the bytes of STRING, i.e. its
+   result is anything other than -1.  STRING must be unibyte.  */
+
+bool
+utf8_string_p (Lisp_Object string)
+{
+  eassert (!STRING_MULTIBYTE (string));
+  struct coding_system coding;
+  setup_coding_system (Qutf_8_unix, &coding);
+  /* We initialize only the fields that check_utf_8 accesses.  */
+  coding.head_ascii = -1;
+  coding.src_pos = 0;
+  coding.src_pos_byte = 0;
+  coding.src_chars = SCHARS (string);
+  coding.src_bytes = SBYTES (string);
+  coding.src_object = string;
+  coding.eol_seen = EOL_SEEN_NONE;
+  return check_utf_8 (&coding) != -1;
+}
+
+
  /* Detect how end-of-line of a text of length SRC_BYTES pointed by
     SOURCE is encoded.  If CATEGORY is one of
     coding_category_utf_16_XXXX, assume that CR and LF are encoded by
@@ -10846,6 +10867,7 @@ syms_of_coding (void)
    DEFSYM (Qiso_2022, "iso-2022");
  
    DEFSYM (Qutf_8, "utf-8");
+  DEFSYM (Qutf_8_unix, "utf-8-unix");
    DEFSYM (Qutf_8_emacs, "utf-8-emacs");
  
  #if defined (WINDOWSNT) || defined (CYGWIN)
diff --git a/src/coding.h b/src/coding.h

index 66d125b07e65c9e2d65ae75b4f91d437262f2366..bc4ef52e1ed59a141a4fa4c3bf76b4a628229cd3 100644 (file)
--- a/src/coding.h
+++ b/src/coding.h
@@ -665,6 +665,7 @@ struct coding_system
  /* Extern declarations.  */
  extern Lisp_Object code_conversion_save (bool, bool);
  extern bool encode_coding_utf_8 (struct coding_system *);
+extern bool utf8_string_p (Lisp_Object);
  extern void setup_coding_system (Lisp_Object, struct coding_system *);
  extern Lisp_Object coding_charset_list (struct coding_system *);
  extern Lisp_Object coding_system_charset_list (Lisp_Object);
diff --git a/src/json.c b/src/json.c

index 88db86ad2e302256a7e5d9146f849c8abb40c6b3..93dcc730dae232c1b04fe93886c305be58b71790 100644 (file)
--- a/src/json.c
+++ b/src/json.c
@@ -316,6 +316,15 @@ json_check (json_t *object)
    return object;
  }
  
+/* Signal `wrong-type-argument' unless utf8_string_p accepts STRING,
+   passing `utf-8-string-p' as the predicate.  STRING must be unibyte.  */
+
+static void
+json_check_utf8 (Lisp_Object string)
+{
+  CHECK_TYPE (utf8_string_p (string), Qutf_8_string_p, string);
+}
+
  static json_t *lisp_to_json (Lisp_Object);
  
  /* Convert a Lisp object to a toplevel JSON object (array or object).
@@ -363,9 +372,12 @@ lisp_to_json_toplevel_1 (Lisp_Object lisp, json_t **json)
              int status = json_object_set_new (*json, key_str,
                                                lisp_to_json (HASH_VALUE (h, i)));
              if (status == -1)
-              /* FIXME: A failure here might also indicate that the
-                 key is not a valid Unicode string.  */
-              json_out_of_memory ();
+              {
+                /* A failure can be caused either by an invalid key or
+                   by low memory.  */
+                json_check_utf8 (key);
+                json_out_of_memory ();
+              }
            }
        clear_unwind_protect (count);
        return unbind_to (count, Qnil);
@@ -447,9 +459,15 @@ lisp_to_json (Lisp_Object lisp)
    else if (STRINGP (lisp))
      {
        Lisp_Object encoded = json_encode (lisp);
-      /* FIXME: We might throw an out-of-memory error here if the
-         string is not valid Unicode.  */
-      return json_check (json_stringn (SSDATA (encoded), SBYTES (encoded)));
+      json_t *json = json_stringn (SSDATA (encoded), SBYTES (encoded));
+      if (json == NULL)
+        {
+          /* A failure can be caused either by an invalid string or by
+             low memory.  */
+          json_check_utf8 (encoded);
+          json_out_of_memory ();
+        }
+      return json;
      }
  
    /* LISP now must be a vector, hashtable, or alist.  */
@@ -863,8 +881,7 @@ syms_of_json (void)
  
    DEFSYM (Qstring_without_embedded_nulls_p, "string-without-embedded-nulls-p");
    DEFSYM (Qjson_value_p, "json-value-p");
-
-  DEFSYM (Qutf_8_unix, "utf-8-unix");
+  DEFSYM (Qutf_8_string_p, "utf-8-string-p");
  
    DEFSYM (Qjson_error, "json-error");
    DEFSYM (Qjson_out_of_memory, "json-out-of-memory");
diff --git a/test/src/json-tests.el b/test/src/json-tests.el

index e394583bc76adfdbc095d9bb1dbd1ce080b541a8..107cab89083ac40b7627d27a9cfcc969330e0602 100644 (file)
--- a/test/src/json-tests.el
+++ b/test/src/json-tests.el
@@ -108,13 +108,11 @@
  
  (ert-deftest json-serialize/invalid-unicode ()
    (skip-unless (fboundp 'json-serialize))
-  ;; FIXME: "out of memory" is the wrong error signal, but we don't
-  ;; currently distinguish between error types when serializing.
-  (should-error (json-serialize ["a\uDBBBb"]) :type 'json-out-of-memory)
-  (should-error (json-serialize ["u\x110000v"]) :type 'json-out-of-memory)
-  (should-error (json-serialize ["u\x3FFFFFv"]) :type 'json-out-of-memory)
-  (should-error (json-serialize ["u\xCCv"]) :type 'json-out-of-memory)
-  (should-error (json-serialize ["u\u00C4\xCCv"]) :type 'json-out-of-memory))
+  (should-error (json-serialize ["a\uDBBBb"]) :type 'wrong-type-argument)
+  (should-error (json-serialize ["u\x110000v"]) :type 'wrong-type-argument)
+  (should-error (json-serialize ["u\x3FFFFFv"]) :type 'wrong-type-argument)
+  (should-error (json-serialize ["u\xCCv"]) :type 'wrong-type-argument)
+  (should-error (json-serialize ["u\u00C4\xCCv"]) :type 'wrong-type-argument))
  
  (ert-deftest json-parse-string/null ()
    (skip-unless (fboundp 'json-parse-string))
author	Philipp Stephani <phst@google.com>
	Sat, 23 Dec 2017 16:56:36 +0000 (17:56 +0100)
committer	Philipp Stephani <phst@google.com>
	Sat, 30 Dec 2017 22:16:51 +0000 (23:16 +0100)
src/coding.c		patch \| blob \| history
src/coding.h		patch \| blob \| history
src/json.c		patch \| blob \| history
test/src/json-tests.el		patch \| blob \| history