From f8581bcf6a1942ebd331cae20e32945a3a86a3d1 Mon Sep 17 00:00:00 2001
From: Philipp Stephani
Date: Sat, 23 May 2020 13:56:09 +0200
Subject: [PATCH] Reject invalid characters in XML strings (Bug#41094).

* lisp/xml.el (xml-escape-string): Search for invalid characters.
(xml-invalid-character): New error symbol.

* test/lisp/xml-tests.el (xml-print-invalid-cdata): New unit test.

* etc/NEWS: Document new behavior.
---
 etc/NEWS               |  7 +++++++
 lisp/xml.el            | 13 ++++++++++++-
 test/lisp/xml-tests.el | 10 ++++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/etc/NEWS b/etc/NEWS
index 32b59cb76fc..efad273da6c 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -393,6 +393,13 @@ component are now rejected by 'json-read' and friends.  This makes
 them more compliant with the JSON specification and consistent with
 the native JSON parsing functions.
 
+** xml.el
+
+*** XML serialization functions now reject invalid characters.
+Previously 'xml-print' would produce invalid XML when given a string
+with characters that are not valid in XML (see
+https://www.w3.org/TR/xml/#charsets).  Now it rejects such strings.
+
 
 * New Modes and Packages in Emacs 28.1
 
diff --git a/lisp/xml.el b/lisp/xml.el
index dc774a202cf..767cf042846 100644
--- a/lisp/xml.el
+++ b/lisp/xml.el
@@ -1023,9 +1023,17 @@ entity references (e.g., replace each & with &amp;).
 XML character data must not contain & or < characters, nor the >
 character under some circumstances.  The XML spec does not impose
 restriction on \" or \\=', but we just substitute for these too
-\(as is permitted by the spec)."
+\(as is permitted by the spec).
+
+If STRING contains characters that are invalid in XML (as defined
+by https://www.w3.org/TR/xml/#charsets), signal an error of type
+`xml-invalid-character'."
   (with-temp-buffer
     (insert string)
+    (goto-char (point-min))
+    (when (re-search-forward ; NOERROR=t: valid input must yield nil, not `search-failed'
+           "[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]" nil t)
+      (signal 'xml-invalid-character (list (char-before) (match-beginning 0))))
     (dolist (substitution '(("&" . "&amp;")
                             ("<" . "&lt;")
                             (">" . "&gt;")
@@ -1036,6 +1044,9 @@ restriction on \" or \\=', but we just substitute for these too
         (replace-match (cdr substitution) t t nil)))
     (buffer-string)))
 
+(define-error 'xml-invalid-character "Invalid XML character"
+  'wrong-type-argument)
+
 (defun xml-debug-print-internal (xml indent-string)
   "Outputs the XML tree in the current buffer.
 The first line is indented with INDENT-STRING."
diff --git a/test/lisp/xml-tests.el b/test/lisp/xml-tests.el
index 57e685cd347..72c78d00e3e 100644
--- a/test/lisp/xml-tests.el
+++ b/test/lisp/xml-tests.el
@@ -164,6 +164,16 @@ Parser is called with and without 'symbol-qnames argument.")
     (should (equal (cdr xml-parse-test--namespace-attribute-qnames)
                    (xml-parse-region nil nil nil nil 'symbol-qnames)))))
 
+(ert-deftest xml-print-invalid-cdata ()
+  "Check that Bug#41094 is fixed."
+  (with-temp-buffer
+    (should (equal (should-error (xml-print '((foo () "\0")))
+                                 :type 'xml-invalid-character)
+                   '(xml-invalid-character 0 1)))
+    (should (equal (should-error (xml-print '((foo () "\u00FF \xFF")))
+                                 :type 'xml-invalid-character)
+                   '(xml-invalid-character #x3FFFFF 3)))))
+
 ;; Local Variables:
 ;; no-byte-compile: t
 ;; End:
-- 
2.39.5