From: Chong Yidong Date: Mon, 2 Jul 2012 16:21:54 +0000 (+0800) Subject: * lisp/xml.el: Handle entity and character reference expansion correctly. X-Git-Tag: emacs-24.2.90~1199^2~254 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=a7aef6f5c6e22b167ea0234ab84c0308201d681b;p=emacs.git * lisp/xml.el: Handle entity and character reference expansion correctly. (xml-default-ns): New variable. (xml-entity-alist): Use XML spec definitions for lt and amp. (xml-parse-region): Make first two arguments optional. Discard text properties. (xml-parse-tag-1): New function, spun off from xml-parse-tag. All callers changed. (xml-parse-tag): Call xml-parse-tag-1. For backward compatibility, this function should not modify buffer contents. (xml-parse-tag-1): Fix opening-tag regexp. (xml-parse-string): Rewrite, handling entity and character references properly. (xml--entity-replacement-text): Signal an error if a parameter entity is undefined. * test/automated/xml-parse-tests.el (xml-parse-tests--data): More testcases. --- diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 0e7c49342c7..bab4085587e 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,3 +1,21 @@ +2012-07-02 Chong Yidong + + * xml.el: Fix entity and character reference expansion, allowing + them to expand into markup as per XML spec. + (xml-default-ns): New variable. + (xml-entity-alist): Use XML spec definitions for lt and amp. + (xml-parse-region): Make first two arguments optional. Discard + text properties. + (xml-parse-tag-1): New function, spun off from xml-parse-tag. All + callers changed. + (xml-parse-tag): Call xml-parse-tag-1. For backward + compatibility, this function should not modify buffer contents. + (xml-parse-tag-1): Fix opening-tag regexp. + (xml-parse-string): Rewrite, handling entity and character + references properly. + (xml--entity-replacement-text): Signal an error if a parameter + entity is undefined. + 2012-07-02 Stefan Monnier * comint.el (comint-output-filter): Filter out repeated prompts. diff --git a/lisp/xml.el b/lisp/xml.el index 5c1d2390a23..a3e279b41bd 100644 --- a/lisp/xml.el +++ b/lisp/xml.el @@ -80,22 +80,23 @@ ;; a worthwhile tradeoff especially since we're usually parsing files ;; instead of hand-crafted XML. -;;******************************************************************* -;;** -;;** Macros to parse the list -;;** -;;******************************************************************* +;;; Macros to parse the list (defconst xml-undefined-entity "?" "What to substitute for undefined entities") +(defconst xml-default-ns '(("" . "") + ("xml" . "http://www.w3.org/XML/1998/namespace") + ("xmlns" . "http://www.w3.org/2000/xmlns/")) + "Alist mapping default XML namespaces to their URIs.") + (defvar xml-entity-alist - '(("lt" . "<") + '(("lt" . "<") ("gt" . ">") ("apos" . "'") ("quot" . "\"") - ("amp" . "&")) - "Alist of defined XML entities.") + ("amp" . "&")) + "Alist mapping XML entities to their replacement text.") (defvar xml-parameter-entity-alist nil "Alist of defined XML parametric entities.") @@ -156,11 +157,7 @@ An empty string is returned if the attribute was not found. See also `xml-get-attribute-or-nil'." (or (xml-get-attribute-or-nil node attribute) "")) -;;******************************************************************* -;;** -;;** Creating the list -;;** -;;******************************************************************* +;;; Creating the list ;;;###autoload (defun xml-parse-file (file &optional parse-dtd parse-ns) @@ -299,8 +296,10 @@ If PARSE-NS is non-nil, then QNAMES are expanded." ;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? ;;;###autoload -(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns) +(defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns) "Parse the region from BEG to END in BUFFER. +If BEG is nil, it defaults to `point-min'. +If END is nil, it defaults to `point-max'. If BUFFER is nil, it defaults to the current buffer. Returns the XML list for the region, or raises an error if the region is not well-formed XML. @@ -312,7 +311,7 @@ If PARSE-NS is non-nil, then QNAMES are expanded." (unless buffer (setq buffer (current-buffer))) (with-temp-buffer - (insert-buffer-substring buffer beg end) + (insert-buffer-substring-no-properties buffer beg end) (xml--parse-buffer parse-dtd parse-ns))) (defun xml--parse-buffer (parse-dtd parse-ns) @@ -327,7 +326,7 @@ If PARSE-NS is non-nil, then QNAMES are expanded." (if (search-forward "<" nil t) (progn (forward-char -1) - (setq result (xml-parse-tag parse-dtd parse-ns)) + (setq result (xml-parse-tag-1 parse-dtd parse-ns)) (cond ((null result) ;; Not looking at an xml start tag. @@ -379,8 +378,7 @@ specify that the name shouldn't be given a namespace." (xml-parameter-entity-alist xml-parameter-entity-alist) children) (while (not (eobp)) - (let ((bit (xml-parse-tag - parse-dtd parse-ns))) + (let ((bit (xml-parse-tag-1 parse-dtd parse-ns))) (if children (setq children (append (list bit) children)) (if (stringp bit) @@ -392,30 +390,32 @@ specify that the name shouldn't be given a namespace." "Parse the tag at point. If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and returned as the first element in the list. -If PARSE-NS is non-nil, then QNAMES are expanded. -Returns one of: +If PARSE-NS is non-nil, expand QNAMES; if the value of PARSE-NS +is a list, use it as an alist mapping namespaces to URIs. + +Return one of: - a list : the matching node - nil : the point is not looking at a tag. - a pair : the first element is the DTD, the second is the node." + (let ((buf (current-buffer)) + (pos (point))) + (with-temp-buffer + (insert-buffer-substring-no-properties buf pos) + (goto-char (point-min)) + (xml-parse-tag-1 parse-dtd parse-ns)))) + +(defun xml-parse-tag-1 (&optional parse-dtd parse-ns) + "Like `xml-parse-tag', but possibly modify the buffer while working." (let ((xml-validating-parser (or parse-dtd xml-validating-parser)) - (xml-ns (if (consp parse-ns) - parse-ns - (if parse-ns - (list - ;; Default for empty prefix is no namespace - (cons "" "") - ;; "xml" namespace - (cons "xml" "http://www.w3.org/XML/1998/namespace") - ;; We need to seed the xmlns namespace - (cons "xmlns" "http://www.w3.org/2000/xmlns/")))))) + (xml-ns (cond ((consp parse-ns) parse-ns) + (parse-ns xml-default-ns)))) (cond - ;; Processing instructions (like the tag at the - ;; beginning of a document). + ;; Processing instructions, like . ((looking-at "<\\?") (search-forward "?>") (skip-syntax-forward " ") - (xml-parse-tag parse-dtd xml-ns)) - ;; Character data (CDATA) sections, in which no tag should be interpreted + (xml-parse-tag-1 parse-dtd xml-ns)) + ;; Character data (CDATA) sections, in which no tag should be interpreted ((looking-at "" nil t) @@ -423,33 +423,32 @@ Returns one of: (concat (buffer-substring-no-properties pos (match-beginning 0)) (xml-parse-string)))) - ;; DTD for the document + ;; DTD for the document ((looking-at "") + ;; FIXME: This loses the skipped-over spaces. (skip-syntax-forward " ") (unless (eobp) (let ((xml-sub-parser t)) - (xml-parse-tag parse-dtd xml-ns)))) - ;; end tag + (xml-parse-tag-1 parse-dtd xml-ns)))) + ;; end tag ((looking-at "[:space:]]+\\)") + ;; opening tag + ((looking-at (eval-when-compile (concat "<\\(" xml-name-re "\\)"))) (goto-char (match-end 1)) - ;; Parse this node (let* ((node-name (match-string-no-properties 1)) ;; Parse the attribute list. (attrs (xml-parse-attlist xml-ns)) children) - ;; add the xmlns:* attrs to our cache (when (consp xml-ns) (dolist (attr attrs) @@ -458,70 +457,114 @@ Returns one of: (caar attr))) (push (cons (cdar attr) (cdr attr)) xml-ns)))) - (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns))) + (cond + ;; is this an empty element ? + ((looking-at "/>") + (forward-char 2) + (nreverse children)) + ;; is this a valid start tag ? + ((eq (char-after) ?>) + (forward-char 1) + ;; Now check that we have the right end-tag. + (let ((end (concat ""))) + (while (not (looking-at end)) + (cond + ((eobp) + (error "XML: (Not Well-Formed) End of buffer while reading element `%s'" + node-name)) + ((looking-at "" nil t) + (match-beginning 0) + (point-max)))) + node-name)) + ;; Read a sub-element and push it onto CHILDREN. + ((= (char-after) ?<) + (let ((tag (xml-parse-tag-1 nil xml-ns))) + (when tag + (push tag children)))) + ;; Read some character data. + (t + (let ((expansion (xml-parse-string))) + (push (if (stringp (car children)) + ;; If two strings were separated by a + ;; comment, concat them. + (concat (pop children) expansion) + expansion) + children))))) + ;; Move point past the end-tag. + (goto-char (match-end 0)) + (nreverse children))) + ;; Otherwise this was an invalid start tag (expected ">" not found.) + (t + (error "XML: (Well-Formed) Couldn't parse tag: %s" + (buffer-substring-no-properties (- (point) 10) (+ (point) 1))))))) - ;; is this an empty element ? - (if (looking-at "/>") - (progn - (forward-char 2) - (nreverse children)) - - ;; is this a valid start tag ? - (if (eq (char-after) ?>) - (progn - (forward-char 1) - ;; Now check that we have the right end-tag. Note that this - ;; one might contain spaces after the tag name - (let ((end (concat ""))) - (while (not (looking-at end)) - (cond - ((looking-at "", but didn't see it.) - (error "XML: (Well-Formed) Couldn't parse tag: %s" - (buffer-substring-no-properties (- (point) 10) (+ (point) 1))))))) - (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag) - (unless xml-sub-parser ; Usually, we error out. + ;; (Not one of PI, CDATA, Comment, End tag, or Start tag) + (t + (unless xml-sub-parser ; Usually, we error out. (error "XML: (Well-Formed) Invalid character")) - ;; However, if we're parsing incrementally, then we need to deal ;; with stray CDATA. (xml-parse-string))))) (defun xml-parse-string () - "Parse the next whatever. Could be a string, or an element." - (let* ((pos (point)) - (string (progn (skip-chars-forward "^<") - (buffer-substring-no-properties pos (point))))) - ;; Clean up the string. As per XML specifications, the XML - ;; processor should always pass the whole string to the - ;; application. But \r's should be replaced: - ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends - (setq pos 0) - (while (string-match "\r\n?" string pos) - (setq string (replace-match "\n" t t string)) - (setq pos (1+ (match-beginning 0)))) - - (xml-substitute-special string))) + "Parse character data at point, and return it as a string. +Leave point at the start of the next thing to parse. This +function can modify the buffer by expanding entity and character +references." + (let ((start (point)) + ref val) + (while (and (not (eobp)) + (not (looking-at "<"))) + ;; Find the next < or & character. + (skip-chars-forward "^<&") + (when (eq (char-after) ?&) + ;; If we find an entity or character reference, expand it. + (unless (looking-at (eval-when-compile + (concat "&\\(?:#\\([0-9]+\\)\\|#x\\([0-9a-fA-F]+\\)\\|\\(" + xml-name-re "\\)\\);"))) + (error "XML: (Not Well-Formed) Invalid entity reference")) + ;; For a character reference, the next entity or character + ;; reference must be after the replacement. [4.6] "Numerical + ;; character references are expanded immediately when + ;; recognized and MUST be treated as character data." + (cond ((setq ref (match-string 1)) + ;; Decimal character reference + (setq val (save-match-data + (decode-char 'ucs (string-to-number ref)))) + (and (null val) + xml-validating-parser + (error "XML: (Validity) Invalid character `%s'" ref)) + (replace-match (or (string val) xml-undefined-entity) t t)) + ;; Hexadecimal character reference + ((setq ref (match-string 2)) + (setq val (save-match-data + (decode-char 'ucs (string-to-number ref 16)))) + (and (null val) + xml-validating-parser + (error "XML: (Validity) Invalid character `x%s'" ref)) + (replace-match (or (string val) xml-undefined-entity) t t)) + ;; For an entity reference, search again from the start + ;; of the replaced text, since the replacement can + ;; contain entity or character references, or markup. + ((setq ref (match-string 3)) + (setq val (assoc ref xml-entity-alist)) + (and (null val) + xml-validating-parser + (error "XML: (Validity) Undefined entity `%s'" ref)) + (replace-match (cdr val) t t) + (goto-char (match-beginning 0)))))) + ;; [2.11] Clean up line breaks. + (let ((end-marker (point-marker))) + (goto-char start) + (while (re-search-forward "\r\n?" end-marker t) + (replace-match "\n" t t)) + (goto-char end-marker) + (buffer-substring start (point))))) (defun xml-parse-attlist (&optional xml-ns) "Return the attribute-list after point. @@ -564,15 +607,11 @@ Leave point at the first non-blank character after the tag." (skip-syntax-forward " ")) (nreverse attlist))) -;;******************************************************************* -;;** -;;** The DTD (document type declaration) -;;** The following functions know how to skip or parse the DTD of -;;** a document -;;** -;;******************************************************************* +;;; DTD (document type declaration) -;; Fixme: This fails at least if the DTD contains conditional sections. +;; The following functions know how to skip or parse the DTD of a +;; document. FIXME: it fails at least if the DTD contains conditional +;; sections. (defun xml-skip-dtd () "Skip the DTD at point. @@ -789,9 +828,10 @@ references and parameter-entity references." ;; Parameter entity reference ((setq ref (match-string 3 string)) (setq val (assoc ref xml-parameter-entity-alist)) - (if val - (push (cdr val) children) - (push (concat "%" ref ";") children)))) + (and (null val) + xml-validating-parser + (error "XML: (Validity) Undefined parameter entity `%s'" ref)) + (push (or (cdr val) xml-undefined-entity) children))) (setq string remainder))) (mapconcat 'identity (nreverse (cons string children)) ""))) @@ -828,79 +868,40 @@ references and parameter-entity references." (t elem)))) -;;******************************************************************* -;;** -;;** Substituting special XML sequences -;;** -;;******************************************************************* +;;; Substituting special XML sequences (defun xml-substitute-special (string) - "Return STRING, after substituting entity references." - ;; This originally made repeated passes through the string from the - ;; beginning, which isn't correct, since then either "&amp;" or - ;; "&amp;" won't DTRT. - - (let ((point 0) - children end-point) - (while (string-match "&\\([^;]*\\);" string point) - (setq end-point (match-end 0)) - (let* ((this-part (match-string-no-properties 1 string)) - (prev-part (substring string point (match-beginning 0))) - (entity (assoc this-part xml-entity-alist)) - (expansion - (cond ((string-match "#\\([0-9]+\\)" this-part) - (let ((c (decode-char - 'ucs - (string-to-number (match-string-no-properties 1 this-part))))) - (if c (string c)))) - ((string-match "#x\\([[:xdigit:]]+\\)" this-part) - (let ((c (decode-char - 'ucs - (string-to-number (match-string-no-properties 1 this-part) 16)))) - (if c (string c)))) - (entity - (cdr entity)) - ((eq (length this-part) 0) - (error "XML: (Not Well-Formed) No entity given")) - (t - (if xml-validating-parser - (error "XML: (Validity) Undefined entity `%s'" - this-part) - xml-undefined-entity))))) - - (cond ((null children) - ;; FIXME: If we have an entity that expands into XML, this won't work. - (setq children - (concat prev-part expansion))) - ((stringp children) - (if (stringp expansion) - (setq children (concat children prev-part expansion)) - (setq children (list expansion (concat prev-part children))))) - ((and (stringp expansion) - (stringp (car children))) - (setcar children (concat prev-part expansion (car children)))) - ((stringp expansion) - (setq children (append (concat prev-part expansion) - children))) - ((stringp (car children)) - (setcar children (concat (car children) prev-part)) - (setq children (append expansion children))) - (t - (setq children (list expansion - prev-part - children)))) - (setq point end-point))) - (cond ((stringp children) - (concat children (substring string point))) - ((stringp (car (last children))) - (concat (car (last children)) (substring string point))) - ((null children) - string) - (t - (concat (mapconcat 'identity - (nreverse children) - "") - (substring string point)))))) + "Return STRING, after substituting entity and character references. +STRING is assumed to occur in an XML attribute value." + (let ((ref-re (eval-when-compile + (concat "&\\(?:#\\(x\\)?\\([0-9]+\\)\\|\\(" + xml-name-re "\\)\\);"))) + children) + (while (string-match ref-re string) + (push (substring string 0 (match-beginning 0)) children) + (let* ((remainder (substring string (match-end 0))) + (ref (match-string 2 string))) + (if ref + ;; [4.6] Character references are included as + ;; character data. + (let ((val (decode-char 'ucs (string-to-number + ref (if (match-string 1 string) 16))))) + (push (cond (val (string val)) + (xml-validating-parser + (error "XML: (Validity) Undefined character `x%s'" ref)) + (t xml-undefined-entity)) + children) + (setq string remainder)) + ;; [4.4.5] Entity references are "included in literal". + ;; Note that we don't need do anything special to treat + ;; quotes as normal data characters. + (setq ref (match-string 3 string)) + (let ((val (or (cdr (assoc ref xml-entity-alist)) + (if xml-validating-parser + (error "XML: (Validity) Undefined entity `%s'" ref) + xml-undefined-entity)))) + (setq string (concat val remainder)))))) + (mapconcat 'identity (nreverse (cons string children)) ""))) (defun xml-substitute-numeric-entities (string) "Substitute SGML numeric entities by their respective utf characters. @@ -921,12 +922,7 @@ by \"*\"." string) nil)) -;;******************************************************************* -;;** -;;** Printing a tree. -;;** This function is intended mainly for debugging purposes. -;;** -;;******************************************************************* +;;; Printing a parse tree (mainly for debugging). (defun xml-debug-print (xml &optional indent-string) "Outputs the XML in the current buffer. diff --git a/test/ChangeLog b/test/ChangeLog index d9d9bc5a9fa..3ff7124893a 100644 --- a/test/ChangeLog +++ b/test/ChangeLog @@ -1,3 +1,8 @@ +2012-07-02 Chong Yidong + + * automated/xml-parse-tests.el (xml-parse-tests--data): More + testcases. + 2012-07-01 Chong Yidong * automated/xml-parse-tests.el: New file. diff --git a/test/automated/xml-parse-tests.el b/test/automated/xml-parse-tests.el index 8322a8c6ff9..ec3d7ca3065 100644 --- a/test/automated/xml-parse-tests.el +++ b/test/automated/xml-parse-tests.el @@ -33,15 +33,26 @@ '(;; General entity substitution ("]>&ent;;" . ((foo ((a . "b")) (bar nil "AbC;")))) + ("&amp;&apos;'<>"" . + ((foo () "&''<>\""))) ;; Parameter entity substitution ("]>&ent;;" . ((foo ((a . "b")) (bar nil "AbC;")))) ;; Tricky parameter entity substitution (like XML spec Appendix D) ("' > %xx; ]>A&ent;C" . - ((foo nil "AbC"))) + ((foo () "AbC"))) ;; Bug#7172 (" ]>" . - ((foo nil)))) + ((foo ()))) + ;; Entities referencing entities, in character data + ("]>&abc;" . + ((foo () "aBc"))) + ;; Entities referencing entities, in attribute values + ("]>1" . + ((foo ((a . "-aBc-")) "1"))) + ;; Character references must be treated as character data + ("AT&T;" . ((foo () "AT&T;"))) + ("&amp;" . ((foo () "&")))) "Alist of XML strings and their expected parse trees.") (ert-deftest xml-parse-tests ()