From c39443c1d651bab2eb023f4c38db418c3dc04160 Mon Sep 17 00:00:00 2001 From: Ulf Jasper Date: Fri, 21 Nov 2014 16:31:30 +0100 Subject: [PATCH] 'libxml-parse(html|xml)-region': new optional param 'discard-comments'. * doc/lispref/text.texi (Parsing HTML/XML): Document new optional parameter 'discard-comments' of 'libxml-parse(html|xml)-region'. * src/xml.c (parse_region): Take care of new optional parameter 'discard-comments' of 'libxml-parse(html|xml)-region'. (Flibxml_parse_html_region, Flibxml_parse_xml_region): New optional parameter 'discard-comments'. * test/automated/libxml-tests.el (libxml-tests--data-comments-preserved): Renamed from 'libxml-tests--data'. (libxml-tests--data-comments-discarded): New. (libxml-tests): Check whether 'libxml-parse-xml-region' is discarding comments correctly. --- doc/lispref/ChangeLog | 5 ++++ doc/lispref/text.texi | 7 +++-- src/ChangeLog | 7 +++++ src/xml.c | 47 +++++++++++++++++++--------------- test/ChangeLog | 9 +++++++ test/automated/libxml-tests.el | 26 ++++++++++++++++--- 6 files changed, 74 insertions(+), 27 deletions(-) diff --git a/doc/lispref/ChangeLog b/doc/lispref/ChangeLog index 6706f936c5e..0c8792af81f 100644 --- a/doc/lispref/ChangeLog +++ b/doc/lispref/ChangeLog @@ -1,3 +1,8 @@ +2014-11-21 Ulf Jasper + + * text.texi (Parsing HTML/XML): Document new optional parameter + 'discard-comments' of 'libxml-parse(html|xml)-region'. + 2014-11-18 Leo Liu * functions.texi (Advising Named Functions): Document diff --git a/doc/lispref/text.texi b/doc/lispref/text.texi index d1a1e6fa6b9..7c88a5b25d1 100644 --- a/doc/lispref/text.texi +++ b/doc/lispref/text.texi @@ -4324,7 +4324,7 @@ coding instead. When Emacs is compiled with libxml2 support, the following functions are available to parse HTML or XML text into Lisp object trees. 
-@defun libxml-parse-html-region start end &optional base-url +@defun libxml-parse-html-region start end &optional base-url discard-comments This function parses the text between @var{start} and @var{end} as HTML, and returns a list representing the HTML @dfn{parse tree}. It attempts to handle ``real world'' HTML by robustly coping with syntax @@ -4333,6 +4333,9 @@ mistakes. The optional argument @var{base-url}, if non-@code{nil}, should be a string specifying the base URL for relative URLs occurring in links. +If the optional argument @var{discard-comments} is non-@code{nil}, +then the parse tree is created without any comments. + In the parse tree, each HTML node is represented by a list in which the first element is a symbol representing the node name, the second element is an alist of node attributes, and the remaining elements are @@ -4368,7 +4371,7 @@ buffer. The argument @var{dom} should be a list as generated by @end defun @cindex parsing xml -@defun libxml-parse-xml-region start end &optional base-url +@defun libxml-parse-xml-region start end &optional base-url discard-comments This function is the same as @code{libxml-parse-html-region}, except that it parses the text as XML rather than HTML (so it is stricter about syntax). diff --git a/src/ChangeLog b/src/ChangeLog index b169479a274..f8c9c5afd69 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,10 @@ +2014-11-21 Ulf Jasper + + * xml.c (parse_region): Take care of new optional parameter + 'discard-comments' of 'libxml-parse(html|xml)-region'. + (Flibxml_parse_html_region, Flibxml_parse_xml_region): New + optional parameter 'discard-comments'. + 2014-11-17 Paul Eggert Improve time stamp handling, and be more consistent about it. 
diff --git a/src/xml.c b/src/xml.c index 7e99beb1d05..d418202182b 100644 --- a/src/xml.c +++ b/src/xml.c @@ -175,7 +175,7 @@ make_dom (xmlNode *node) } static Lisp_Object -parse_region (Lisp_Object start, Lisp_Object end, Lisp_Object base_url, int htmlp) +parse_region (Lisp_Object start, Lisp_Object end, Lisp_Object base_url, Lisp_Object discard_comments, int htmlp) { xmlDoc *doc; Lisp_Object result = Qnil; @@ -214,21 +214,24 @@ parse_region (Lisp_Object start, Lisp_Object end, Lisp_Object base_url, int html if (doc != NULL) { - /* If the document is just comments, then this should get us the - nodes anyway. */ - xmlNode *n = doc->children; Lisp_Object r = Qnil; - - while (n) { - if (!NILP (r)) - result = Fcons (r, result); - r = make_dom (n); - n = n->next; - } + if (NILP(discard_comments)) + { + /* If the document has toplevel comments, then this should + get us the nodes and the comments. */ + xmlNode *n = doc->children; + + while (n) { + if (!NILP (r)) + result = Fcons (r, result); + r = make_dom (n); + n = n->next; + } + } if (NILP (result)) { - /* The document isn't just comments, so get the tree the - proper way. */ + /* The document doesn't have toplevel comments or we discarded + them. Get the tree the proper way. */ xmlNode *node = fn_xmlDocGetRootElement (doc); if (node != NULL) result = make_dom (node); @@ -251,25 +254,27 @@ xml_cleanup_parser (void) DEFUN ("libxml-parse-html-region", Flibxml_parse_html_region, Slibxml_parse_html_region, - 2, 3, 0, + 2, 4, 0, doc: /* Parse the region as an HTML document and return the parse tree. -If BASE-URL is non-nil, it is used to expand relative URLs. */) - (Lisp_Object start, Lisp_Object end, Lisp_Object base_url) +If BASE-URL is non-nil, it is used to expand relative URLs. +If DISCARD-COMMENTS is non-nil, all HTML comments are discarded. 
*/) + (Lisp_Object start, Lisp_Object end, Lisp_Object base_url, Lisp_Object discard_comments) { if (init_libxml2_functions ()) - return parse_region (start, end, base_url, 1); + return parse_region (start, end, base_url, discard_comments, 1); return Qnil; } DEFUN ("libxml-parse-xml-region", Flibxml_parse_xml_region, Slibxml_parse_xml_region, - 2, 3, 0, + 2, 4, 0, doc: /* Parse the region as an XML document and return the parse tree. -If BASE-URL is non-nil, it is used to expand relative URLs. */) - (Lisp_Object start, Lisp_Object end, Lisp_Object base_url) +If BASE-URL is non-nil, it is used to expand relative URLs. +If DISCARD-COMMENTS is non-nil, all XML comments are discarded. */) + (Lisp_Object start, Lisp_Object end, Lisp_Object base_url, Lisp_Object discard_comments) { if (init_libxml2_functions ()) - return parse_region (start, end, base_url, 0); + return parse_region (start, end, base_url, discard_comments, 0); return Qnil; } diff --git a/test/ChangeLog b/test/ChangeLog index 475b6a3f473..d0988e42076 100644 --- a/test/ChangeLog +++ b/test/ChangeLog @@ -1,3 +1,12 @@ +2014-11-21 Ulf Jasper + + * automated/libxml-tests.el + (libxml-tests--data-comments-preserved): Renamed from + 'libxml-tests--data'. + (libxml-tests--data-comments-discarded): New. + (libxml-tests): Check whether 'libxml-parse-xml-region' is + discarding comments correctly. + 2014-11-17 Michal Nazarewicz * automated/tildify-tests.el (tildify-test-html, tildify-test-xml): diff --git a/test/automated/libxml-tests.el b/test/automated/libxml-tests.el index ced0df7b3c4..6b6d0170562 100644 --- a/test/automated/libxml-tests.el +++ b/test/automated/libxml-tests.el @@ -27,7 +27,7 @@ (require 'ert) -(defvar libxml-tests--data +(defvar libxml-tests--data-comments-preserved `(;; simple case ("<?xml version=\"1.0\"?><foo baz=\"true\">bar</foo>" . (foo ((baz . "true")) "bar")) @@ -40,17 +40,35 @@ "<bar>blub</bar></foo><!--comment-b--><!--comment-c-->") . (top nil (comment nil "comment-a") (foo ((a . 
"b")) (bar nil "blub")) (comment nil "comment-b") (comment nil "comment-c")))) - "Alist of XML strings and their expected parse trees.") + "Alist of XML strings and their expected parse trees for preserved comments.") + +(defvar libxml-tests--data-comments-discarded + `(;; simple case + ("<?xml version=\"1.0\"?><foo baz=\"true\">bar</foo>" + . (foo ((baz . "true")) "bar")) + ;; toplevel comments -- first document child must not get lost + (,(concat "<?xml version=\"1.0\"?><foo>bar</foo><!--comment-1-->" + "<!--comment-2-->") + . (foo nil "bar")) + (,(concat "<?xml version=\"1.0\"?><!--comment-a--><foo a=\"b\">" + "<bar>blub</bar></foo><!--comment-b--><!--comment-c-->") + . (foo ((a . "b")) (bar nil "blub")))) + "Alist of XML strings and their expected parse trees for discarded comments.") (ert-deftest libxml-tests () "Test libxml." (when (fboundp 'libxml-parse-xml-region) (with-temp-buffer - (dolist (test libxml-tests--data) + (dolist (test libxml-tests--data-comments-preserved) + (erase-buffer) + (insert (car test)) + (should (equal (cdr test) + (libxml-parse-xml-region (point-min) (point-max))))) + (dolist (test libxml-tests--data-comments-discarded) (erase-buffer) (insert (car test)) (should (equal (cdr test) - (libxml-parse-xml-region (point-min) (point-max)))))))) + (libxml-parse-xml-region (point-min) (point-max) nil t))))))) ;;; libxml-tests.el ends here -- 2.39.5