From: Jim Porter Date: Sun, 20 Jul 2025 20:48:22 +0000 (-0700) Subject: Add new function 'dom-inner-text' X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=587e6d4715981e180995888b1794ba4bfdeb3613;p=emacs.git Add new function 'dom-inner-text' This replaces 'dom-text' and 'dom-texts', and is both more correct and more efficient than them. * lisp/dom.el (dom-text, dom-texts): Make obsolete in favor of... (dom-inner-text--1, dom-inner-text): ... these new functions. Update callers. * doc/lispref/text.texi (Document Object Model): Update documentation to refer to 'dom-inner-text'. * etc/NEWS: Announce this change. (cherry picked from commit 2e53c7d08ba0de2468ec25cf9a9d7079604f9409) --- diff --git a/doc/lispref/text.texi b/doc/lispref/text.texi index 0d293d14f93..b8437d954f7 100644 --- a/doc/lispref/text.texi +++ b/doc/lispref/text.texi @@ -5729,14 +5729,10 @@ Return all the non-string children of the node. @item dom-attributes @var{node} Return the key/value pair list of attributes of the node. -@item dom-text @var{node} -Return all the textual elements of the node as a concatenated string. - -@item dom-texts @var{node} +@item dom-inner-text @var{node} Return all the textual elements of the node, as well as the textual elements of all the children of the node, recursively, as a -concatenated string. This function also takes an optional separator -to be inserted between the textual elements. +concatenated string. @item dom-parent @var{dom} @var{node} Return the parent of @var{node} in @var{dom}. diff --git a/lisp/dom.el b/lisp/dom.el index c65dfd66dcc..89f450415ea 100644 --- a/lisp/dom.el +++ b/lisp/dom.el @@ -75,10 +75,12 @@ A typical attribute is `href'." (defun dom-text (node) "Return all the text bits in the current node concatenated." 
+ (declare (obsolete 'dom-inner-text "31.1")) (mapconcat #'identity (cl-remove-if-not #'stringp (dom-children node)) " ")) (defun dom-texts (node &optional separator) "Return all textual data under NODE concatenated with SEPARATOR in-between." + (declare (obsolete 'dom-inner-text "31.1")) (if (eq (dom-tag node) 'script) "" (mapconcat @@ -93,6 +95,25 @@ A typical attribute is `href'." (dom-children node) (or separator " ")))) +(defun dom-inner-text--1 (node) + (dolist (child (dom-children node)) + (cond + ((stringp child) (insert child)) + ((memq (dom-tag child) '(script comment))) + (t (dom-inner-text--1 child))))) + +(defun dom-inner-text (node) + "Return all textual data under NODE as a single string." + (let ((children (dom-children node))) + (if (and (length= children 1) + (stringp (car children))) + ;; Copy the string content when returning to be consistent with + ;; the other branch of this `if' expression. + (copy-sequence (car children)) + (with-work-buffer + (dom-inner-text--1 node) + (buffer-string))))) + (defun dom-child-by-tag (dom tag) "Return the first child of DOM that is of type TAG." (assoc tag (dom-children dom))) diff --git a/lisp/gnus/nnatom.el b/lisp/gnus/nnatom.el index b4a37b45885..6e34751df17 100644 --- a/lisp/gnus/nnatom.el +++ b/lisp/gnus/nnatom.el @@ -110,7 +110,8 @@ (defun nnatom--dom-line (node) "Return NODE's text as a single, whitespace-trimmed line." - (string-trim (replace-regexp-in-string "[\r\n]+" " " (dom-text node) t))) + (string-trim (replace-regexp-in-string + "[\r\n]+" " " (dom-inner-text node) t))) (defun nnatom--read-title (group) "Return the title of GROUP, or nil." @@ -245,7 +246,7 @@ return the subject. Otherwise, return nil." 
(dom-print (dom-child-by-tag part 'div) nil t) (buffer-substring-no-properties (point-min) (point-max))) - (dom-text part))) + (dom-inner-text part))) (type (if (member type atypes) (concat "text/" type) type)) (type (or (cdr (assoc type mtypes)) type))) (unless (string-blank-p part) diff --git a/lisp/net/eww.el b/lisp/net/eww.el index aa70163c63c..14bd66d5064 100644 --- a/lisp/net/eww.el +++ b/lisp/net/eww.el @@ -1007,7 +1007,7 @@ This replaces the region with the preprocessed HTML." (plist-put eww-data :title (replace-regexp-in-string "^ \\| $" "" - (replace-regexp-in-string "[ \t\r\n]+" " " (dom-text dom)))) + (replace-regexp-in-string "[ \t\r\n]+" " " (dom-inner-text dom)))) (eww--after-page-change)) (defun eww-display-raw (buffer &optional encode) @@ -1178,7 +1178,7 @@ non-nil, don't actually compute a score; just call the callback." (setq score 2 noscore t)) ((eq (dom-tag node) 'a) - (setq score (- (length (split-string (dom-text node)))) + (setq score (- (length (split-string (dom-inner-text node)))) noscore t)) (t (setq score -1)))) @@ -1203,7 +1203,7 @@ If EWW can't create a readable version, return nil instead." (when (and score (> score best-score) ;; We set a lower bound to how long we accept that ;; the readable portion of the page is going to be. - (> (length (split-string (dom-texts node))) 100)) + (> (length (split-string (dom-inner-text node))) 100)) (setq best-score score best-node node)) ;; Keep track of any <title> and <link> tags we find to include @@ -1218,7 +1218,7 @@ If EWW can't create a readable version, return nil instead." ;; directly in our list in addition to as a child of some ;; other node in the list. This is ok for <title> and <link> ;; tags, but might need changed if supporting other tags. - (let* ((inner-text (dom-texts node "")) + (let* ((inner-text (dom-inner-text node)) (new-node `(,(dom-tag node) ,(dom-attributes node) ,@(when (length> inner-text 0) @@ -1250,7 +1250,7 @@ If EWW can't create a readable version, return nil instead."
most-negative-fixnum)) ;; We set a lower bound to how long we accept that the ;; readable portion of the page is going to be. - (when (> (length (split-string (dom-texts highest))) 100) + (when (> (length (split-string (dom-inner-text highest))) 100) (setq result highest)))) result)) @@ -1875,7 +1875,7 @@ See URL `https://developer.mozilla.org/en-US/docs/Web/HTML/Element/Input'.") 'display (make-string (length value) ?*))))))))) (defun eww-tag-textarea (dom) - (let ((value (or (dom-text dom) "")) + (let ((value (or (dom-inner-text dom) "")) (lines (string-to-number (or (dom-attr dom 'rows) "10"))) (width (string-to-number (or (dom-attr dom 'cols) "10"))) start end form) @@ -1951,7 +1951,7 @@ See URL `https://developer.mozilla.org/en-US/docs/Web/HTML/Element/Input'.") (dolist (elem (dom-by-tag dom 'option)) (when (dom-attr elem 'selected) (nconc menu (list :value (dom-attr elem 'value)))) - (let ((display (dom-text elem))) + (let ((display (dom-inner-text elem))) (setq max (max max (length display))) (push (list 'item :value (dom-attr elem 'value)