From 568f1488a69e8cb0961571ff8f158df8891c3c44 Mon Sep 17 00:00:00 2001
From: Lars Ingebrigtsen
Date: Sat, 14 Sep 2019 16:07:34 +0200
Subject: [PATCH] Make eww more liberal when interpreting some invalid HTML

* lisp/net/eww.el (eww--preprocess-html): New function (bug#37009)
to be more lenient with invalid HTML and translate common invalid
HTML like "a <= b" into "a &lt;= b" to be more liberal in what we
accept before parsing.
(eww-display-html): Use it.
(eww-readable): Ditto.
---
 lisp/net/eww.el | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/lisp/net/eww.el b/lisp/net/eww.el
index 77e6cec9b04..2013604c9e7 100644
--- a/lisp/net/eww.el
+++ b/lisp/net/eww.el
@@ -326,6 +326,18 @@ the default EWW buffer."
                          #'url-hexify-string (split-string url) "+"))))))
   url)
 
+(defun eww--preprocess-html (start end)
+  "Translate all < characters that do not look like start of tags into &lt;."
+  (save-excursion
+    (save-restriction
+      (narrow-to-region start end)
+      (goto-char start)
+      (let ((case-fold-search t))
+        (while (re-search-forward "<[^0-9a-z!/]" nil t)
+          (goto-char (match-beginning 0))
+          (delete-region (point) (1+ (point)))
+          (insert "&lt;"))))))
+
 ;;;###autoload
 (defalias 'browse-web 'eww)
 
@@ -479,6 +491,7 @@ Currently this means either text/html or application/xhtml+xml."
                    ;; Remove CRLF and replace NUL with � before parsing.
                    (while (re-search-forward "\\(\r$\\)\\|\0" nil t)
                      (replace-match (if (match-beginning 1) "" "�") t t)))
+                 (eww--preprocess-html (point) (point-max))
                  (libxml-parse-html-region (point) (point-max))))))
         (source (and (null document)
                      (buffer-substring (point) (point-max)))))
@@ -716,6 +729,7 @@ the like."
                 (condition-case nil
                     (decode-coding-region (point-min) (point-max) 'utf-8)
                   (coding-system-error nil))
+                (eww--preprocess-html (point-min) (point-max))
                 (libxml-parse-html-region (point-min) (point-max))))
          (base (plist-get eww-data :url)))
     (eww-score-readability dom)
-- 
2.39.5