From c912b478a7752a75faa1790350421d084b71999b Mon Sep 17 00:00:00 2001 From: Katsumi Yamaoka Date: Tue, 7 Dec 2010 05:06:56 +0000 Subject: [PATCH] Decode extra numeric entities. mm-util.el (mm-extra-numeric-entities): New variable. mm-url.el (mm-url-decode-entities): mm-decode.el (mm-shr): Use it to decode extra numeric entities. --- lisp/gnus/ChangeLog | 7 +++++++ lisp/gnus/mm-decode.el | 28 ++++++++++++++++++++-------- lisp/gnus/mm-url.el | 19 +++++++++++-------- lisp/gnus/mm-util.el | 15 +++++++++++++++ 4 files changed, 53 insertions(+), 16 deletions(-) diff --git a/lisp/gnus/ChangeLog b/lisp/gnus/ChangeLog index 4a5d103e08b..af7161d3e93 100644 --- a/lisp/gnus/ChangeLog +++ b/lisp/gnus/ChangeLog @@ -1,3 +1,10 @@ +2010-12-07 Katsumi Yamaoka + + * mm-util.el (mm-extra-numeric-entities): New variable. + + * mm-url.el (mm-url-decode-entities): + * mm-decode.el (mm-shr): Use it to decode extra numeric entities. + 2010-12-07 Stefan Monnier * message.el: Use completion-at-point. diff --git a/lisp/gnus/mm-decode.el b/lisp/gnus/mm-decode.el index bd9e704144e..216ed6624d9 100644 --- a/lisp/gnus/mm-decode.el +++ b/lisp/gnus/mm-decode.el @@ -1699,7 +1699,7 @@ If RECURSIVE, search recursively." (when handle (mm-with-part handle (buffer-string)))))) - shr-inhibit-images shr-blocked-images charset) + shr-inhibit-images shr-blocked-images charset char) (if (and (boundp 'gnus-summary-buffer) (buffer-name gnus-summary-buffer)) (with-current-buffer gnus-summary-buffer @@ -1714,13 +1714,25 @@ If RECURSIVE, search recursively." 
(narrow-to-region (point) (point)) (shr-insert-document (mm-with-part handle - (when (and charset - (setq charset (mm-charset-to-coding-system charset)) - (not (eq charset 'ascii))) - (insert (prog1 - (mm-decode-coding-string (buffer-string) charset) - (erase-buffer) - (mm-enable-multibyte)))) + (insert (prog1 + (if (and charset + (setq charset + (mm-charset-to-coding-system charset)) + (not (eq charset 'ascii))) + (mm-decode-coding-string (buffer-string) charset) + (mm-string-as-multibyte (buffer-string))) + (erase-buffer) + (mm-enable-multibyte))) + (goto-char (point-min)) + (setq case-fold-search t) + (while (re-search-forward + "&#\\(?:x\\([89][0-9a-f]\\)\\|\\(1[2-5][0-9]\\)\\);" nil t) + (when (setq char + (cdr (assq (if (match-beginning 1) + (string-to-number (match-string 1) 16) + (string-to-number (match-string 2))) + mm-extra-numeric-entities))) + (replace-match (char-to-string char)))) (libxml-parse-html-region (point-min) (point-max)))) (mm-handle-set-undisplayer handle diff --git a/lisp/gnus/mm-url.el b/lisp/gnus/mm-url.el index 0da136e1efc..0c2b80c9ca7 100644 --- a/lisp/gnus/mm-url.el +++ b/lisp/gnus/mm-url.el @@ -365,16 +365,19 @@ If FOLLOW-REFRESH is non-nil, redirect refresh url in META." (defun mm-url-decode-entities () "Decode all HTML entities." 
(goto-char (point-min)) - (while (re-search-forward "&\\(#[0-9]+\\|#x[0-9a-f]+\\|[a-z]+[0-9]*\\);" nil t) + (while (re-search-forward "&\\(#[0-9]+\\|#x[0-9a-f]+\\|[a-z]+[0-9]*\\);" + nil t) (let* ((entity (match-string 1)) (elem (if (eq (aref entity 0) ?\#) - (let ((c (mm-ucs-to-char - ;; Hex number: &#x3212 - (if (eq (aref entity 1) ?x) - (string-to-number (substring entity 2) - 16) - ;; Decimal number: &#3212 - (string-to-number (substring entity 1)))))) + (let ((c + ;; Hex number: &#x3212 + (if (eq (aref entity 1) ?x) + (string-to-number (substring entity 2) + 16) + ;; Decimal number: &#3212 + (string-to-number (substring entity 1))))) + (setq c (or (cdr (assq c mm-extra-numeric-entities)) + (mm-ucs-to-char c))) (if (mm-char-or-char-int-p c) c ?#)) (or (cdr (assq (intern entity) mm-url-html-entities)) diff --git a/lisp/gnus/mm-util.el b/lisp/gnus/mm-util.el index 2f6464d43f2..af4ac588d84 100644 --- a/lisp/gnus/mm-util.el +++ b/lisp/gnus/mm-util.el @@ -866,6 +866,21 @@ variable is set, it overrides the default priority." Setting it to nil is useful on Emacsen supporting Unicode if sending mail with multiple parts is preferred to sending a Unicode one.") +(defvar mm-extra-numeric-entities + (mapcar + (lambda (item) + (cons (car item) (mm-ucs-to-char (cdr item)))) + '((#x80 . #x20AC) (#x82 . #x201A) (#x83 . #x0192) (#x84 . #x201E) + (#x85 . #x2026) (#x86 . #x2020) (#x87 . #x2021) (#x88 . #x02C6) + (#x89 . #x2030) (#x8A . #x0160) (#x8B . #x2039) (#x8C . #x0152) + (#x8E . #x017D) (#x91 . #x2018) (#x92 . #x2019) (#x93 . #x201C) + (#x94 . #x201D) (#x95 . #x2022) (#x96 . #x2013) (#x97 . #x2014) + (#x98 . #x02DC) (#x99 . #x2122) (#x9A . #x0161) (#x9B . #x203A) + (#x9C . #x0153) (#x9E . #x017E) (#x9F . #x0178))) + "*Alist of extra numeric entities and characters other than ISO 10646. +This table is used for decoding extra numeric entities to characters, +like \"&#128;\" to the euro sign, mainly in html messages.") + ;;; Internal variables: ;;; Functions: -- 2.39.5