git.eshelyaron.com Git - emacs.git/commitdiff
Fix visiting and saving UTF-16 encoded XML files
author Eli Zaretskii <eliz@gnu.org>
Sat, 18 May 2019 07:15:19 +0000 (10:15 +0300)
committer Eli Zaretskii <eliz@gnu.org>
Sat, 18 May 2019 07:15:19 +0000 (10:15 +0300)
* lisp/international/mule.el (sgml-xml-auto-coding-function):
When the 'encoding' tag specifies a UTF-16 encoding, enforce
saving the buffer with BOM, per the XML spec.
(xml-find-file-coding-system): Recognize UTF-16 encodings with
BOM.  (Bug#35766)  (Bug#8282)
* lisp/international/mule-cmds.el (select-safe-coding-system):
Don't consider UTF-16 encodings with and without BOM as
"different", so as not to annoy users with redundant questions
about mismatch between the XML/SGML header and the selected
explicit encoding.

lisp/international/mule-cmds.el
lisp/international/mule.el

index dfa9e4e6c8ceb4313a628275b5ccd54e89634e07..27296ecfb2cde74035009b6f7c9a1337e18735a6 100644 (file)
@@ -1029,7 +1029,13 @@ It is highly recommended to fix it before writing to a file."
                 ;; This check perhaps isn't ideal, but is probably
                 ;; the best thing to do.
                 (not (auto-coding-alist-lookup (or file buffer-file-name "")))
-                (not (coding-system-equal coding-system auto-cs)))
+                (not (coding-system-equal coding-system auto-cs))
+                 ;; coding-system-equal barfs on 'charset'.
+                 (or (equal (coding-system-type auto-cs) 'charset)
+                     (equal (coding-system-type coding-system) 'charset)
+                     (not (coding-system-equal (coding-system-type auto-cs)
+                                               (coding-system-type
+                                                coding-system)))))
            (unless (yes-or-no-p
                     (format "Selected encoding %s disagrees with \
 %s specified by file contents.  Really save (else edit coding cookies \
index b5414de0dba637f3a6af3da1dc8e891e9c8b1877..21f3118a98ecaf0afbc83fdd1dbaabec4d40c068 100644 (file)
@@ -2498,7 +2498,18 @@ This function is intended to be added to `auto-coding-functions'."
       (when end
        (if (re-search-forward "encoding=[\"']\\(.+?\\)[\"']" end t)
            (let* ((match (match-string 1))
-                  (sym (intern (downcase match))))
+                   (sym-name (downcase match))
+                   (sym-name
+                    ;; https://www.w3.org/TR/xml/#charencoding says:
+                    ;; "Entities encoded in UTF-16 MUST [...] begin
+                    ;; with the Byte Order Mark."  The trick below is
+                    ;; based on the fact that utf-16be/le don't
+                    ;; specify BOM, while utf-16-be/le do.
+                    (cond
+                     ((equal sym-name "utf-16le") "utf-16-le")
+                     ((equal sym-name "utf-16be") "utf-16-be")
+                     (t sym-name)))
+                  (sym (intern sym-name)))
              (if (coding-system-p sym)
                   ;; If the encoding tag is UTF-8 and the buffer's
                   ;; encoding is one of the variants of UTF-8, use the
@@ -2587,9 +2598,14 @@ added by processing software."
       (let ((detected
              (with-coding-priority '(utf-8)
                (coding-system-base
-                (detect-coding-region (point-min) (point-max) t)))))
-        ;; Pure ASCII always comes back as undecided.
+                (detect-coding-region (point-min) (point-max) t))))
+            (bom (list (char-after 1) (char-after 2))))
         (cond
+         ((equal bom '(#xFE #xFF))
+          'utf-16be-with-signature)
+         ((equal bom '(#xFF #xFE))
+          'utf-16le-with-signature)
+         ;; Pure ASCII always comes back as undecided.
          ((memq detected '(utf-8 undecided))
           'utf-8)
          ((eq detected 'utf-16le-with-signature) 'utf-16le-with-signature)