From c657861758d1fd5b70dac9869336d33f9b36a609 Mon Sep 17 00:00:00 2001 From: Jason Rumney Date: Mon, 18 Feb 2008 01:45:54 +0000 Subject: [PATCH] * international/mule.el (sgml-xml-auto-coding-function): Detect and warn if file encoding is not utf-8 and encoding not specified. (xml-find-file-coding-system): New function. * international/mule-conf.el (file-coding-system-alist): Use it. --- lisp/ChangeLog | 7 ++++++ lisp/international/mule-conf.el | 6 +---- lisp/international/mule.el | 41 ++++++++++++++++++++++++++++++++- 3 files changed, 48 insertions(+), 6 deletions(-) diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 7a236b3f620..fc62689b43a 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,3 +1,10 @@ +2008-02-18 Jason Rumney + + * international/mule.el (sgml-xml-auto-coding-function): Detect + and warn if file encoding is not utf-8 and encoding not specified. + (xml-find-file-coding-system): New function. + * international/mule-conf.el (file-coding-system-alist): Use it. + 2008-02-17 Glenn Morris * international/mule-cmds.el (set-locale-environment): Pass diff --git a/lisp/international/mule-conf.el b/lisp/international/mule-conf.el index 1184612ed40..cd3b5a352fd 100644 --- a/lisp/international/mule-conf.el +++ b/lisp/international/mule-conf.el @@ -1470,11 +1470,7 @@ for decoding and encoding files, process I/O, etc." (setq file-coding-system-alist '(("\\.elc\\'" . utf-8-emacs) ("\\.utf\\(-8\\)?\\'" . utf-8) - ;; This is the defined default for XML documents. It may be - ;; overridden by a charset specification in the header. That - ;; should be grokked by the auto-coding mechanism, but rms - ;; vetoed that. -- fx - ("\\.xml\\'" . utf-8) + ("\\.xml\\'" . xml-find-file-coding-system) ;; We use raw-text for reading loaddefs.el so that if it ;; happens to have DOS or Mac EOLs, they are converted to ;; newlines. This is required to make the special treatment diff --git a/lisp/international/mule.el b/lisp/international/mule.el index c1723523b28..7952c7a6878 100644 --- a/lisp/international/mule.el +++ b/lisp/international/mule.el @@ -2288,7 +2288,22 @@ This function is intended to be added to `auto-coding-functions'." sym (message "Warning: unknown coding system \"%s\"" match) nil)) - 'utf-8))))) + ;; Files without an encoding tag should be UTF-8. But users + ;; may be naive about encodings, and have saved the file from + ;; another editor that does not help them get the encoding right. + ;; Detect the encoding and warn the user if it is detected as + ;; something other than UTF-8. + (let ((detected + (with-coding-priority '(utf-8) + (coding-system-base + (detect-coding-region (point-min) size t))))) + ;; Pure ASCII always comes back as undecided. + (if (memq detected '(utf-8 undecided)) + 'utf-8 + (warn "File contents detected as %s. + Consider adding an encoding attribute to the xml declaration, + or saving as utf-8, as mandated by the xml specification." detected) + detected))))))) (defun sgml-html-meta-auto-coding-function (size) "If the buffer has an HTML meta tag, use it to determine encoding. @@ -2314,6 +2329,30 @@ This function is intended to be added to `auto-coding-functions'." (message "Warning: unknown coding system \"%s\"" match) nil))))) +(defun xml-find-file-coding-system (args) + "Determine the coding system of an XML file without a declaration. +Strictly speaking, the file should be utf-8, but mistakes are +made, and there are genuine cases where XML fragments are saved, +with the encoding properly specified in a master document, or +added by processing software." + (if (eq (car args) 'insert-file-contents) + (let ((detected + (with-coding-priority '(utf-8) + (coding-system-base + (detect-coding-region (point-min) (point-max) t))))) + ;; Pure ASCII always comes back as undecided. + (if (memq detected '(utf-8 undecided)) + 'utf-8 + (warn "File contents detected as %s. + Consider adding an xml declaration with the encoding specified, + or saving as utf-8, as mandated by the xml specification." detected) + detected)) + ;; Don't interfere with the user's wishes for saving the buffer. + ;; We did what we could when the buffer was created to ensure the + ;; correct encoding was used, or the user was warned, so any + ;; non-conformity here is deliberate on the part of the user. + 'undecided)) + ;;; (provide 'mule) -- 2.39.5