From 6b4d96c2f04e5a08c4f9fff144743ff16c151dae Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Wed, 4 Aug 2010 17:06:52 +0900 Subject: [PATCH] Modify the coding system compound-text-with-extensions to conform to the spec of Compound Text. --- lisp/ChangeLog | 17 ++++ lisp/international/mule-conf.el | 14 +-- lisp/international/mule.el | 147 +++++++++++++++++--------------- lisp/language/cyrillic.el | 7 -- 4 files changed, 104 insertions(+), 81 deletions(-) diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 670f07c2683..775ddcdc2e7 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,3 +1,20 @@ +2010-08-04 Kenichi Handa + + * language/cyrillic.el: Don't add "microsoft-cp1251" to + ctext-non-standard-encodings-alist here. + + * international/mule.el (ctext-non-standard-encodings-alist): Add + "koi8-r" and "microsoft-cp1251". + (ctext-standard-encodings): New variable. + (ctext-non-standard-encodings-table): List only elements for + non-standard encodings. + (ctext-pre-write-conversion): Adjusted for the above change. + Check ctext-standard-encodings. + + * international/mule-conf.el (compound-text): Doc fix. + (ctext-no-compositions): Doc fix. + (compound-text-with-extensions): Doc fix. + 2010-07-23 Juanma Barranquero * help-fns.el (find-lisp-object-file-name): Doc fix (bug#6494). diff --git a/lisp/international/mule-conf.el b/lisp/international/mule-conf.el index f53b69eed8b..9ee8d22463a 100644 --- a/lisp/international/mule-conf.el +++ b/lisp/international/mule-conf.el @@ -1410,9 +1410,10 @@ is treated as a character." :flags '(ascii-at-eol ascii-at-cntl designation single-shift composition)) (define-coding-system 'compound-text - "Compound text based generic encoding for decoding unknown messages. - -This coding system does not support extended segments of CTEXT." + "Compound text based generic encoding. +This coding system is an extension of X's \"Compound Text Encoding\". +It encodes many characters using the normal ISO-2022 designation sequences, +but it doesn't support extended segments of CTEXT." :coding-type 'iso-2022 :mnemonic ?x :charset-list 'iso-2022 @@ -1432,7 +1433,7 @@ This coding system does not support extended segments of CTEXT." ;; not have a mime-charset property, to prevent it from showing up ;; close to the beginning of coding systems ordered by priority. (define-coding-system 'ctext-no-compositions - "Compound text based generic encoding for decoding unknown messages. + "Compound text based generic encoding. Like `compound-text', but does not produce escape sequences for compositions." :coding-type 'iso-2022 @@ -1445,8 +1446,9 @@ Like `compound-text', but does not produce escape sequences for compositions." (define-coding-system 'compound-text-with-extensions "Compound text encoding with ICCCM Extended Segment extensions. -See the variable `ctext-non-standard-encodings-alist' for the -detail about how extended segments are handled. +See the variables `ctext-standard-encodings' and +`ctext-non-standard-encodings-alist' for the detail about how +extended segments are handled. This coding system should be used only for X selections. It is inappropriate for decoding and encoding files, process I/O, etc." diff --git a/lisp/international/mule.el b/lisp/international/mule.el index 7e7e55728c8..e030acbef02 100644 --- a/lisp/international/mule.el +++ b/lisp/international/mule.el @@ -1408,7 +1408,9 @@ This function is provided for backward compatibility." '(("big5-0" big5 2 big5) ("ISO8859-14" iso-8859-14 1 latin-iso8859-14) ("ISO8859-15" iso-8859-15 1 latin-iso8859-15) - ("gbk-0" gbk 2 chinese-gbk))) + ("gbk-0" gbk 2 chinese-gbk) + ("koi8-r" koi8-r 1 koi8-r) + ("microsoft-cp1251" windows-1251 1 windows-1251))) "Alist of non-standard encoding names vs the corresponding usages in CTEXT. It controls how extended segments of a compound text are handled @@ -1497,6 +1499,20 @@ Each element must be one of the names listed in the variable (goto-char (point-min)) (- (point-max) (point))))) +(defvar ctext-standard-encodings + '(ascii latin-jisx0201 katakana-jisx0201 + latin-iso8859-1 latin-iso8859-2 latin-iso8859-3 latin-iso8859-4 + greek-iso8859-7 arabic-iso8859-6 hebrew-iso8859-8 cyrillic-iso8859-5 + latin-iso8859-9 + chinese-gb2312 japanese-jisx0208 korean-ksc5601) + "List of approved standard encodings (i.e. charsets) of X's Compound Text. +Coding-system `compound-text-with-extensions' encodes a character +belonging to any of those charsets using the normal ISO2022 +designation sequence unless the current language environment or +the variable `ctext-non-standard-encodings' decide to use an extended +segment of CTEXT for that character. See also the documentation +of `ctext-non-standard-encodings-alist'.") + ;; Return an alist of CHARSET vs CTEXT-USAGE-INFO generated from ;; `ctext-non-standard-encodings' and a list specified by the key ;; `ctext-non-standard-encodings' for the currrent language @@ -1508,77 +1524,74 @@ Each element must be one of the names listed in the variable ;; is encoded using UTF-8 encoding extention. (defun ctext-non-standard-encodings-table () - (let (table) - ;; Setup charsets specified by the key - ;; `ctext-non-standard-encodings' for the current language - ;; environment and in `ctext-non-standard-encodings'. - (dolist (encoding (append - (get-language-info current-language-environment - 'ctext-non-standard-encodings) - ctext-non-standard-encodings)) - (let* ((slot (assoc encoding ctext-non-standard-encodings-alist)) + (let* ((table (append ctext-non-standard-encodings + (copy-sequence + (get-language-info current-language-environment + 'ctext-non-standard-encodings)))) + (tail table) + elt) + (while tail + (setq elt (car tail)) + (let* ((slot (assoc elt ctext-non-standard-encodings-alist)) (charset (nth 3 slot))) (if (charsetp charset) - (push (cons charset slot) table) - (dolist (cs charset) - (push (cons cs slot) table))))) - - ;; Next prepend charsets for ISO2022 designation sequence. - (dolist (charset charset-list) - (let ((final (plist-get (charset-plist charset) :iso-final-char))) - (if (and (integerp final) - (>= final #x40) (<= final #x7e) - ;; Exclude ascii and chinese-cns11643-X. - (not (eq charset 'ascii)) - (not (string-match "cns11643" (symbol-name charset)))) - (push (cons charset nil) table)))) - - ;; Returned reversed list so that the charsets specified by the - ;; key `ctext-non-standard-encodings' for the current language - ;; have the highest priority. - (nreverse table))) + (setcar tail (cons charset slot)) + (setcar tail (cons (car charset) slot)) + (dolist (cs (cdr charset)) + (setcdr tail + (cons (cons (car cs) slot) (cdr tail))) + (setq tail (cdr tail)))) + (setq tail (cdr tail)))) + table)) (defun ctext-pre-write-conversion (from to) "Encode characters between FROM and TO as Compound Text w/Extended Segments. -If FROM is a string, or if the current buffer is not the one set up for us -by `encode-coding-string', generate a new temp buffer, insert the text, -and convert it in the temporary buffer. Otherwise, convert in-place." +If FROM is a string, generate a new temp buffer, insert the text, +and convert it in the temporary buffer. Otherwise, convert +in-place." (save-match-data ;; Setup a working buffer if necessary. (when (stringp from) (set-buffer (generate-new-buffer " *temp")) (set-buffer-multibyte (multibyte-string-p from)) - (insert from)) - - ;; Now we can encode the whole buffer. - (let ((encoding-table (ctext-non-standard-encodings-table)) - last-coding-system-used - last-pos last-encoding-info - encoding-info end-pos ch) - (goto-char (setq last-pos (point-min))) - (setq end-pos (point-marker)) - (while (re-search-forward "[^\000-\177]+" nil t) - ;; Found a sequence of non-ASCII characters. - (setq last-pos (match-beginning 0) - ch (char-after last-pos) - last-encoding-info (catch 'tag - (dolist (elt encoding-table) - (if (encode-char ch (car elt)) - (throw 'tag (cdr elt)))) - 'utf-8)) - (set-marker end-pos (match-end 0)) - (goto-char (1+ last-pos)) - (catch 'tag - (while t - (setq encoding-info - (if (< (point) end-pos) - (catch 'tag - (setq ch (following-char)) - (dolist (elt encoding-table) - (if (encode-char ch (car elt)) - (throw 'tag (cdr elt)))) - 'utf-8))) + (insert from) + (setq from 1 to (point-max))) + (save-restriction + (narrow-to-region from to) + (let ((encoding-table (ctext-non-standard-encodings-table)) + (charset-list ctext-standard-encodings) + last-coding-system-used + last-pos last-encoding-info + encoding-info end-pos ch charset) + (dolist (elt encoding-table) + (push (car elt) charset-list)) + (goto-char (setq last-pos from)) + (setq end-pos (point-marker)) + (while (re-search-forward "[^\000-\177]+" nil t) + ;; Found a sequence of non-ASCII characters. + (setq last-pos (match-beginning 0) + ch (char-after last-pos) + charset (char-charset ch charset-list) + last-encoding-info + (if charset + (or (cdr (assq charset encoding-table)) + charset) + 'utf-8)) + (set-marker end-pos (match-end 0)) + (goto-char (1+ last-pos)) + (while (marker-position end-pos) + (if (< (point) end-pos) + (progn + (setq charset (char-charset (following-char) charset-list) + encoding-info + (if charset + (or (cdr (assq charset encoding-table)) + charset) + 'utf-8)) + (forward-char 1)) + (setq encoding-info nil) + (set-marker end-pos nil)) (unless (eq last-encoding-info encoding-info) (cond ((consp last-encoding-info) ;; Encode the previous range using an extended @@ -1609,14 +1622,12 @@ and convert it in the temporary buffer. Otherwise, convert in-place." (save-excursion (goto-char last-pos) (insert "\e%G")) - (insert "\e%@"))) + (insert "\e%@")) + (t + (put-text-property last-pos (point) 'charset charset))) (setq last-pos (point) - last-encoding-info encoding-info)) - (if (< (point) end-pos) - (forward-char 1) - (throw 'tag nil))))) - (set-marker end-pos nil) - (goto-char (point-min)))) + last-encoding-info encoding-info)))) + (goto-char (point-min))))) ;; Must return nil, as build_annotations_2 expects that. nil) diff --git a/lisp/language/cyrillic.el b/lisp/language/cyrillic.el index 7d2f082579f..b293ad1ff0b 100644 --- a/lisp/language/cyrillic.el +++ b/lisp/language/cyrillic.el @@ -239,13 +239,6 @@ Support for Russian using koi8-r and the russian-computer input method.") (documentation . "Support for Tajik using KOI8-T.")) '("Cyrillic")) -(let ((elt `("microsoft-cp1251" windows-1251 1 - ,(get 'encode-windows-1251 'translation-table))) - (slot (assoc "microsoft-cp1251" ctext-non-standard-encodings-alist))) - (if slot - (setcdr slot (cdr elt)) - (push elt ctext-non-standard-encodings-alist))) - (set-language-info-alist "Bulgarian" `((coding-system windows-1251) (coding-priority windows-1251) -- 2.39.2