From 6c805023c6310ab97c05478d1e0ad8119e3d9763 Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Wed, 3 Dec 2003 08:24:42 +0000 Subject: [PATCH] (ctext-non-standard-encodings-alist): Change the format. (ctext-non-standard-encodings): New variable. (ctext-post-read-conversion): Fully re-written. (ctext-non-standard-designations-alist): Delete it. (ctext-non-standard-encodings-table): New function. (ctext-pre-write-conversion): Fully re-written. --- lisp/international/mule.el | 242 +++++++++++++++++++------------------ 1 file changed, 126 insertions(+), 116 deletions(-) diff --git a/lisp/international/mule.el b/lisp/international/mule.el index 2c7160b381b..1674f7bf61a 100644 --- a/lisp/international/mule.el +++ b/lisp/international/mule.el @@ -1330,12 +1330,42 @@ ARG is a list of coding categories ordered by priority." ;;; X selections (defvar ctext-non-standard-encodings-alist - '(("ISO8859-15" . iso-8859-15) - ("ISO8859-14" . iso-8859-14) - ("KOI8-R" . koi8-r) - ("BIG5-0" . big5)) - "Alist of non-standard encoding names vs Emacs coding systems. -This alist is used to decode an extened segment of a compound text.") + '(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2)) + ("ISO8859-14" iso-8859-14 1 latin-iso8859-14) + ("ISO8859-15" iso-8859-15 1 latin-iso8859-15)) + "Alist of non-standard encoding names vs the corresponding usages in CTEXT. + +It controls how extended segments of a compound text are handled +by the coding system `compound-text-with-extensions'. + +Each element has the form (ENCODING-NAME CODING-SYSTEM N-OCTET CHARSET). + +ENCODING-NAME is an encoding name of an \"extended segments\". + +CODING-SYSTEM is the coding-system to encode (or decode) the +characters into (or from) the extended segment. + +N-OCTET is the number of octets (bytes) that encodes a character +in the segment. It can be 0 (meaning the number of octets per +character is variable), 1, 2, 3, or 4. + +CHARSET is a charater set containing characters that are encoded +in the segment. It can be a list of character sets. It can also +be a char-table, in which case characters that have non-nil value +in the char-table are the target. + +On decoding CTEXT, all encoding names listed here are recognized. + +On encoding CTEXT, encoding names in the variable +`ctext-non-standard-encodings' (which see) and in the information +listed for the current language environment under the key +`ctext-non-standard-encodings' are used.") + +(defvar ctext-non-standard-encodings + '("big5-0") + "List of non-standard encoding names used in extended segments of CTEXT. +Each element must be one of the names listed in the variable +`ctext-non-standard-encodings-alist' (which see).") (defvar ctext-non-standard-encodings-regexp (string-to-multibyte @@ -1347,13 +1377,9 @@ This alist is used to decode an extened segment of a compound text.") "\\(\e%G[^\e]*\e%@\\)"))) ;; Functions to support "Non-Standard Character Set Encodings" defined -;; by the COMPOUND-TEXT spec. -;; We support that by decoding the whole data by `ctext' which just -;; pertains byte sequences belonging to ``extended segment'', then -;; decoding those byte sequences one by one in Lisp. -;; This function also supports "The UTF-8 encoding" described in the -;; section 7 of the documentation fo COMPOUND-TEXT distributed with -;; XFree86. +;; by the COMPOUND-TEXT spec. They also support "The UTF-8 encoding" +;; described in the section 7 of the documentation of COMPOUND-TEXT +;; distributed with XFree86. (defun ctext-post-read-conversion (len) "Decode LEN characters encoded as Compound Text with Extended Segments." @@ -1365,7 +1391,6 @@ This alist is used to decode an extened segment of a compound text.") pos bytes) (or in-workbuf (narrow-to-region (point) (+ (point) len))) - (decode-coding-region (point-min) (point-max) 'ctext) (if in-workbuf (set-buffer-multibyte t)) (while (re-search-forward ctext-non-standard-encodings-regexp @@ -1376,11 +1401,14 @@ This alist is used to decode an extened segment of a compound text.") (let* ((M (char-after (+ pos 4))) (L (char-after (+ pos 5))) (encoding (match-string 2)) - (coding (or (cdr (assoc-ignore-case - encoding - ctext-non-standard-encodings-alist)) - (coding-system-p - (intern (downcase encoding)))))) + (encoding-info (assoc-ignore-case + encoding + ctext-non-standard-encodings-alist)) + (coding (if encoding-info + (nth 1 encoding-info) + (setq encoding (intern (downcase encoding))) + (and (coding-system-p encoding) + encoding)))) (setq bytes (- (+ (* (- M 128) 128) (- L 128)) (- (point) (+ pos 6)))) (when coding @@ -1388,66 +1416,39 @@ This alist is used to decode an extened segment of a compound text.") (forward-char bytes) (decode-coding-region (- (point) bytes) (point) coding))) ;; ESC % G --UTF-8-BYTES-- ESC % @ - (setq bytes (- (point) pos)) - (decode-coding-region (- (point) bytes) (point) 'utf-8)))) + (delete-char -3) + (delete-region pos (+ pos 3)) + (decode-coding-region pos (point) 'utf-8)))) (goto-char (point-min)) (- (point-max) (point))))) -;; From X registry 2001/06/01 -;; 20. NON-STANDARD CHARACTER SET ENCODINGS - -;; See Section 6 of the Compound Text standard. - -;; Name Reference -;; ---- --------- -;; "DEC.CNS11643.1986-2" [53] -;; CNS11643 2-plane using the recommended -;; internal representation scheme -;; "DEC.DTSCS.1990-2" [54] -;; DEC Taiwan Supplemental Character Set -;; "fujitsu.u90x03" [87] -;; "ILA" [62] -;; registry prefix -;; "IPSYS" [59] -;; registry prefix -;; "omron_UDC" [45] -;; omron User Defined Charset -;; "omron_UDC_ja" [45] -;; omron User Defined Charset for Japanese -;; "omron_UDC_zh" [45] -;; omron User Defined Charset for Chinese(Main land) -;; "omron_UDC_tw" [45] -;; omron User Defined Charset for Chinese(Taiwan) - -;; If you add charsets here, be sure to modify the regexp used by -;; ctext-pre-write-conversion to look up non-standard charsets. -(defvar ctext-non-standard-designations-alist - '(("$(0" . (big5 "big5-0" 2)) - ("$(1" . (big5 "big5-0" 2)) - ;; The following are actually standard; generating extended - ;; segments for them is wrong and screws e.g. Latin-9 users. - ;; 8859-{10,13,16} aren't Emacs charsets anyhow. -- fx -;; ("-V" . (t "iso8859-10" 1)) -;; ("-Y" . (t "iso8859-13" 1)) -;; ("-_" . (t "iso8859-14" 1)) -;; ("-b" . (t "iso8859-15" 1)) -;; ("-f" . (t "iso8859-16" 1)) - ) - "Alist of ctext control sequences that introduce character sets which -are not in the list of approved encodings, and the corresponding -coding system, identifier string, and number of octets per encoded -character. - -Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)). CTLSEQ -is the control sequence (sans the leading ESC) that introduces the character -set in the text encoded by compound-text. ENCODING is a coding system -symbol; if it is t, it means that the ctext coding system already encodes -the text correctly, and only the leading control sequence needs to be altered. -If ENCODING is a coding system, we need to re-encode the text with that -coding system. CHARSET is the name of the charset we need to put into -the leading control sequence. NOCTETS is the number of octets (bytes) that -encode each character in this charset. NOCTETS can be 0 (meaning the number -of octets per character is variable), 1, 2, 3, or 4.") +;; Return a char table of extended segment usage for each character. +;; Each value of the char table is nil, one of the elements of +;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'. + +(defun ctext-non-standard-encodings-table () + (let ((table (make-char-table 'translation-table))) + (aset table (make-char 'mule-unicode-0100-24ff) 'utf-8) + (aset table (make-char 'mule-unicode-2500-33ff) 'utf-8) + (aset table (make-char 'mule-unicode-e000-ffff) 'utf-8) + (dolist (encoding (reverse + (append + (get-language-info current-language-environment + 'ctext-non-standard-encodings) + ctext-non-standard-encodings))) + (let* ((slot (assoc encoding ctext-non-standard-encodings-alist)) + (charset (nth 3 slot))) + (if charset + (cond ((charsetp charset) + (aset table (make-char charset) slot)) + ((listp charset) + (dolist (elt charset) + (aset table (make-char elt) slot))) + ((char-table-p charset) + (map-char-table #'(lambda (k v) + (if (and v (> k 128)) (aset table k slot))) + charset)))))) + table)) (defun ctext-pre-write-conversion (from to) "Encode characters between FROM and TO as Compound Text w/Extended Segments. @@ -1470,47 +1471,56 @@ text, and convert it in the temporary buffer. Otherwise, convert in-place." (insert-buffer-substring buf from to)))) ;; Now we can encode the whole buffer. - (let ((case-fold-search nil) + (let ((encoding-table (ctext-non-standard-encodings-table)) last-coding-system-used - pos posend desig encode-info encoding chset noctets textlen) - (goto-char (point-min)) - ;; At first encode the whole buffer. - (encode-coding-region (point-min) (point-max) 'ctext-no-compositions) - ;; Then replace ISO-2022 charset designations with extended - ;; segments, for those charsets that are not part of the - ;; official X registry. The regexp below finds the leading - ;; sequences for big5. - (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) - (setq pos (match-beginning 0) - posend (point) - desig (match-string 1) - encode-info (cdr (assoc desig - ctext-non-standard-designations-alist)) - encoding (car encode-info) - chset (cadr encode-info) - noctets (car (cddr encode-info))) - (skip-chars-forward "^\e") - (cond - ((eq encoding t) ; only the leading sequence needs to be changed - (setq textlen (+ (- (point) posend) (length chset) 1)) - ;; Generate the control sequence for an extended segment. - (replace-match (format "\e%%/%d%c%c%s" - noctets - (+ (/ textlen 128) 128) - (+ (% textlen 128) 128) - chset) - t t)) - ((coding-system-p encoding) ; need to recode the entire segment... - (decode-coding-region pos (point) 'ctext-no-compositions) - (encode-coding-region pos (point) encoding) - (setq textlen (+ (- (point) pos) (length chset) 1)) - (save-excursion - (goto-char pos) - (insert (format "\e%%/%d%c%c%s" - noctets - (+ (/ textlen 128) 128) - (+ (% textlen 128) 128) - chset)))))) + last-pos last-encoding-info + encoding-info end-pos) + (goto-char (setq last-pos (point-min))) + (setq end-pos (point-marker)) + (while (re-search-forward "[^\000-\177]+" nil t) + ;; Found a sequence of non-ASCII characters. + (setq last-pos (match-beginning 0) + last-encoding-info (aref encoding-table (char-after last-pos))) + (set-marker end-pos (match-end 0)) + (goto-char (1+ last-pos)) + (catch 'tag + (while t + (setq encoding-info + (if (< (point) end-pos) + (aref encoding-table (following-char)))) + (unless (eq last-encoding-info encoding-info) + (cond ((consp last-encoding-info) + ;; Encode the previous range using an extended + ;; segment. + (let ((encoding-name (car last-encoding-info)) + (coding-system (nth 1 last-encoding-info)) + (noctets (nth 2 last-encoding-info)) + len) + (encode-coding-region last-pos (point) coding-system) + (setq len (+ (length encoding-name) 1 + (- (point) last-pos))) + (save-excursion + (goto-char last-pos) + (insert (string-to-multibyte + (format "\e%%/%d%c%c%s" + noctets + (+ (/ len 128) 128) + (+ (% len 128) 128) + encoding-name)))))) + ((eq last-encoding-info 'utf-8) + ;; Encode the previous range using UTF-8 encoding + ;; extention. + (encode-coding-region last-pos (point) 'mule-utf-8) + (save-excursion + (goto-char last-pos) + (insert "\e%G")) + (insert "\e%@"))) + (setq last-pos (point) + last-encoding-info encoding-info)) + (if (< (point) end-pos) + (forward-char 1) + (throw 'tag nil))))) + (set-marker end-pos nil) (goto-char (point-min)))) ;; Must return nil, as build_annotations_2 expects that. nil) -- 2.39.5