(ctext-non-standard-encodings-alist): Change the format.

author Kenichi Handa <handa@m17n.org>

Wed, 3 Dec 2003 08:24:42 +0000 (08:24 +0000)

committer Kenichi Handa <handa@m17n.org>

Wed, 3 Dec 2003 08:24:42 +0000 (08:24 +0000)
author Kenichi Handa <handa@m17n.org>
Wed, 3 Dec 2003 08:24:42 +0000 (08:24 +0000)
committer Kenichi Handa <handa@m17n.org>
Wed, 3 Dec 2003 08:24:42 +0000 (08:24 +0000)
diff --git a/lisp/international/mule.el b/lisp/international/mule.el

index 2c7160b381be9bd2b07d816c136a5e1915bd2604..1674f7bf61acdd04eb84eb25755891c8cfa47948 100644 (file)
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@@ -1330,12 +1330,42 @@ ARG is a list of coding categories ordered by priority."
  ;;; X selections
  
  (defvar ctext-non-standard-encodings-alist
-  '(("ISO8859-15" . iso-8859-15)
-    ("ISO8859-14" . iso-8859-14)
-    ("KOI8-R" . koi8-r)
-    ("BIG5-0" . big5))
-  "Alist of non-standard encoding names vs Emacs coding systems.
-This alist is used to decode an extened segment of a compound text.")
+  '(("big5-0" big5 2 (chinese-big5-1 chinese-big5-2))
+    ("ISO8859-14" iso-8859-14 1 latin-iso8859-14)
+    ("ISO8859-15" iso-8859-15 1 latin-iso8859-15))
+  "Alist of non-standard encoding names vs the corresponding usages in CTEXT.
+
+It controls how extended segments of a compound text are handled
+by the coding system `compound-text-with-extensions'.
+
+Each element has the form (ENCODING-NAME CODING-SYSTEM N-OCTET CHARSET).
+
+ENCODING-NAME is an encoding name of an \"extended segments\".
+
+CODING-SYSTEM is the coding-system to encode (or decode) the
+characters into (or from) the extended segment.
+
+N-OCTET is the number of octets (bytes) that encodes a character
+in the segment.  It can be 0 (meaning the number of octets per
+character is variable), 1, 2, 3, or 4.
+
+CHARSET is a charater set containing characters that are encoded
+in the segment.  It can be a list of character sets.  It can also
+be a char-table, in which case characters that have non-nil value
+in the char-table are the target.
+
+On decoding CTEXT, all encoding names listed here are recognized.
+
+On encoding CTEXT, encoding names in the variable
+`ctext-non-standard-encodings' (which see) and in the information
+listed for the current language environment under the key
+`ctext-non-standard-encodings' are used.")
+
+(defvar ctext-non-standard-encodings
+  '("big5-0")
+  "List of non-standard encoding names used in extended segments of CTEXT.
+Each element must be one of the names listed in the variable
+`ctext-non-standard-encodings-alist' (which see).")
  
  (defvar ctext-non-standard-encodings-regexp
    (string-to-multibyte
@@ -1347,13 +1377,9 @@ This alist is used to decode an extened segment of a compound text.")
      "\\(\e%G[^\e]*\e%@\\)")))
  
  ;; Functions to support "Non-Standard Character Set Encodings" defined
-;; by the COMPOUND-TEXT spec.
-;; We support that by decoding the whole data by `ctext' which just
-;; pertains byte sequences belonging to ``extended segment'', then
-;; decoding those byte sequences one by one in Lisp.
-;; This function also supports "The UTF-8 encoding" described in the
-;; section 7 of the documentation fo COMPOUND-TEXT distributed with
-;; XFree86.
+;; by the COMPOUND-TEXT spec.  They also support "The UTF-8 encoding"
+;; described in the section 7 of the documentation of COMPOUND-TEXT
+;; distributed with XFree86.
  
  (defun ctext-post-read-conversion (len)
    "Decode LEN characters encoded as Compound Text with Extended Segments."
@@ -1365,7 +1391,6 @@ This alist is used to decode an extened segment of a compound text.")
             pos bytes)
         (or in-workbuf
             (narrow-to-region (point) (+ (point) len)))
-       (decode-coding-region (point-min) (point-max) 'ctext)
         (if in-workbuf
             (set-buffer-multibyte t))
         (while (re-search-forward ctext-non-standard-encodings-regexp
@@ -1376,11 +1401,14 @@ This alist is used to decode an extened segment of a compound text.")
               (let* ((M (char-after (+ pos 4)))
                      (L (char-after (+ pos 5)))
                      (encoding (match-string 2))
-                    (coding (or (cdr (assoc-ignore-case 
-                                      encoding
-                                      ctext-non-standard-encodings-alist))
-                                (coding-system-p
-                                 (intern (downcase encoding))))))
+                    (encoding-info (assoc-ignore-case 
+                                    encoding
+                                    ctext-non-standard-encodings-alist))
+                    (coding (if encoding-info
+                                (nth 1 encoding-info)
+                              (setq encoding (intern (downcase encoding)))
+                              (and (coding-system-p encoding)
+                                   encoding))))
                 (setq bytes (- (+ (* (- M 128) 128) (- L 128))
                                (- (point) (+ pos 6))))
                 (when coding
@@ -1388,66 +1416,39 @@ This alist is used to decode an extened segment of a compound text.")
                   (forward-char bytes)
                   (decode-coding-region (- (point) bytes) (point) coding)))
             ;; ESC % G --UTF-8-BYTES-- ESC % @
-           (setq bytes (- (point) pos))
-           (decode-coding-region (- (point) bytes) (point) 'utf-8))))
+           (delete-char -3)
+           (delete-region pos (+ pos 3))
+           (decode-coding-region pos (point) 'utf-8))))
        (goto-char (point-min))
        (- (point-max) (point)))))
  
-;; From X registry 2001/06/01
-;; 20. NON-STANDARD CHARACTER SET ENCODINGS
-
-;; See Section 6 of the Compound Text standard.
-
-;; Name                                                Reference
-;; ----                                                ---------
-;; "DEC.CNS11643.1986-2"                               [53]
-;;     CNS11643 2-plane using the recommended
-;;     internal representation scheme
-;; "DEC.DTSCS.1990-2"                          [54]
-;;     DEC Taiwan Supplemental Character Set
-;; "fujitsu.u90x03"                            [87]
-;; "ILA"                                               [62]
-;;     registry prefix
-;; "IPSYS"                                             [59]
-;;     registry prefix
-;; "omron_UDC"                                 [45]
-;;         omron User Defined Charset
-;; "omron_UDC_ja"                                      [45]
-;;         omron User Defined Charset for Japanese
-;; "omron_UDC_zh"                                      [45]
-;;         omron User Defined Charset for Chinese(Main land)
-;; "omron_UDC_tw"                                      [45]
-;;         omron User Defined Charset for Chinese(Taiwan)
-
-;; If you add charsets here, be sure to modify the regexp used by
-;; ctext-pre-write-conversion to look up non-standard charsets.
-(defvar ctext-non-standard-designations-alist
-  '(("$(0" . (big5 "big5-0" 2))
-    ("$(1" . (big5 "big5-0" 2))
-    ;; The following are actually standard; generating extended
-    ;; segments for them is wrong and screws e.g. Latin-9 users.
-    ;; 8859-{10,13,16} aren't Emacs charsets anyhow.  -- fx
-;;     ("-V"  . (t "iso8859-10" 1))
-;;     ("-Y"  . (t "iso8859-13" 1))
-;;     ("-_"  . (t "iso8859-14" 1))
-;;     ("-b"  . (t "iso8859-15" 1))
-;;     ("-f"  . (t "iso8859-16" 1))
-    )
-  "Alist of ctext control sequences that introduce character sets which
-are not in the list of approved encodings, and the corresponding
-coding system, identifier string, and number of octets per encoded
-character.
-
-Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)).  CTLSEQ
-is the control sequence (sans the leading ESC) that introduces the character
-set in the text encoded by compound-text.  ENCODING is a coding system
-symbol; if it is t, it means that the ctext coding system already encodes
-the text correctly, and only the leading control sequence needs to be altered.
-If ENCODING is a coding system, we need to re-encode the text with that
-coding system.  CHARSET is the name of the charset we need to put into
-the leading control sequence.  NOCTETS is the number of octets (bytes) that
-encode each character in this charset.  NOCTETS can be 0 (meaning the number
-of octets per character is variable), 1, 2, 3, or 4.")
+;; Return a char table of extended segment usage for each character.
+;; Each value of the char table is nil, one of the elements of
+;; `ctext-non-standard-encodings-alist', or the symbol `utf-8'.
+
+(defun ctext-non-standard-encodings-table ()
+  (let ((table (make-char-table 'translation-table)))
+    (aset table (make-char 'mule-unicode-0100-24ff) 'utf-8)
+    (aset table (make-char 'mule-unicode-2500-33ff) 'utf-8)
+    (aset table (make-char 'mule-unicode-e000-ffff) 'utf-8)
+    (dolist (encoding (reverse
+                      (append
+                       (get-language-info current-language-environment
+                                          'ctext-non-standard-encodings)
+                       ctext-non-standard-encodings)))
+      (let* ((slot (assoc encoding ctext-non-standard-encodings-alist))
+            (charset (nth 3 slot)))
+       (if charset
+           (cond ((charsetp charset)
+                  (aset table (make-char charset) slot))
+                 ((listp charset)
+                  (dolist (elt charset)
+                    (aset table (make-char elt) slot)))
+                 ((char-table-p charset)
+                  (map-char-table #'(lambda (k v) 
+                                  (if (and v (> k 128)) (aset table k slot)))
+                                  charset))))))
+    table))
  
  (defun ctext-pre-write-conversion (from to)
    "Encode characters between FROM and TO as Compound Text w/Extended Segments.
@@ -1470,47 +1471,56 @@ text, and convert it in the temporary buffer.  Otherwise, convert in-place."
              (insert-buffer-substring buf from to))))
  
      ;; Now we can encode the whole buffer.
-    (let ((case-fold-search nil)
+    (let ((encoding-table (ctext-non-standard-encodings-table))
           last-coding-system-used
-         pos posend desig encode-info encoding chset noctets textlen)
-      (goto-char (point-min))
-      ;; At first encode the whole buffer.
-      (encode-coding-region (point-min) (point-max) 'ctext-no-compositions)
-      ;; Then replace ISO-2022 charset designations with extended
-      ;; segments, for those charsets that are not part of the
-      ;; official X registry.  The regexp below finds the leading
-      ;; sequences for big5.
-      (while (re-search-forward "\e\\(\$([01]\\)" nil 'move)
-       (setq pos (match-beginning 0)
-             posend (point)
-             desig (match-string 1)
-             encode-info (cdr (assoc desig
-                                     ctext-non-standard-designations-alist))
-             encoding (car encode-info)
-             chset (cadr encode-info)
-             noctets (car (cddr encode-info)))
-       (skip-chars-forward "^\e")
-       (cond
-        ((eq encoding t)  ; only the leading sequence needs to be changed
-         (setq textlen (+ (- (point) posend) (length chset) 1))
-         ;; Generate the control sequence for an extended segment.
-         (replace-match (format "\e%%/%d%c%c%s\ 2"
-                                noctets
-                                (+ (/ textlen 128) 128)
-                                (+ (% textlen 128) 128)
-                                chset)
-                        t t))
-        ((coding-system-p encoding) ; need to recode the entire segment...
-         (decode-coding-region pos (point) 'ctext-no-compositions)
-         (encode-coding-region pos (point) encoding)
-         (setq textlen (+ (- (point) pos) (length chset) 1))
-         (save-excursion
-           (goto-char pos)
-           (insert (format "\e%%/%d%c%c%s\ 2"
-                           noctets
-                           (+ (/ textlen 128) 128)
-                           (+ (% textlen 128) 128)
-                           chset))))))
+         last-pos last-encoding-info
+         encoding-info end-pos)
+      (goto-char (setq last-pos (point-min)))
+      (setq end-pos (point-marker))
+      (while (re-search-forward "[^\000-\177]+" nil t)
+       ;; Found a sequence of non-ASCII characters.
+       (setq last-pos (match-beginning 0)
+             last-encoding-info (aref encoding-table (char-after last-pos)))
+       (set-marker end-pos (match-end 0))
+       (goto-char (1+ last-pos))
+       (catch 'tag
+         (while t
+           (setq encoding-info
+                 (if (< (point) end-pos)
+                     (aref encoding-table (following-char))))
+           (unless (eq last-encoding-info encoding-info)
+             (cond ((consp last-encoding-info)
+                    ;; Encode the previous range using an extended
+                    ;; segment.
+                    (let ((encoding-name (car last-encoding-info))
+                          (coding-system (nth 1 last-encoding-info))
+                          (noctets (nth 2 last-encoding-info))
+                          len)
+                      (encode-coding-region last-pos (point) coding-system)
+                      (setq len (+ (length encoding-name) 1
+                                   (- (point) last-pos)))
+                      (save-excursion
+                        (goto-char last-pos)
+                        (insert (string-to-multibyte 
+                                 (format "\e%%/%d%c%c%s\ 2"
+                                         noctets
+                                         (+ (/ len 128) 128)
+                                         (+ (% len 128) 128)
+                                         encoding-name))))))
+                   ((eq last-encoding-info 'utf-8)
+                    ;; Encode the previous range using UTF-8 encoding
+                    ;; extention.
+                    (encode-coding-region last-pos (point) 'mule-utf-8)
+                    (save-excursion
+                      (goto-char last-pos)
+                      (insert "\e%G"))
+                    (insert "\e%@")))
+             (setq last-pos (point)
+                   last-encoding-info encoding-info))
+           (if (< (point) end-pos)
+               (forward-char 1)
+             (throw 'tag nil)))))
+      (set-marker end-pos nil)
        (goto-char (point-min))))
    ;; Must return nil, as build_annotations_2 expects that.
    nil)
author	Kenichi Handa <handa@m17n.org>
	Wed, 3 Dec 2003 08:24:42 +0000 (08:24 +0000)
committer	Kenichi Handa <handa@m17n.org>
	Wed, 3 Dec 2003 08:24:42 +0000 (08:24 +0000)