(ucs-mule-to-mule-unicode): Don't define

author Kenichi Handa <handa@m17n.org>

Mon, 30 Sep 2002 06:35:13 +0000 (06:35 +0000)

committer Kenichi Handa <handa@m17n.org>

Mon, 30 Sep 2002 06:35:13 +0000 (06:35 +0000)
author Kenichi Handa <handa@m17n.org>
Mon, 30 Sep 2002 06:35:13 +0000 (06:35 +0000)
committer Kenichi Handa <handa@m17n.org>
Mon, 30 Sep 2002 06:35:13 +0000 (06:35 +0000)
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el

index e201c025ade57a3eaa35e557df19681c1394cedc..fcc35243231fb96df380aa4aac7c8bb522c51201 100644 (file)
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -46,12 +46,13 @@
  ;; Fixme: note that reading and writing invalid utf-8 may not be
  ;; idempotent -- to represent the bytes to fix that needs a new charset.
  ;;
-;; Characters from other character sets can be encoded with
-;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
-;; registering the translation with `register-char-codings'.  Hash
-;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
-;; support encoding and decoding of about a quarter of the CJK space
-;; between U+3400 and U+DFFF.
+;; Characters from other character sets can be encoded with mule-utf-8
+;; by populating the translation-table
+;; `utf-translation-table-for-encode' and registering the translation
+;; with `register-char-codings'.  Hash tables
+;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
+;; used to support encoding and decoding of about a quarter of the CJK
+;; space between U+3400 and U+DFFF.
  
  ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  
@@ -64,34 +65,58 @@
  
  ;;; Code:
  
-(defvar ucs-mule-to-mule-unicode (make-translation-table)
-  "Translation table for encoding to `mule-utf-8'.")
-(define-translation-table 'ucs-mule-to-mule-unicode
-  ucs-mule-to-mule-unicode)
+(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
+  "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
  
-(defvar utf-8-subst-table (make-hash-table :test 'eq))
-(defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
-(define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
-(define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
+If `unify-8859-on-encoding-mode' is non-nil, this table populates the
+translation-table named `utf-translation-table-for-encode'.")
+
+(define-translation-table 'utf-translation-table-for-encode)
  
-(defvar utf-8-translation-table-for-decode (make-translation-table)
-  "Translation table applied after decoding utf-8 to mule-unicode.
-This is only actually applied to characters which would normally be
-decoded into mule-unicode-0100-24ff.")
-(define-translation-table 'utf-8-translation-table-for-decode
-  utf-8-translation-table-for-decode)
  
  ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
  ;; space of mule-unicode.  For Latin scripts this isn't very
  ;; important.  Hebrew and Arabic might go here too when there's proper
  ;; support for them.
-(defvar utf-8-fragmentation-table (make-translation-table)
-  "Char table normally mapping non-Latin mule-unicode-... characters to iso8859.
-Used as the value of `utf-8-translation-table-for-decode' in
-`utf-8-fragment-on-decoding' mode.")
+
+(defvar utf-fragmentation-table (make-char-table 'translation-table nil)
+  "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
+
+If `utf-fragment-on-decoding' is non-nil, this table populates the
+translation-table named `utf-translation-table-for-decode'.")
+
+(defvar utf-defragmentation-table (make-char-table 'translation-table nil)
+  "Char-table for reverse mapping of `utf-fragmentation-table'.
+
+If `utf-fragment-on-decoding' is non-nil and
+`unify-8859-on-encoding-mode' is nil, this table populates the
+translation-table named `utf-translation-table-for-encode'.")
+
+(define-translation-table 'utf-translation-table-for-decode)
+
+
+(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
+  "Hash table mapping Emacs CJK character sets to Unicode code points.
+
+If `utf-translate-cjk' is non-nil, this table populates the
+translation-hash-table named `utf-subst-table-for-encode'.")
+
+(define-translation-hash-table 'utf-subst-table-for-encode 
+  (make-hash-table :test 'eq))
+
+(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
+  "Hash table mapping Unicode code points to Emacs CJK character sets.
+
+If `utf-translate-cjk' is non-nil, this table populates the
+translation-hash-table named `utf-subst-table-for-decode'.")
+
+(define-translation-hash-table 'utf-subst-table-for-decode
+  (make-hash-table :test 'eq))
+
  (mapc
   (lambda (pair)
-   (aset utf-8-fragmentation-table (car pair) (cdr pair)))
+   (aset utf-fragmentation-table (car pair) (cdr pair))
+   (aset utf-defragmentation-table (cdr pair) (car pair)))
   '((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B)
     (?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B)
     (?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B)
@@ -128,8 +153,9 @@ Used as the value of `utf-8-translation-table-for-decode' in
     (?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B)
     (?\e$,1(|\e(B . ?\e,L|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\7f\e(B . ?\e,L\7f\e(B)))
  
-(defcustom utf-8-fragment-on-decoding nil
-  "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets.
+
+(defcustom utf-fragment-on-decoding nil
+  "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
  Setting this means that the relevant Cyrillic and Greek characters are
  decoded into the iso8859 charsets rather than into
  mule-unicode-0100-24ff.  The iso8859 charsets take half as much space
@@ -140,40 +166,81 @@ for mechanisms to make this largely transparent.
  
  Setting this variable outside customize has no effect."
    :set (lambda (s v)
-        (setq utf-8-translation-table-for-decode
-              (if v
-                  utf-8-fragmentation-table
-                (make-translation-table)))
-        (define-translation-table 'utf-8-translation-table-for-decode
-          utf-8-translation-table-for-decode)
+        (if v
+            (progn
+              (define-translation-table 'utf-translation-table-for-decode
+                utf-fragmentation-table)
+              ;; Even if unify-8859-on-encoding-mode is off, make
+              ;; mule-utf-* encode characters in
+              ;; utf-fragmentation-table.
+              (unless (eq (get 'utf-translation-table-for-encode
+                               'translation-table)
+                          ucs-mule-to-mule-unicode)
+                (define-translation-table 'utf-translation-table-for-encode
+                  utf-defragmentation-table)
+                (dolist (coding '(mule-utf-8 mule-utf-16-be mule-utf-16-le))
+                  (register-char-codings coding utf-defragmentation-table))))
+          (define-translation-table 'utf-translation-table-for-decode)
+          ;; When unify-8859-on-encoding-mode is off, be sure to make
+          ;; mule-utf-* disabled for characters in
+          ;; utf-fragmentation-table.
+          (unless (eq (get 'utf-translation-table-for-encode
+                           'translation-table)
+                      ucs-mule-to-mule-unicode)
+            (define-translation-table 'utf-translation-table-for-encode)
+            (map-char-table
+             (lambda (key val)
+               (if (and (>= key 128) val)
+                   (aset char-coding-system-table key
+                         (delq 'mule-utf-8
+                               (delq 'mule-utf-16-le
+                                     (delq 'mule-utf-16-be
+                                           (aref char-coding-system-table
+                                                 key)))))))
+             utf-defragmentation-table)))
          (set-default s v))
    :version "21.4"
    :type 'boolean
    :group 'mule)
  
-(defcustom utf-8-translate-cjk nil
-  "Whether the `mule-utf-8' coding system should encode many CJK characters.
+(defcustom utf-translate-cjk nil
+  "Whether the UTF based coding systems should decode/encode CJK characters.
  
-Enabling this loads tables which enable the coding system to encode
-characters in the charsets `korean-ksc5601', `chinese-gb2312' and
+Enabling this loads tables which enable the coding systems:
+    mule-utf-8, mule-utf-16-le, mule-utf-16-be
+to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and
  `japanese-jisx0208', and to decode the corresponding unicodes into
  such characters.  This works by loading the library `utf-8-subst'; see
  its commentary.  The tables are fairly large (about 33000 entries), so this
  option is not the default."
    :link '(emacs-commentary-link "utf-8-subst")
    :set (lambda (s v)
-        (when v
-          (require 'utf-8-subst)
-          (let ((table (make-char-table 'translation-table)))
-            (coding-system-put 'mule-utf-8 'safe-charsets
-                               (append (coding-system-get 'mule-utf-8
-                                                          'safe-charsets)
-                                       '(korean-ksc5601 chinese-gb2312
-                                                        japanese-jisx0208)))
-            (maphash (lambda (k v)
-                       (aset table k v))
-                     utf-8-subst-rev-table)
-            (register-char-codings 'mule-utf-8 table)))
+        (if v
+            (progn
+              (require 'utf-8-subst)
+              (let ((table (make-char-table 'translation-table)))
+                (maphash (lambda (k v)
+                           (aset table k t))
+                         ucs-mule-cjk-to-unicode)
+                (register-char-codings 'mule-utf-8 table)
+                (register-char-codings 'mule-utf-16-le table)
+                (register-char-codings 'mule-utf-16-be table))
+              (define-translation-hash-table 'utf-subst-table-for-decode
+                ucs-unicode-to-mule-cjk)
+              (define-translation-hash-table 'utf-subst-table-for-encode
+                ucs-mule-cjk-to-unicode))
+          (map-char-table
+           (lambda (k v)
+             (if (gethash k ucs-mule-cjk-to-unicode)
+                 (aset char-coding-system-table k
+                       (delq 'mule-utf-8
+                             (delq 'mule-utf-16-le
+                                   (delq 'mule-utf-16-be v))))))
+           char-coding-system-table)
+          (define-translation-hash-table 'utf-subst-table-for-decode
+            (make-hash-table :test 'eq))
+          (define-translation-hash-table 'utf-subst-table-for-encode
+            (make-hash-table :test 'eq)))
          (set-default s v))
    :version "21.4"
    :type 'boolean
@@ -263,7 +330,7 @@ option is not the default."
                          (r1 %= 96)
                          (r1 += (r2 + 32))
                          (translate-character
-                         utf-8-translation-table-for-decode r0 r1)
+                         utf-translation-table-for-decode r0 r1)
                          (write-multibyte-character r0 r1))))))))
  
             ;; 3byte encoding
@@ -308,14 +375,15 @@ option is not the default."
                          (r1 = (r7 + 32))
                          (r1 += ((r3 + 32) << 7))
                          (translate-character
-                         utf-8-translation-table-for-decode r0 r1)
+                         utf-translation-table-for-decode r0 r1)
                          (write-multibyte-character r0 r1))
                     
                       ;; mule-unicode-2500-33ff
                       ;; Fixme: Perhaps allow translation via
-                     ;; utf-8-subst-table for #x2e80 up, so that we use
-                     ;; consistent charsets for all of CJK.  Would need
-                     ;; corresponding change to encoding tables.
+                     ;; utf-subst-table-for-decode for #x2e80 up, so
+                     ;; that we use consistent charsets for all of
+                     ;; CJK.  Would need corresponding change to
+                     ;; encoding tables.
                       (if (r3 < #x3400)
                           ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                            (r3 -= #x2500)
@@ -329,7 +397,7 @@ option is not the default."
                         ;; them as eight-bit-{control|graphic}.
                         (if (r3 < #xd800)
                             ((r4 = r3)  ; don't zap r3
-                            (lookup-integer utf-8-subst-table r4 r5)
+                            (lookup-integer utf-subst-table-for-decode r4 r5)
                              (if r7
                                  ;; got a translation
                                  ((write-multibyte-character r4 r5)
@@ -370,7 +438,7 @@ option is not the default."
               (if (r0 < #xfe)
                   ;; 4byte encoding
                   ;; keep those bytes as eight-bit-{control|graphic}
-                 ;; Fixme: allow lookup in utf-8-subst-table.
+                 ;; Fixme: allow lookup in utf-subst-table-for-decode.
                   ((read r1 r2 r3)
                    ;; r0 > #xf0, thus eight-bit-graphic
                    (write-multibyte-character r6 r0)
@@ -409,8 +477,8 @@ option is not the default."
  
    "CCL program to decode UTF-8.
  Basic decoding is done into the charsets ascii, latin-iso8859-1 and
-mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
-`utf-8-subst-table'.
+mule-unicode-*, but see also `utf-fragmentation-table' and
+`ucs-unicode-to-mule-cjk'.
  Encodings of un-representable Unicode characters are decoded asis into
  eight-bit-control and eight-bit-graphic characters.")
  
@@ -421,7 +489,7 @@ eight-bit-control and eight-bit-graphic characters.")
        (if (r5 < 0)
           ((r1 = -1)
            (read-multibyte-character r0 r1)
-          (translate-character ucs-mule-to-mule-unicode r0 r1))
+          (translate-character utf-translation-table-for-encode r0 r1))
         (;; We have already done read-multibyte-character.
          (r0 = r5)
          (r1 = r6)
@@ -516,7 +584,7 @@ eight-bit-control and eight-bit-graphic characters.")
                                 ((write #xc2)
                                  (write r1)))))))
  
-                   ((lookup-character utf-8-subst-rev-table r0 r1)
+                   ((lookup-character utf-subst-table-for-encode r0 r1)
                      (if r7             ; lookup succeeded
                          ((r1 = (((r0 & #xf000) >> 12) | #xe0))
                           (r2 = ((r0 & #x3f) | #x80))
@@ -538,10 +606,6 @@ eight-bit-control and eight-bit-graphic characters.")
  
    "CCL program to encode into UTF-8.")
  
-;; Dummy definition so that the CCL can be checked correctly; the
-;; actual data are loaded on demand.
-(unless (boundp 'ucs-mule-8859-to-mule-unicode)        ; don't zap it
-  (define-translation-table 'ucs-mule-8859-to-mule-unicode))
  
  (define-ccl-program ccl-untranslated-to-ucs
    `(0
@@ -648,7 +712,7 @@ Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
  ;; ucs-tables is preloaded
  ;; (defun utf-8-pre-write-conversion (beg end)
  ;;   "Semi-dummy pre-write function effectively to autoload ucs-tables."
-;;   ;; Ensure translation table is loaded.
+;;   ;; Ensure translation-table is loaded.
  ;;   (require 'ucs-tables)
  ;;   ;; Don't do this again.
  ;;   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
@@ -657,33 +721,21 @@ Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
  (make-coding-system
   'mule-utf-8 4 ?u
   "UTF-8 encoding for Emacs-supported Unicode characters.
-The supported Emacs character sets are the following, plus any other
-characters included in the tables `ucs-mule-to-mule-unicode' and
-`utf-8-subst-rev-table':
- ascii
- eight-bit-control
- eight-bit-graphic
- latin-iso8859-1
- latin-iso8859-2
- latin-iso8859-3
- latin-iso8859-4
- cyrillic-iso8859-5
- greek-iso8859-7
- hebrew-iso8859-8
- latin-iso8859-9
- latin-iso8859-14
- latin-iso8859-15
- mule-unicode-0100-24ff
- mule-unicode-2500-33ff
- mule-unicode-e000-ffff
-
-Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
-may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
-\(see user option `utf-8-translate-cjk'); otherwise, sequences of
-eight-bit-control and eight-bit-graphic characters are used to
-preserve their byte sequences, and these are composed to display as a
-single character.  Emacs characters that otherwise can't be encoded
-are encoded as U+FFFD."
+It supports Unicode characters of these ranges:
+    U+0000..U+33FF, U+E000..U+FFFF.
+They correspond to these Emacs character sets:
+    ascii, latin-iso8859-1, mule-unicode-0100-24ff,
+    mule-unicode-2500-33ff, mule-unicode-e000-ffff
+
+On decoding (e.g. reading a file), Unicode characters not in the above
+ranges are decoded into sequences of eight-bit-control and
+eight-bit-graphic characters to preserve their byte sequences.  The
+byte sequence is preserved on i/o for valid utf-8, but not necessarily
+for invalid utf-8.
+
+On encoding (e.g. writing a file), Emacs characters not belonging to
+any of the character sets listed above are encoded into the UTF-8 byte
+sequence representing U+FFFD (REPLACEMENT CHARACTER)."
  
   '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
   '((safe-charsets
@@ -691,24 +743,6 @@ are encoded as U+FFFD."
      eight-bit-control
      eight-bit-graphic
      latin-iso8859-1
-    latin-iso8859-15
-    latin-iso8859-14
-    latin-iso8859-9
-    hebrew-iso8859-8
-    greek-iso8859-7
-    cyrillic-iso8859-5
-    latin-iso8859-4
-    latin-iso8859-3
-    latin-iso8859-2
-    vietnamese-viscii-lower
-    vietnamese-viscii-upper
-    thai-tis620
-    ipa
-    ethiopic
-    indian-is13194
-    katakana-jisx0201
-    chinese-sisheng
-    lao
      mule-unicode-0100-24ff
      mule-unicode-2500-33ff
      mule-unicode-e000-ffff)
@@ -716,7 +750,11 @@ are encoded as U+FFFD."
     (coding-category . coding-category-utf-8)
     (valid-codes (0 . 255))
  ;;    (pre-write-conversion . utf-8-pre-write-conversion)
-   (post-read-conversion . utf-8-post-read-conversion)))
+   (post-read-conversion . utf-8-post-read-conversion)
+   (dependency unify-8859-on-encoding-mode
+              unify-8859-on-decoding-mode
+              utf-fragment-on-decoding
+              utf-translate-cjk)))
  
  (define-coding-system-alias 'utf-8 'mule-utf-8)
author	Kenichi Handa <handa@m17n.org>
	Mon, 30 Sep 2002 06:35:13 +0000 (06:35 +0000)
committer	Kenichi Handa <handa@m17n.org>
	Mon, 30 Sep 2002 06:35:13 +0000 (06:35 +0000)