;; Fixme: note that reading and writing invalid utf-8 may not be
;; idempotent -- representing the invalid bytes so as to fix that
;; needs a new charset.
;;
-;; Characters from other character sets can be encoded with
-;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
-;; registering the translation with `register-char-codings'. Hash
-;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
-;; support encoding and decoding of about a quarter of the CJK space
-;; between U+3400 and U+DFFF.
+;; Characters from other character sets can be encoded with mule-utf-8
+;; by populating the translation-table
+;; `utf-translation-table-for-encode' and registering the translation
+;; with `register-char-codings'. Hash tables
+;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
+;; used to support encoding and decoding of about a quarter of the CJK
+;; space between U+3400 and U+DFFF.
;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
;;; Code:
-(defvar ucs-mule-to-mule-unicode (make-translation-table)
- "Translation table for encoding to `mule-utf-8'.")
-(define-translation-table 'ucs-mule-to-mule-unicode
- ucs-mule-to-mule-unicode)
+(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
+ "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
-(defvar utf-8-subst-table (make-hash-table :test 'eq))
-(defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
-(define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
-(define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
+If `unify-8859-on-encoding-mode' is non-nil, this table populates the
+translation-table named `utf-translation-table-for-encode'.")
+
+(define-translation-table 'utf-translation-table-for-encode)
-(defvar utf-8-translation-table-for-decode (make-translation-table)
- "Translation table applied after decoding utf-8 to mule-unicode.
-This is only actually applied to characters which would normally be
-decoded into mule-unicode-0100-24ff.")
-(define-translation-table 'utf-8-translation-table-for-decode
- utf-8-translation-table-for-decode)
;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
;; space of mule-unicode. For Latin scripts this isn't very
;; important. Hebrew and Arabic might go here too when there's proper
;; support for them.
-(defvar utf-8-fragmentation-table (make-translation-table)
- "Char table normally mapping non-Latin mule-unicode-... characters to iso8859.
-Used as the value of `utf-8-translation-table-for-decode' in
-`utf-8-fragment-on-decoding' mode.")
+
+(defvar utf-fragmentation-table (make-char-table 'translation-table nil)
+ "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
+
+If `utf-fragment-on-decoding' is non-nil, this table populates the
+translation-table named `utf-translation-table-for-decode'.")
+
+(defvar utf-defragmentation-table (make-char-table 'translation-table nil)
+ "Char-table for reverse mapping of `utf-fragmentation-table'.
+
+If `utf-fragment-on-decoding' is non-nil and
+`unify-8859-on-encoding-mode' is nil, this table populates the
+translation-table named `utf-translation-table-for-encode'.")
+
+(define-translation-table 'utf-translation-table-for-decode)
+
+
+(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
+ "Hash table mapping Emacs CJK character sets to Unicode code points.
+
+If `utf-translate-cjk' is non-nil, this table populates the
+translation-hash-table named `utf-subst-table-for-encode'.")
+
+(define-translation-hash-table 'utf-subst-table-for-encode
+ (make-hash-table :test 'eq))
+
+(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
+ "Hash table mapping Unicode code points to Emacs CJK character sets.
+
+If `utf-translate-cjk' is non-nil, this table populates the
+translation-hash-table named `utf-subst-table-for-decode'.")
+
+(define-translation-hash-table 'utf-subst-table-for-decode
+ (make-hash-table :test 'eq))
+
(mapc
(lambda (pair)
- (aset utf-8-fragmentation-table (car pair) (cdr pair)))
+ (aset utf-fragmentation-table (car pair) (cdr pair))
+ (aset utf-defragmentation-table (cdr pair) (car pair)))
'((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B)
(?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B)
(?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B)
(?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B)
(?\e$,1(|\e(B . ?\e,L|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\7f\e(B . ?\e,L\7f\e(B)))
-(defcustom utf-8-fragment-on-decoding nil
- "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets.
+
+(defcustom utf-fragment-on-decoding nil
+ "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
Setting this means that the relevant Cyrillic and Greek characters are
decoded into the iso8859 charsets rather than into
mule-unicode-0100-24ff. The iso8859 charsets take half as much space
Setting this variable outside customize has no effect."
:set (lambda (s v)
- (setq utf-8-translation-table-for-decode
- (if v
- utf-8-fragmentation-table
- (make-translation-table)))
- (define-translation-table 'utf-8-translation-table-for-decode
- utf-8-translation-table-for-decode)
+ (if v
+ (progn
+ (define-translation-table 'utf-translation-table-for-decode
+ utf-fragmentation-table)
+ ;; Even if unify-8859-on-encoding-mode is off, make
+ ;; mule-utf-* encode characters in
+ ;; utf-fragmentation-table.
+ (unless (eq (get 'utf-translation-table-for-encode
+ 'translation-table)
+ ucs-mule-to-mule-unicode)
+ (define-translation-table 'utf-translation-table-for-encode
+ utf-defragmentation-table)
+ (dolist (coding '(mule-utf-8 mule-utf-16-be mule-utf-16-le))
+ (register-char-codings coding utf-defragmentation-table))))
+ (define-translation-table 'utf-translation-table-for-decode)
+ ;; When unify-8859-on-encoding-mode is off, be sure to
+ ;; disable mule-utf-* for the characters in
+ ;; utf-fragmentation-table.
+ (unless (eq (get 'utf-translation-table-for-encode
+ 'translation-table)
+ ucs-mule-to-mule-unicode)
+ (define-translation-table 'utf-translation-table-for-encode)
+ (map-char-table
+ (lambda (key val)
+ (if (and (>= key 128) val)
+ (aset char-coding-system-table key
+ (delq 'mule-utf-8
+ (delq 'mule-utf-16-le
+ (delq 'mule-utf-16-be
+ (aref char-coding-system-table
+ key)))))))
+ utf-defragmentation-table)))
(set-default s v))
:version "21.4"
:type 'boolean
:group 'mule)
-(defcustom utf-8-translate-cjk nil
- "Whether the `mule-utf-8' coding system should encode many CJK characters.
+(defcustom utf-translate-cjk nil
+ "Whether the UTF based coding systems should decode/encode CJK characters.
-Enabling this loads tables which enable the coding system to encode
-characters in the charsets `korean-ksc5601', `chinese-gb2312' and
+Enabling this loads tables which enable the coding systems:
+ mule-utf-8, mule-utf-16-le, mule-utf-16-be
+to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and
`japanese-jisx0208', and to decode the corresponding unicodes into
such characters. This works by loading the library `utf-8-subst'; see
its commentary. The tables are fairly large (about 33000 entries), so this
option is not the default."
:link '(emacs-commentary-link "utf-8-subst")
:set (lambda (s v)
- (when v
- (require 'utf-8-subst)
- (let ((table (make-char-table 'translation-table)))
- (coding-system-put 'mule-utf-8 'safe-charsets
- (append (coding-system-get 'mule-utf-8
- 'safe-charsets)
- '(korean-ksc5601 chinese-gb2312
- japanese-jisx0208)))
- (maphash (lambda (k v)
- (aset table k v))
- utf-8-subst-rev-table)
- (register-char-codings 'mule-utf-8 table)))
+ (if v
+ (progn
+ (require 'utf-8-subst)
+ (let ((table (make-char-table 'translation-table)))
+ (maphash (lambda (k v)
+ (aset table k t))
+ ucs-mule-cjk-to-unicode)
+ (register-char-codings 'mule-utf-8 table)
+ (register-char-codings 'mule-utf-16-le table)
+ (register-char-codings 'mule-utf-16-be table))
+ (define-translation-hash-table 'utf-subst-table-for-decode
+ ucs-unicode-to-mule-cjk)
+ (define-translation-hash-table 'utf-subst-table-for-encode
+ ucs-mule-cjk-to-unicode))
+ (map-char-table
+ (lambda (k v)
+ (if (gethash k ucs-mule-cjk-to-unicode)
+ (aset char-coding-system-table k
+ (delq 'mule-utf-8
+ (delq 'mule-utf-16-le
+ (delq 'mule-utf-16-be v))))))
+ char-coding-system-table)
+ (define-translation-hash-table 'utf-subst-table-for-decode
+ (make-hash-table :test 'eq))
+ (define-translation-hash-table 'utf-subst-table-for-encode
+ (make-hash-table :test 'eq)))
(set-default s v))
:version "21.4"
:type 'boolean
(r1 %= 96)
(r1 += (r2 + 32))
(translate-character
- utf-8-translation-table-for-decode r0 r1)
+ utf-translation-table-for-decode r0 r1)
(write-multibyte-character r0 r1))))))))
;; 3byte encoding
(r1 = (r7 + 32))
(r1 += ((r3 + 32) << 7))
(translate-character
- utf-8-translation-table-for-decode r0 r1)
+ utf-translation-table-for-decode r0 r1)
(write-multibyte-character r0 r1))
;; mule-unicode-2500-33ff
;; Fixme: Perhaps allow translation via
- ;; utf-8-subst-table for #x2e80 up, so that we use
- ;; consistent charsets for all of CJK. Would need
- ;; corresponding change to encoding tables.
+ ;; utf-subst-table-for-decode for #x2e80 up, so
+ ;; that we use consistent charsets for all of
+ ;; CJK. Would need corresponding change to
+ ;; encoding tables.
(if (r3 < #x3400)
((r0 = ,(charset-id 'mule-unicode-2500-33ff))
(r3 -= #x2500)
;; them as eight-bit-{control|graphic}.
(if (r3 < #xd800)
((r4 = r3) ; don't zap r3
- (lookup-integer utf-8-subst-table r4 r5)
+ (lookup-integer utf-subst-table-for-decode r4 r5)
(if r7
;; got a translation
((write-multibyte-character r4 r5)
(if (r0 < #xfe)
;; 4byte encoding
;; keep those bytes as eight-bit-{control|graphic}
- ;; Fixme: allow lookup in utf-8-subst-table.
+ ;; Fixme: allow lookup in utf-subst-table-for-decode.
((read r1 r2 r3)
;; r0 > #xf0, thus eight-bit-graphic
(write-multibyte-character r6 r0)
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
-mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
-`utf-8-subst-table'.
+mule-unicode-*, but see also `utf-fragmentation-table' and
+`ucs-unicode-to-mule-cjk'.
Encodings of un-representable Unicode characters are decoded asis into
eight-bit-control and eight-bit-graphic characters.")
(if (r5 < 0)
((r1 = -1)
(read-multibyte-character r0 r1)
- (translate-character ucs-mule-to-mule-unicode r0 r1))
+ (translate-character utf-translation-table-for-encode r0 r1))
(;; We have already done read-multibyte-character.
(r0 = r5)
(r1 = r6)
((write #xc2)
(write r1)))))))
- ((lookup-character utf-8-subst-rev-table r0 r1)
+ ((lookup-character utf-subst-table-for-encode r0 r1)
(if r7 ; lookup succeeded
((r1 = (((r0 & #xf000) >> 12) | #xe0))
(r2 = ((r0 & #x3f) | #x80))
"CCL program to encode into UTF-8.")
-;; Dummy definition so that the CCL can be checked correctly; the
-;; actual data are loaded on demand.
-(unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
- (define-translation-table 'ucs-mule-8859-to-mule-unicode))
(define-ccl-program ccl-untranslated-to-ucs
`(0
;; ucs-tables is preloaded
;; (defun utf-8-pre-write-conversion (beg end)
;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
-;; ;; Ensure translation table is loaded.
+;; ;; Ensure translation-table is loaded.
;; (require 'ucs-tables)
;; ;; Don't do this again.
;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
(make-coding-system
'mule-utf-8 4 ?u
"UTF-8 encoding for Emacs-supported Unicode characters.
-The supported Emacs character sets are the following, plus any other
-characters included in the tables `ucs-mule-to-mule-unicode' and
-`utf-8-subst-rev-table':
- ascii
- eight-bit-control
- eight-bit-graphic
- latin-iso8859-1
- latin-iso8859-2
- latin-iso8859-3
- latin-iso8859-4
- cyrillic-iso8859-5
- greek-iso8859-7
- hebrew-iso8859-8
- latin-iso8859-9
- latin-iso8859-14
- latin-iso8859-15
- mule-unicode-0100-24ff
- mule-unicode-2500-33ff
- mule-unicode-e000-ffff
-
-Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
-may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
-\(see user option `utf-8-translate-cjk'); otherwise, sequences of
-eight-bit-control and eight-bit-graphic characters are used to
-preserve their byte sequences, and these are composed to display as a
-single character. Emacs characters that otherwise can't be encoded
-are encoded as U+FFFD."
+It supports Unicode characters of these ranges:
+ U+0000..U+33FF, U+E000..U+FFFF.
+They correspond to these Emacs character sets:
+ ascii, latin-iso8859-1, mule-unicode-0100-24ff,
+ mule-unicode-2500-33ff, mule-unicode-e000-ffff
+
+On decoding (e.g. reading a file), Unicode characters not in the above
+ranges are decoded into sequences of eight-bit-control and
+eight-bit-graphic characters to preserve their byte sequences. The
+byte sequence is preserved on i/o for valid utf-8, but not necessarily
+for invalid utf-8.
+
+On encoding (e.g. writing a file), Emacs characters not belonging to
+any of the character sets listed above are encoded into the UTF-8 byte
+sequence representing U+FFFD (REPLACEMENT CHARACTER)."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
'((safe-charsets
eight-bit-control
eight-bit-graphic
latin-iso8859-1
- latin-iso8859-15
- latin-iso8859-14
- latin-iso8859-9
- hebrew-iso8859-8
- greek-iso8859-7
- cyrillic-iso8859-5
- latin-iso8859-4
- latin-iso8859-3
- latin-iso8859-2
- vietnamese-viscii-lower
- vietnamese-viscii-upper
- thai-tis620
- ipa
- ethiopic
- indian-is13194
- katakana-jisx0201
- chinese-sisheng
- lao
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff)
(coding-category . coding-category-utf-8)
(valid-codes (0 . 255))
;; (pre-write-conversion . utf-8-pre-write-conversion)
- (post-read-conversion . utf-8-post-read-conversion)))
+ (post-read-conversion . utf-8-post-read-conversion)
+ (dependency unify-8859-on-encoding-mode
+ unify-8859-on-decoding-mode
+ utf-fragment-on-decoding
+ utf-translate-cjk)))
(define-coding-system-alias 'utf-8 'mule-utf-8)