From: Kenichi Handa
Date: Tue, 1 Oct 2002 06:57:47 +0000 (+0000)
Subject: Synchronized with the code in 21.4
X-Git-Tag: emacs-pretest-21.2.91~10
X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=620d0ea95bd1d06c89b2d6b9318e996f03a09838;p=emacs.git

Synchronized with the code in 21.4
and deleted codes for not yet supported features;
utf-fragment-on-decoding and utf-translate-cjk.
---

diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 8d5c50b450b..e6d7434e16e 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -27,8 +27,8 @@
 
 ;;; Commentary:
 
-;; The coding-system `mule-utf-8' supports encoding/decoding of the
-;; following character sets to and from UTF-8:
+;; The coding-system `mule-utf-8' basically supports encoding/decoding
+;; of the following character sets to and from UTF-8:
 ;;
 ;; ascii
 ;; eight-bit-control
@@ -37,18 +37,19 @@
 ;; mule-unicode-2500-33ff
 ;; mule-unicode-e000-ffff
 ;;
-;; Characters of other character sets cannot be encoded with
-;; mule-utf-8.
-;;
 ;; On decoding, Unicode characters that do not fit into the above
 ;; character sets are handled as `eight-bit-control' or
 ;; `eight-bit-graphic' characters to retain the information about the
-;; original byte sequence.
+;; original byte sequence and text properties record the corresponding
+;; unicode.
+;;
+;; Fixme: note that reading and writing invalid utf-8 may not be
+;; idempotent -- to represent the bytes to fix that needs a new charset.
 ;;
-;; Fixme: note that reading and writing invalid utf-8, even without
-;; editing it, may alter the text. Fixing that needs a new charset to
-;; represent the raw bytes in the eight-bit-control range, which are
-;; otherwise valid unicodes.
+;; Characters from other character sets can be encoded with mule-utf-8
+;; by populating the translation-table
+;; `utf-translation-table-for-encode' and registering the translation
+;; with `register-char-codings'.
 
 ;; UTF-8 is defined in RFC 2279.
A sketch of the encoding is:
 
@@ -61,6 +62,14 @@
 
 ;;; Code:
 
+(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
+ "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
+
+If `unify-8859-on-encoding-mode' is non-nil, this table populates the
+translation-table named `utf-translation-table-for-encode'.")
+
+(define-translation-table 'utf-translation-table-for-encode)
+
 (define-ccl-program ccl-decode-mule-utf-8
 ;;
 ;; charset | bytes in utf-8 | bytes in emacs
@@ -259,9 +268,10 @@
 
 "CCL program to decode UTF-8.
 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
-mule-unicode-*. Encodings of un-representable Unicode characters are
-decoded asis into eight-bit-control and eight-bit-graphic
-characters.")
+mule-unicode-*, but see also `utf-fragmentation-table' and
+`ucs-mule-cjk-to-unicode'.
+Encodings of un-representable Unicode characters are decoded asis into
+eight-bit-control and eight-bit-graphic characters.")
 
 (define-ccl-program ccl-encode-mule-utf-8
 `(1
@@ -269,7 +279,8 @@ characters.")
 (loop
 (if (r5 < 0)
 ((r1 = -1)
- (read-multibyte-character r0 r1))
+ (read-multibyte-character r0 r1)
+ (translate-character utf-translation-table-for-encode r0 r1))
 (;; We have already done read-multibyte-character.
 (r0 = r5)
 (r1 = r6)
@@ -376,30 +387,26 @@ characters.")
 ((write #xc2)
 (write r1)))))
 
- "CCL program to encode into UTF-8.
-Only characters from the charsets ascii, eight-bit-control,
-eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
-Others are encoded as U+FFFD.")
+ "CCL program to encode into UTF-8.")
 
 (make-coding-system 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
 
-The supported Emacs character sets are:
- ascii
- eight-bit-control
- eight-bit-graphic
- latin-iso8859-1
- mule-unicode-0100-24ff
- mule-unicode-2500-33ff
- mule-unicode-e000-ffff
-
-Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
-are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences. The byte sequence is
-preserved on i/o for valid utf-8, but not necessarily for invalid
-utf-8.
-
-Emacs characters not from the above charsets are encoded into U+FFFD."
+It supports Unicode characters of these ranges:
+ U+0000..U+33FF, U+E000..U+FFFF.
+They correspond to these Emacs character sets:
+ ascii, latin-iso8859-1, mule-unicode-0100-24ff,
+ mule-unicode-2500-33ff, mule-unicode-e000-ffff
+
+On decoding (e.g. reading a file), Unicode characters not in the above
+ranges are decoded into sequences of eight-bit-control and
+eight-bit-graphic characters to preserve their byte sequences. The
+byte sequence is preserved on i/o for valid utf-8, but not necessarily
+for invalid utf-8.
+
+On encoding (e.g. writing a file), Emacs characters not belonging to
+any of the character sets listed above are encoded into the UTF-8 byte
+sequence representing U+FFFD (REPLACEMENT CHARACTER)."
 
 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 '((safe-charsets
@@ -412,7 +419,10 @@ Emacs characters not from the above charsets are encoded into U+FFFD."
 mule-unicode-e000-ffff)
 (mime-charset . utf-8)
 (coding-category . coding-category-utf-8)
- (valid-codes (0 . 255))))
+ (valid-codes (0 . 255))
+ (post-read-conversion . utf-8-post-read-conversion)
+ (dependency unify-8859-on-encoding-mode
+ unify-8859-on-decoding-mode)))
 
 (define-coding-system-alias 'utf-8 'mule-utf-8)
 