;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
;; Licensed to the Free Software Foundation.
-;; Copyright (C) 2001 Free Software Foundation, Inc.
+;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
+;; Maintainer: FSF
;; Keywords: multilingual, Unicode, UTF-8, i18n
;; This file is part of GNU Emacs.
;; mule-unicode-e000-ffff
;;
;; Characters of other character sets cannot be encoded with
-;; mule-utf-8. Note that the mule-unicode charsets currently lack
-;; case and syntax information, so things like `downcase' will only
-;; work for characters from ASCII and Latin-1.
+;; mule-utf-8.
;;
;; On decoding, Unicode characters that do not fit into the above
;; character sets are handled as `eight-bit-control' or
;; `eight-bit-graphic' characters to retain the information about the
;; original byte sequence.
+;;
+;; Fixme: note that reading and then writing invalid UTF-8, even
+;; without editing it, may alter the text. Fixing that needs a new
+;; charset to represent raw bytes in the eight-bit-control range,
+;; because the code points in that range are themselves valid Unicode
+;; characters and so cannot be told apart from raw bytes.
;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
;; ascii | 1 | 1
;; -----------------------+----------------+---------------
;; eight-bit-control | 2 | 2
+ ;; eight-bit-graphic | 2 | 1
;; latin-iso8859-1 | 2 | 2
;; -----------------------+----------------+---------------
;; mule-unicode-0100-24ff | 2 | 4
;; 1byte encoding, i.e., ascii
(if (r0 < #x80)
(write r0)
-
- ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
- (if (r0 < #xe0)
- ((read r1)
-
- (if ((r1 & #b11000000) != #b10000000)
- ;; Invalid 2-byte sequence
- ((if (r0 < #xa0)
- (write-multibyte-character r5 r0)
- (write-multibyte-character r6 r0))
- (if (r1 < #x80)
- (write r1)
- (if (r1 < #xa0)
- (write-multibyte-character r5 r1)
- (write-multibyte-character r6 r1))))
-
- ((r0 &= #x1f)
- (r0 <<= 6)
- (r1 &= #x3f)
- (r1 += r0)
- ;; Now r1 holds scalar value
-
- ;; eight-bit-control
- (if (r1 < 160)
- ((write-multibyte-character r5 r1))
-
- ;; latin-iso8859-1
- (if (r1 < 256)
- ((r0 = ,(charset-id 'latin-iso8859-1))
- (r1 -= 128)
- (write-multibyte-character r0 r1))
-
- ;; mule-unicode-0100-24ff (< 0800)
- ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
- (r1 -= #x0100)
- (r2 = (((r1 / 96) + 32) << 7))
- (r1 %= 96)
- (r1 += (r2 + 32))
- (write-multibyte-character r0 r1)))))))
-
- ;; 3byte encoding
- ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
- (if (r0 < #xf0)
- ((read r1 r2)
-
- ;; This is set to 1 if the encoding is invalid.
- (r4 = 0)
-
- (r3 = (r1 & #b11000000))
- (r3 |= ((r2 >> 2) & #b00110000))
- (if (r3 != #b10100000)
- (r4 = 1)
- ((r3 = ((r0 & #x0f) << 12))
- (r3 += ((r1 & #x3f) << 6))
- (r3 += (r2 & #x3f))
- (if (r3 < #x0800)
- (r4 = 1))))
-
- (if (r4 != 0)
- ;; Invalid 3-byte sequence
+ (if (r0 < #xc0) ; continuation byte (invalid here)
+ (if (r0 < #xa0)
+ (write-multibyte-character r5 r0)
+ (write-multibyte-character r6 r0))
+ ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
+ (if (r0 < #xe0)
+ ((read r1)
+
+ (if ((r1 & #b11000000) != #b10000000)
+ ;; Invalid 2-byte sequence
((if (r0 < #xa0)
(write-multibyte-character r5 r0)
(write-multibyte-character r6 r0))
(write r1)
(if (r1 < #xa0)
(write-multibyte-character r5 r1)
- (write-multibyte-character r6 r1)))
- (if (r2 < #x80)
- (write r2)
- (if (r2 < #xa0)
- (write-multibyte-character r5 r2)
- (write-multibyte-character r6 r2))))
+ (write-multibyte-character r6 r1))))
+
+ ((r3 = r0) ; save in case of overlong sequence
+ (r2 = r1)
+ (r0 &= #x1f)
+ (r0 <<= 6)
+ (r1 &= #x3f)
+ (r1 += r0)
+ ;; Now r1 holds scalar value
+
+ (if (r1 < 128) ; `overlong sequence'
+ ((if (r3 < #xa0)
+ (write-multibyte-character r5 r3)
+ (write-multibyte-character r6 r3))
+ (if (r2 < #x80)
+ (write r2)
+ (if (r2 < #xa0)
+ (write-multibyte-character r5 r2)
+ (write-multibyte-character r6 r2))))
+
+ ;; eight-bit-control
+ (if (r1 < 160)
+ ((write-multibyte-character r5 r1))
+
+ ;; latin-iso8859-1
+ (if (r1 < 256)
+ ((r0 = ,(charset-id 'latin-iso8859-1))
+ (r1 -= 128)
+ (write-multibyte-character r0 r1))
+
+ ;; mule-unicode-0100-24ff (< 0800)
+ ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+ (r1 -= #x0100)
+ (r2 = (((r1 / 96) + 32) << 7))
+ (r1 %= 96)
+ (r1 += (r2 + 32))
+ (write-multibyte-character r0 r1))))))))
+
+ ;; 3byte encoding
+ ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
+ (if (r0 < #xf0)
+ ((read r1 r2)
+
+ ;; This is set to 1 if the encoding is invalid.
+ (r4 = 0)
+
+ (r3 = (r1 & #b11000000))
+ (r3 |= ((r2 >> 2) & #b00110000))
+ (if (r3 != #b10100000)
+ (r4 = 1)
+ ((r3 = ((r0 & #x0f) << 12))
+ (r3 += ((r1 & #x3f) << 6))
+ (r3 += (r2 & #x3f))
+ (if (r3 < #x0800)
+ (r4 = 1))))
+
+ (if (r4 != 0)
+ ;; Invalid 3-byte sequence
+ ((if (r0 < #xa0)
+ (write-multibyte-character r5 r0)
+ (write-multibyte-character r6 r0))
+ (if (r1 < #x80)
+ (write r1)
+ (if (r1 < #xa0)
+ (write-multibyte-character r5 r1)
+ (write-multibyte-character r6 r1)))
+ (if (r2 < #x80)
+ (write r2)
+ (if (r2 < #xa0)
+ (write-multibyte-character r5 r2)
+ (write-multibyte-character r6 r2))))
- ;; mule-unicode-0100-24ff (>= 0800)
- ((if (r3 < #x2500)
- ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
- (r3 -= #x0100)
- (r3 //= 96)
- (r1 = (r7 + 32))
- (r1 += ((r3 + 32) << 7))
- (write-multibyte-character r0 r1))
-
- ;; mule-unicode-2500-33ff
- (if (r3 < #x3400)
- ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
- (r3 -= #x2500)
+ ;; mule-unicode-0100-24ff (>= 0800)
+ ((if (r3 < #x2500)
+ ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+ (r3 -= #x0100)
(r3 //= 96)
(r1 = (r7 + 32))
(r1 += ((r3 + 32) << 7))
(write-multibyte-character r0 r1))
-
- ;; U+3400 .. U+DFFF
- ;; keep those bytes as eight-bit-{control|graphic}
- (if (r3 < #xe000)
- ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
- (r3 = r6)
- (write-multibyte-character r3 r0)
- (if (r1 < #xa0)
- (r3 = r5))
- (write-multibyte-character r3 r1)
- (if (r2 < #xa0)
- (r3 = r5)
- (r3 = r6))
- (write-multibyte-character r3 r2))
-
- ;; mule-unicode-e000-ffff
- ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
- (r3 -= #xe000)
- (r3 //= 96)
- (r1 = (r7 + 32))
- (r1 += ((r3 + 32) << 7))
- (write-multibyte-character r0 r1))))))))
-
- ;; 4byte encoding
- ;; keep those bytes as eight-bit-{control|graphic}
- ((read r1 r2 r3)
- ;; r0 > #xf0, thus eight-bit-graphic
- (write-multibyte-character r6 r0)
- (if (r1 < #xa0)
- (write-multibyte-character r5 r1)
- (write-multibyte-character r6 r1))
- (if (r2 < #xa0)
- (write-multibyte-character r5 r2)
- (write-multibyte-character r6 r2))
- (if (r3 < #xa0)
- (write-multibyte-character r5 r3)
- (write-multibyte-character r6 r3))))))
-
+
+ ;; mule-unicode-2500-33ff
+ (if (r3 < #x3400)
+ ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
+ (r3 -= #x2500)
+ (r3 //= 96)
+ (r1 = (r7 + 32))
+ (r1 += ((r3 + 32) << 7))
+ (write-multibyte-character r0 r1))
+
+ ;; U+3400 .. U+DFFF (everything below #xe000 not handled above)
+ ;; Keep them as eight-bit-{control|graphic}.
+ (if (r3 < #xe000)
+ ((r3 = r6)
+ (write-multibyte-character r3 r0)
+ (if (r1 < #xa0)
+ (r3 = r5))
+ (write-multibyte-character r3 r1)
+ (if (r2 < #xa0)
+ (r3 = r5)
+ (r3 = r6))
+ (write-multibyte-character r3 r2))
+ ;; mule-unicode-e000-ffff
+ ;; Fixme: fffe and ffff are invalid.
+ ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
+ (r3 -= #xe000)
+ (r3 //= 96)
+ (r1 = (r7 + 32))
+ (r1 += ((r3 + 32) << 7))
+ (write-multibyte-character r0 r1))))))))
+
+ (if (r0 < #xfe)
+ ;; 4byte encoding
+ ;; keep those bytes as eight-bit-{control|graphic}
+ ((read r1 r2 r3)
+ ;; r0 >= #xf0, thus eight-bit-graphic
+ (write-multibyte-character r6 r0)
+ (if (r1 < #xa0)
+ (if (r1 < #x80) ; invalid byte
+ (write r1)
+ (write-multibyte-character r5 r1))
+ (write-multibyte-character r6 r1))
+ (if (r2 < #xa0)
+ (if (r2 < #x80) ; invalid byte
+ (write r2)
+ (write-multibyte-character r5 r2))
+ (write-multibyte-character r6 r2))
+ (if (r3 < #xa0)
+ (if (r3 < #x80) ; invalid byte
+ (write r3)
+ (write-multibyte-character r5 r3))
+ (write-multibyte-character r6 r3))
+ (if (r0 >= #xf8) ; 5- or 6-byte encoding
+ ((read r1)
+ (if (r1 < #xa0)
+ (if (r1 < #x80) ; invalid byte
+ (write r1)
+ (write-multibyte-character r5 r1))
+ (write-multibyte-character r6 r1))
+ (if (r0 >= #xfc) ; 6-byte
+ ((read r1)
+ (if (r1 < #xa0)
+ (if (r1 < #x80) ; invalid byte
+ (write r1)
+ (write-multibyte-character r5 r1))
+ (write-multibyte-character r6 r1)))))))
+ ;; else invalid byte >= #xfe
+ (write r0))))))
(repeat))))
"CCL program to decode UTF-8.
(if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
(r1 &= #x7f)
- (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000
+ (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
(r0 = (((r1 & #xf000) >> 12) | #xe0))
(r2 = ((r1 & #x3f) | #x80))
(r1 &= #x0fc0)
Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences. Emacs characters out of
-these ranges are encoded into U+FFFD.
+characters to preserve their byte sequences. The byte sequence is
+preserved on i/o for valid utf-8, but not necessarily for invalid
+utf-8.
-Note that, currently, characters in the mule-unicode charsets have no
-syntax and case information. Thus, for instance, upper- and
-lower-casing commands won't work with them."
+Emacs characters not from the above charsets are encoded into U+FFFD."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
'((safe-charsets