From: Richard M. Stallman Date: Mon, 17 Jun 2002 13:25:27 +0000 (+0000) Subject: (ccl-decode-mule-utf-8): Deal better with invalid utf-8. X-Git-Tag: emacs-pretest-21.2.91~228 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=1167e50da7ed0f9a187a8304e109807dbfe929a1;p=emacs.git (ccl-decode-mule-utf-8): Deal better with invalid utf-8. (mule-utf-8): Doc fix. --- diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 9918f75befe..4e547be07c3 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,3 +1,9 @@ +2002-06-17 Dave Love + + * international/utf-8.el (ccl-decode-mule-utf-8): Deal + better with invalid utf-8. + (mule-utf-8): Doc fix. + 2002-06-17 Eli Zaretskii * international/mule.el (ctext-pre-write-conversion): If FROM is a @@ -24,7 +30,7 @@ 2002-05-27 Glenn Morris - * scroll-al.el (minor-mode-alist): `scroll-all-mode', not + * scroll-all.el (minor-mode-alist): `scroll-all-mode', not `scroll-all-mode-mode'. (scroll-all-page-down-all, scroll-all-page-up-all) (scroll-all-check-to-scroll): Remove `fkey-' prefix from scroll diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el index 85f60409567..97d59196c97 100644 --- a/lisp/international/utf-8.el +++ b/lisp/international/utf-8.el @@ -2,9 +2,10 @@ ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. ;; Licensed to the Free Software Foundation. -;; Copyright (C) 2001 Free Software Foundation, Inc. +;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. ;; Author: TAKAHASHI Naoto +;; Maintainer: FSF ;; Keywords: multilingual, Unicode, UTF-8, i18n ;; This file is part of GNU Emacs. @@ -37,14 +38,17 @@ ;; mule-unicode-e000-ffff ;; ;; Characters of other character sets cannot be encoded with -;; mule-utf-8. Note that the mule-unicode charsets currently lack -;; case and syntax information, so things like `downcase' will only -;; work for characters from ASCII and Latin-1. +;; mule-utf-8. ;; ;; On decoding, Unicode characters that do not fit into the above ;; character sets are handled as `eight-bit-control' or ;; `eight-bit-graphic' characters to retain the information about the ;; original byte sequence. +;; +;; Fixme: note that reading and writing invalid utf-8, even without +;; editing it, may alter the text. Fixing that needs a new charset to +;; represent the raw bytes in the eight-bit-control range, which are +;; otherwise valid unicodes. ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: @@ -64,6 +68,7 @@ ;; ascii | 1 | 1 ;; -----------------------+----------------+--------------- ;; eight-bit-control | 2 | 2 + ;; eight-bit-graphic | 2 | 1 ;; latin-iso8859-1 | 2 | 2 ;; -----------------------+----------------+--------------- ;; mule-unicode-0100-24ff | 2 | 4 @@ -85,66 +90,16 @@ ;; 1byte encoding, i.e., ascii (if (r0 < #x80) (write r0) - - ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx - (if (r0 < #xe0) - ((read r1) - - (if ((r1 & #b11000000) != #b10000000) - ;; Invalid 2-byte sequence - ((if (r0 < #xa0) - (write-multibyte-character r5 r0) - (write-multibyte-character r6 r0)) - (if (r1 < #x80) - (write r1) - (if (r1 < #xa0) - (write-multibyte-character r5 r1) - (write-multibyte-character r6 r1)))) - - ((r0 &= #x1f) - (r0 <<= 6) - (r1 &= #x3f) - (r1 += r0) - ;; Now r1 holds scalar value - - ;; eight-bit-control - (if (r1 < 160) - ((write-multibyte-character r5 r1)) - - ;; latin-iso8859-1 - (if (r1 < 256) - ((r0 = ,(charset-id 'latin-iso8859-1)) - (r1 -= 128) - (write-multibyte-character r0 r1)) - - ;; mule-unicode-0100-24ff (< 0800) - ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) - (r1 -= #x0100) - (r2 = (((r1 / 96) + 32) << 7)) - (r1 %= 96) - (r1 += (r2 + 32)) - (write-multibyte-character r0 r1))))))) - - ;; 3byte encoding - ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx - (if (r0 < #xf0) - ((read r1 r2) - - ;; This is set to 1 if the encoding is invalid. - (r4 = 0) - - (r3 = (r1 & #b11000000)) - (r3 |= ((r2 >> 2) & #b00110000)) - (if (r3 != #b10100000) - (r4 = 1) - ((r3 = ((r0 & #x0f) << 12)) - (r3 += ((r1 & #x3f) << 6)) - (r3 += (r2 & #x3f)) - (if (r3 < #x0800) - (r4 = 1)))) - - (if (r4 != 0) - ;; Invalid 3-byte sequence + (if (r0 < #xc0) ; continuation byte (invalid here) + (if (r0 < #xa0) + (write-multibyte-character r5 r0) + (write-multibyte-character r6 r0)) + ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx + (if (r0 < #xe0) + ((read r1) + + (if ((r1 & #b11000000) != #b10000000) + ;; Invalid 2-byte sequence ((if (r0 < #xa0) (write-multibyte-character r5 r0) (write-multibyte-character r6 r0)) @@ -152,68 +107,154 @@ (write r1) (if (r1 < #xa0) (write-multibyte-character r5 r1) - (write-multibyte-character r6 r1))) - (if (r2 < #x80) - (write r2) - (if (r2 < #xa0) - (write-multibyte-character r5 r2) - (write-multibyte-character r6 r2)))) + (write-multibyte-character r6 r1)))) + + ((r3 = r0) ; save in case of overlong sequence + (r2 = r1) + (r0 &= #x1f) + (r0 <<= 6) + (r1 &= #x3f) + (r1 += r0) + ;; Now r1 holds scalar value + + (if (r1 < 128) ; `overlong sequence' + ((if (r3 < #xa0) + (write-multibyte-character r5 r3) + (write-multibyte-character r6 r3)) + (if (r2 < #x80) + (write r2) + (if (r2 < #xa0) + (write-multibyte-character r5 r2) + (write-multibyte-character r6 r2)))) + + ;; eight-bit-control + (if (r1 < 160) + ((write-multibyte-character r5 r1)) + + ;; latin-iso8859-1 + (if (r1 < 256) + ((r0 = ,(charset-id 'latin-iso8859-1)) + (r1 -= 128) + (write-multibyte-character r0 r1)) + + ;; mule-unicode-0100-24ff (< 0800) + ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) + (r1 -= #x0100) + (r2 = (((r1 / 96) + 32) << 7)) + (r1 %= 96) + (r1 += (r2 + 32)) + (write-multibyte-character r0 r1)))))))) + + ;; 3byte encoding + ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx + (if (r0 < #xf0) + ((read r1 r2) + + ;; This is set to 1 if the encoding is invalid. + (r4 = 0) + + (r3 = (r1 & #b11000000)) + (r3 |= ((r2 >> 2) & #b00110000)) + (if (r3 != #b10100000) + (r4 = 1) + ((r3 = ((r0 & #x0f) << 12)) + (r3 += ((r1 & #x3f) << 6)) + (r3 += (r2 & #x3f)) + (if (r3 < #x0800) + (r4 = 1)))) + + (if (r4 != 0) + ;; Invalid 3-byte sequence + ((if (r0 < #xa0) + (write-multibyte-character r5 r0) + (write-multibyte-character r6 r0)) + (if (r1 < #x80) + (write r1) + (if (r1 < #xa0) + (write-multibyte-character r5 r1) + (write-multibyte-character r6 r1))) + (if (r2 < #x80) + (write r2) + (if (r2 < #xa0) + (write-multibyte-character r5 r2) + (write-multibyte-character r6 r2)))) - ;; mule-unicode-0100-24ff (>= 0800) - ((if (r3 < #x2500) - ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) - (r3 -= #x0100) - (r3 //= 96) - (r1 = (r7 + 32)) - (r1 += ((r3 + 32) << 7)) - (write-multibyte-character r0 r1)) - - ;; mule-unicode-2500-33ff - (if (r3 < #x3400) - ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) - (r3 -= #x2500) + ;; mule-unicode-0100-24ff (>= 0800) + ((if (r3 < #x2500) + ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) + (r3 -= #x0100) (r3 //= 96) (r1 = (r7 + 32)) (r1 += ((r3 + 32) << 7)) (write-multibyte-character r0 r1)) - - ;; U+3400 .. U+DFFF - ;; keep those bytes as eight-bit-{control|graphic} - (if (r3 < #xe000) - ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic - (r3 = r6) - (write-multibyte-character r3 r0) - (if (r1 < #xa0) - (r3 = r5)) - (write-multibyte-character r3 r1) - (if (r2 < #xa0) - (r3 = r5) - (r3 = r6)) - (write-multibyte-character r3 r2)) - - ;; mule-unicode-e000-ffff - ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) - (r3 -= #xe000) - (r3 //= 96) - (r1 = (r7 + 32)) - (r1 += ((r3 + 32) << 7)) - (write-multibyte-character r0 r1)))))))) - - ;; 4byte encoding - ;; keep those bytes as eight-bit-{control|graphic} - ((read r1 r2 r3) - ;; r0 > #xf0, thus eight-bit-graphic - (write-multibyte-character r6 r0) - (if (r1 < #xa0) - (write-multibyte-character r5 r1) - (write-multibyte-character r6 r1)) - (if (r2 < #xa0) - (write-multibyte-character r5 r2) - (write-multibyte-character r6 r2)) - (if (r3 < #xa0) - (write-multibyte-character r5 r3) - (write-multibyte-character r6 r3)))))) - + + ;; mule-unicode-2500-33ff + (if (r3 < #x3400) + ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) + (r3 -= #x2500) + (r3 //= 96) + (r1 = (r7 + 32)) + (r1 += ((r3 + 32) << 7)) + (write-multibyte-character r0 r1)) + + ;; U+3400 .. U+D7FF + ;; Keep them as eight-bit-{control|graphic}. + (if (r3 < #xe000) + ((r3 = r6) + (write-multibyte-character r3 r0) + (if (r1 < #xa0) + (r3 = r5)) + (write-multibyte-character r3 r1) + (if (r2 < #xa0) + (r3 = r5) + (r3 = r6)) + (write-multibyte-character r3 r2)) + ;; mule-unicode-e000-ffff + ;; Fixme: fffe and ffff are invalid. + ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) + (r3 -= #xe000) + (r3 //= 96) + (r1 = (r7 + 32)) + (r1 += ((r3 + 32) << 7)) + (write-multibyte-character r0 r1)))))))) + + (if (r0 < #xfe) + ;; 4byte encoding + ;; keep those bytes as eight-bit-{control|graphic} + ((read r1 r2 r3) + ;; r0 > #xf0, thus eight-bit-graphic + (write-multibyte-character r6 r0) + (if (r1 < #xa0) + (if (r1 < #x80) ; invalid byte + (write r1) + (write-multibyte-character r5 r1)) + (write-multibyte-character r6 r1)) + (if (r2 < #xa0) + (if (r2 < #x80) ; invalid byte + (write r2) + (write-multibyte-character r5 r2)) + (write-multibyte-character r6 r2)) + (if (r3 < #xa0) + (if (r3 < #x80) ; invalid byte + (write r3) + (write-multibyte-character r5 r3)) + (write-multibyte-character r6 r3)) + (if (r0 >= #xf8) ; 5- or 6-byte encoding + ((read r1) + (if (r1 < #xa0) + (if (r1 < #x80) ; invalid byte + (write r1) + (write-multibyte-character r5 r1)) + (write-multibyte-character r6 r1)) + (if (r0 >= #xfc) ; 6-byte + ((read r1) + (if (r1 < #xa0) + (if (r1 < #x80) ; invalid byte + (write r1) + (write-multibyte-character r5 r1)) + (write-multibyte-character r6 r1))))))) + ;; else invalid byte >= #xfe + (write r0)))))) (repeat)))) "CCL program to decode UTF-8. @@ -282,7 +323,7 @@ characters.") (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) (r1 &= #x7f) - (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000 + (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 (r0 = (((r1 & #xf000) >> 12) | #xe0)) (r2 = ((r1 & #x3f) | #x80)) (r1 &= #x0fc0) @@ -354,12 +395,11 @@ The supported Emacs character sets are: Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF are decoded into sequences of eight-bit-control and eight-bit-graphic -characters to preserve their byte sequences. Emacs characters out of -these ranges are encoded into U+FFFD. +characters to preserve their byte sequences. The byte sequence is +preserved on i/o for valid utf-8, but not necessarily for invalid +utf-8. -Note that, currently, characters in the mule-unicode charsets have no -syntax and case information. Thus, for instance, upper- and -lower-casing commands won't work with them." +Emacs characters not from the above charsets are encoded into U+FFFD." '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) '((safe-charsets