(ccl-decode-mule-utf-8): Deal better with invalid utf-8.

author Richard M. Stallman <rms@gnu.org>

Mon, 17 Jun 2002 13:25:27 +0000 (13:25 +0000)

committer Richard M. Stallman <rms@gnu.org>

Mon, 17 Jun 2002 13:25:27 +0000 (13:25 +0000)
author Richard M. Stallman <rms@gnu.org>
Mon, 17 Jun 2002 13:25:27 +0000 (13:25 +0000)
committer Richard M. Stallman <rms@gnu.org>
Mon, 17 Jun 2002 13:25:27 +0000 (13:25 +0000)
diff --git a/lisp/ChangeLog b/lisp/ChangeLog

index 9918f75befe1b30b09cbe07f999d86fba2599e57..4e547be07c31f0395049cd5495c64c2080d4365c 100644 (file)
--- a/lisp/ChangeLog
+++ b/lisp/ChangeLog
@@ -1,3 +1,9 @@
+2002-06-17  Dave Love  <fx@gnu.org>
+
+       * international/utf-8.el (ccl-decode-mule-utf-8): Deal
+       better with invalid utf-8.
+       (mule-utf-8): Doc fix.
+
  2002-06-17  Eli Zaretskii <eliz@is.elta.co.il>
  
         * international/mule.el (ctext-pre-write-conversion): If FROM is a
@@ -24,7 +30,7 @@
  
  2002-05-27  Glenn Morris  <gmorris@ast.cam.ac.uk>
  
-       * scroll-al.el (minor-mode-alist): `scroll-all-mode', not
+       * scroll-all.el (minor-mode-alist): `scroll-all-mode', not
         `scroll-all-mode-mode'.
         (scroll-all-page-down-all, scroll-all-page-up-all)
         (scroll-all-check-to-scroll): Remove `fkey-' prefix from scroll
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el

index 85f60409567f5b68f923f9f952c1a7d905530057..97d59196c973691bfa60117d477fd9262d977292 100644 (file)
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -2,9 +2,10 @@
  
  ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
  ;; Licensed to the Free Software Foundation.
-;; Copyright (C) 2001 Free Software Foundation, Inc.
+;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
  
  ;; Author: TAKAHASHI Naoto  <ntakahas@m17n.org>
+;; Maintainer: FSF
  ;; Keywords: multilingual, Unicode, UTF-8, i18n
  
  ;; This file is part of GNU Emacs.
@@ -37,14 +38,17 @@
  ;;   mule-unicode-e000-ffff
  ;;
  ;; Characters of other character sets cannot be encoded with
-;; mule-utf-8.  Note that the mule-unicode charsets currently lack
-;; case and syntax information, so things like `downcase' will only
-;; work for characters from ASCII and Latin-1.
+;; mule-utf-8.
  ;;
  ;; On decoding, Unicode characters that do not fit into the above
  ;; character sets are handled as `eight-bit-control' or
  ;; `eight-bit-graphic' characters to retain the information about the
  ;; original byte sequence.
+;;
+;; Fixme: note that reading and writing invalid utf-8, even without
+;; editing it, may alter the text.  Fixing that needs a new charset to
+;; represent the raw bytes in the eight-bit-control range, which are
+;; otherwise valid unicodes.
  
  ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  
@@ -64,6 +68,7 @@
    ;;         ascii          |       1        |       1
    ;; -----------------------+----------------+---------------
    ;;    eight-bit-control   |       2        |       2
+  ;;    eight-bit-graphic   |       2        |       1
    ;;     latin-iso8859-1    |       2        |       2
    ;; -----------------------+----------------+---------------
    ;; mule-unicode-0100-24ff |       2        |       4
@@ -85,66 +90,16 @@
        ;; 1byte encoding, i.e., ascii
        (if (r0 < #x80)
           (write r0)
-
-       ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
-       (if (r0 < #xe0)
-           ((read r1)
-
-            (if ((r1 & #b11000000) != #b10000000)
-                ;; Invalid 2-byte sequence
-                ((if (r0 < #xa0)
-                     (write-multibyte-character r5 r0)
-                   (write-multibyte-character r6 r0))
-                 (if (r1 < #x80)
-                     (write r1)
-                   (if (r1 < #xa0)
-                       (write-multibyte-character r5 r1)
-                     (write-multibyte-character r6 r1))))
-
-              ((r0 &= #x1f)
-               (r0 <<= 6)
-               (r1 &= #x3f)
-               (r1 += r0)
-               ;; Now r1 holds scalar value
-
-               ;; eight-bit-control
-               (if (r1 < 160)
-                   ((write-multibyte-character r5 r1))
-
-                 ;; latin-iso8859-1
-                 (if (r1 < 256)
-                     ((r0 = ,(charset-id 'latin-iso8859-1))
-                      (r1 -= 128)
-                      (write-multibyte-character r0 r1))
-
-                   ;; mule-unicode-0100-24ff (< 0800)
-                   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-                    (r1 -= #x0100)
-                    (r2 = (((r1 / 96) + 32) << 7))
-                    (r1 %= 96)
-                    (r1 += (r2 + 32))
-                    (write-multibyte-character r0 r1)))))))
-
-         ;; 3byte encoding
-         ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
-         (if (r0 < #xf0)
-             ((read r1 r2)
-
-              ;; This is set to 1 if the encoding is invalid.
-              (r4 = 0)
-
-              (r3 = (r1 & #b11000000))
-              (r3 |= ((r2 >> 2) & #b00110000))
-              (if (r3 != #b10100000)
-                  (r4 = 1)
-                ((r3 = ((r0 & #x0f) << 12))
-                 (r3 += ((r1 & #x3f) << 6))
-                 (r3 += (r2 & #x3f))
-                 (if (r3 < #x0800)
-                     (r4 = 1))))
-
-              (if (r4 != 0)
-                  ;; Invalid 3-byte sequence
+       (if (r0 < #xc0)             ; continuation byte (invalid here)
+           (if (r0 < #xa0)
+               (write-multibyte-character r5 r0)
+             (write-multibyte-character r6 r0))
+         ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
+         (if (r0 < #xe0)
+             ((read r1)
+
+              (if ((r1 & #b11000000) != #b10000000)
+                  ;; Invalid 2-byte sequence
                    ((if (r0 < #xa0)
                         (write-multibyte-character r5 r0)
                       (write-multibyte-character r6 r0))
@@ -152,68 +107,154 @@
                         (write r1)
                       (if (r1 < #xa0)
                           (write-multibyte-character r5 r1)
-                       (write-multibyte-character r6 r1)))
-                   (if (r2 < #x80)
-                       (write r2)
-                     (if (r2 < #xa0)
-                         (write-multibyte-character r5 r2)
-                       (write-multibyte-character r6 r2))))
+                       (write-multibyte-character r6 r1))))
+
+                ((r3 = r0)        ; save in case of overlong sequence
+                 (r2 = r1)
+                 (r0 &= #x1f)
+                 (r0 <<= 6)
+                 (r1 &= #x3f)
+                 (r1 += r0)
+                 ;; Now r1 holds scalar value
+
+                 (if (r1 < 128)        ; `overlong sequence'
+                     ((if (r3 < #xa0)
+                          (write-multibyte-character r5 r3)
+                        (write-multibyte-character r6 r3))
+                      (if (r2 < #x80)
+                          (write r2)
+                        (if (r2 < #xa0)
+                            (write-multibyte-character r5 r2)
+                          (write-multibyte-character r6 r2))))
+
+                   ;; eight-bit-control
+                   (if (r1 < 160)
+                       ((write-multibyte-character r5 r1))
+
+                     ;; latin-iso8859-1
+                     (if (r1 < 256)
+                         ((r0 = ,(charset-id 'latin-iso8859-1))
+                          (r1 -= 128)
+                          (write-multibyte-character r0 r1))
+
+                       ;; mule-unicode-0100-24ff (< 0800)
+                       ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+                        (r1 -= #x0100)
+                        (r2 = (((r1 / 96) + 32) << 7))
+                        (r1 %= 96)
+                        (r1 += (r2 + 32))
+                        (write-multibyte-character r0 r1))))))))
+
+           ;; 3byte encoding
+           ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
+           (if (r0 < #xf0)
+               ((read r1 r2)
+
+                ;; This is set to 1 if the encoding is invalid.
+                (r4 = 0)
+
+                (r3 = (r1 & #b11000000))
+                (r3 |= ((r2 >> 2) & #b00110000))
+                (if (r3 != #b10100000)
+                    (r4 = 1)
+                  ((r3 = ((r0 & #x0f) << 12))
+                   (r3 += ((r1 & #x3f) << 6))
+                   (r3 += (r2 & #x3f))
+                   (if (r3 < #x0800)
+                       (r4 = 1))))
+
+                (if (r4 != 0)
+                    ;; Invalid 3-byte sequence
+                    ((if (r0 < #xa0)
+                         (write-multibyte-character r5 r0)
+                       (write-multibyte-character r6 r0))
+                     (if (r1 < #x80)
+                         (write r1)
+                       (if (r1 < #xa0)
+                           (write-multibyte-character r5 r1)
+                         (write-multibyte-character r6 r1)))
+                     (if (r2 < #x80)
+                         (write r2)
+                       (if (r2 < #xa0)
+                           (write-multibyte-character r5 r2)
+                         (write-multibyte-character r6 r2))))
                  
-                ;; mule-unicode-0100-24ff (>= 0800)
-                ((if (r3 < #x2500)
-                     ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-                      (r3 -= #x0100)
-                      (r3 //= 96)
-                      (r1 = (r7 + 32))
-                      (r1 += ((r3 + 32) << 7))
-                      (write-multibyte-character r0 r1))
-                   
-                   ;; mule-unicode-2500-33ff
-                   (if (r3 < #x3400)
-                       ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
-                        (r3 -= #x2500)
+                  ;; mule-unicode-0100-24ff (>= 0800)
+                  ((if (r3 < #x2500)
+                       ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+                        (r3 -= #x0100)
                          (r3 //= 96)
                          (r1 = (r7 + 32))
                          (r1 += ((r3 + 32) << 7))
                          (write-multibyte-character r0 r1))
-                     
-                     ;; U+3400 .. U+DFFF
-                   ;; keep those bytes as eight-bit-{control|graphic}
-                     (if (r3 < #xe000)
-                         ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
-                          (r3 = r6)
-                          (write-multibyte-character r3 r0)
-                          (if (r1 < #xa0)
-                              (r3 = r5))
-                          (write-multibyte-character r3 r1)
-                          (if (r2 < #xa0)
-                              (r3 = r5)
-                            (r3 = r6))
-                          (write-multibyte-character r3 r2))
-                       
-                       ;; mule-unicode-e000-ffff
-                       ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
-                        (r3 -= #xe000)
-                        (r3 //= 96)
-                        (r1 = (r7 + 32))
-                        (r1 += ((r3 + 32) << 7))
-                        (write-multibyte-character r0 r1))))))))
-
-           ;; 4byte encoding
-           ;; keep those bytes as eight-bit-{control|graphic}
-           ((read r1 r2 r3)
-            ;; r0 > #xf0, thus eight-bit-graphic
-            (write-multibyte-character r6 r0)
-            (if (r1 < #xa0)
-                (write-multibyte-character r5 r1)
-              (write-multibyte-character r6 r1))
-            (if (r2 < #xa0)
-                (write-multibyte-character r5 r2)
-              (write-multibyte-character r6 r2))
-            (if (r3 < #xa0)
-                (write-multibyte-character r5 r3)
-              (write-multibyte-character r6 r3))))))
-
+                   
+                     ;; mule-unicode-2500-33ff
+                     (if (r3 < #x3400)
+                         ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
+                          (r3 -= #x2500)
+                          (r3 //= 96)
+                          (r1 = (r7 + 32))
+                          (r1 += ((r3 + 32) << 7))
+                          (write-multibyte-character r0 r1))
+
+                       ;; U+3400 .. U+DFFF (U+D800 .. U+DFFF are surrogates)
+                       ;; Keep them as eight-bit-{control|graphic}.
+                       (if (r3 < #xe000)
+                           ((r3 = r6)
+                            (write-multibyte-character r3 r0)
+                            (if (r1 < #xa0)
+                                (r3 = r5))
+                            (write-multibyte-character r3 r1)
+                            (if (r2 < #xa0)
+                                (r3 = r5)
+                              (r3 = r6))
+                            (write-multibyte-character r3 r2))
+                           ;; mule-unicode-e000-ffff
+                           ;; Fixme: fffe and ffff are invalid.
+                           ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
+                            (r3 -= #xe000)
+                            (r3 //= 96)
+                            (r1 = (r7 + 32))
+                            (r1 += ((r3 + 32) << 7))
+                            (write-multibyte-character r0 r1))))))))
+
+             (if (r0 < #xfe)
+                 ;; 4byte encoding
+                 ;; keep those bytes as eight-bit-{control|graphic}
+                 ((read r1 r2 r3)
+                  ;; #xf0 <= r0 < #xfe, thus eight-bit-graphic
+                  (write-multibyte-character r6 r0)
+                  (if (r1 < #xa0)
+                      (if (r1 < #x80)  ; invalid byte
+                          (write r1)
+                        (write-multibyte-character r5 r1))
+                    (write-multibyte-character r6 r1))
+                  (if (r2 < #xa0)
+                      (if (r2 < #x80)  ; invalid byte
+                          (write r2)
+                        (write-multibyte-character r5 r2))
+                    (write-multibyte-character r6 r2))
+                  (if (r3 < #xa0)
+                      (if (r3 < #x80)  ; invalid byte
+                          (write r3)
+                        (write-multibyte-character r5 r3))
+                    (write-multibyte-character r6 r3))
+                  (if (r0 >= #xf8)     ; 5- or 6-byte encoding
+                      ((read r1)
+                       (if (r1 < #xa0)
+                           (if (r1 < #x80) ; invalid byte
+                               (write r1)
+                             (write-multibyte-character r5 r1))
+                         (write-multibyte-character r6 r1))
+                       (if (r0 >= #xfc) ; 6-byte
+                           ((read r1)
+                            (if (r1 < #xa0)
+                                (if (r1 < #x80) ; invalid byte
+                                    (write r1)
+                                  (write-multibyte-character r5 r1))
+                              (write-multibyte-character r6 r1)))))))
+               ;; else invalid byte >= #xfe
+               (write r0))))))
        (repeat))))
  
    "CCL program to decode UTF-8.
@@ -282,7 +323,7 @@ characters.")
               (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                   ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                    (r1 &= #x7f)
-                  (r1 += (r0 + 57312)) ; 57312 == -160 + #xe000
+                  (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
                    (r0 = (((r1 & #xf000) >> 12) | #xe0))
                    (r2 = ((r1 & #x3f) | #x80))
                    (r1 &= #x0fc0)
@@ -354,12 +395,11 @@ The supported Emacs character sets are:
  
  Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
  are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences.  Emacs characters out of
-these ranges are encoded into U+FFFD.
+characters to preserve their byte sequences.  The byte sequence is
+preserved on i/o for valid utf-8, but not necessarily for invalid
+utf-8.
  
-Note that, currently, characters in the mule-unicode charsets have no
-syntax and case information.  Thus, for instance, upper- and
-lower-casing commands won't work with them."
+Emacs characters not from the above charsets are encoded into U+FFFD."
  
   '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
   '((safe-charsets
author	Richard M. Stallman <rms@gnu.org>
	Mon, 17 Jun 2002 13:25:27 +0000 (13:25 +0000)
committer	Richard M. Stallman <rms@gnu.org>
	Mon, 17 Jun 2002 13:25:27 +0000 (13:25 +0000)
lisp/ChangeLog		patch \| blob \| history
lisp/international/utf-8.el		patch \| blob \| history