From 67ff2216829a3871823356870f96c80e5434f5f5 Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Wed, 12 Mar 2003 00:46:32 +0000 Subject: [PATCH] (ccl-decode-mule-utf-8): Don't lose bytes on handling an invalid byte sequence. --- lisp/ChangeLog | 5 ++++ lisp/international/utf-8.el | 49 +++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/lisp/ChangeLog b/lisp/ChangeLog index da9568a6477..c3ea8b2644d 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,3 +1,8 @@ +2003-03-12 Kenichi Handa + + * international/utf-8.el (ccl-decode-mule-utf-8): Don't lose + bytes on handling an invalid byte sequence. + 2003-03-11 Jason Rumney * files.el (auto-mode-alist): Add .xsl for sgml-mode. diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el index 67a1f11a4f5..c579a564ca7 100644 --- a/lisp/international/utf-8.el +++ b/lisp/international/utf-8.el @@ -308,18 +308,20 @@ default. Also, installing them may be rather slow." ((r5 = ,(charset-id 'eight-bit-control)) (r6 = ,(charset-id 'eight-bit-graphic)) (loop + (r0 = -1) (read r0) ;; 1byte encoding, i.e., ascii (if (r0 < #x80) - (write r0) + ((write r0)) (if (r0 < #xc0) ; continuation byte (invalid here) - (if (r0 < #xa0) - (write-multibyte-character r5 r0) - (write-multibyte-character r6 r0)) + ((if (r0 < #xa0) + (write-multibyte-character r5 r0) + (write-multibyte-character r6 r0))) ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx (if (r0 < #xe0) - ((read r1) + ((r1 = -1) + (read r1) (if ((r1 & #b11000000) != #b10000000) ;; Invalid 2-byte sequence @@ -373,7 +375,9 @@ default. Also, installing them may be rather slow." ;; 3byte encoding ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx (if (r0 < #xf0) - ((read r1 r2) + ((r1 = -1) + (r2 = -1) + (read r1 r2) ;; This is set to 1 if the encoding is invalid. (r4 = 0) @@ -478,7 +482,10 @@ default. Also, installing them may be rather slow." 
;; 4byte encoding ;; keep those bytes as eight-bit-{control|graphic} ;; Fixme: allow lookup in utf-subst-table-for-decode. - ((read r1 r2 r3) + ((r1 = -1) + (r2 = -1) + (r3 = -1) + (read r1 r2 r3) ;; r0 > #xf0, thus eight-bit-graphic (write-multibyte-character r6 r0) (if (r1 < #xa0) @@ -512,7 +519,33 @@ default. Also, installing them may be rather slow." (write-multibyte-character r6 r1))))))) ;; else invalid byte >= #xfe (write-multibyte-character r6 r0)))))) - (repeat)))) + (repeat))) + + ;; At EOF... + (if (r0 >= 0) + ((if (r0 < #x80) + (write r0) + (if (r0 < #xa0) + (write-multibyte-character r5 r0) + ((write-multibyte-character r6 r0)))) + (if (r1 >= 0) + ((if (r1 < #x80) + (write r1) + (if (r1 < #xa0) + (write-multibyte-character r5 r1) + ((write-multibyte-character r6 r1)))) + (if (r2 >= 0) + ((if (r2 < #x80) + (write r2) + (if (r2 < #xa0) + (write-multibyte-character r5 r2) + ((write-multibyte-character r6 r2)))) + (if (r3 >= 0) + (if (r3 < #x80) + (write r3) + (if (r3 < #xa0) + (write-multibyte-character r5 r3) + ((write-multibyte-character r6 r3)))))))))))) "CCL program to decode UTF-8. Basic decoding is done into the charsets ascii, latin-iso8859-1 and -- 2.39.2