From c81676cd6a7a29b85a38a34e003ef1df9f6f9bbd Mon Sep 17 00:00:00 2001
From: Dave Love <fx@gnu.org>
Date: Tue, 5 Jun 2001 20:13:49 +0000
Subject: [PATCH] Add coding tag. (ccl-encode-mule-utf-8): Translate using
 ucs-mule-8859-to-mule-unicode. (ucs-mule-8859-to-mule-unicode): New
 translation table. (utf-8-untranslated-to-ucs, utf-8-help-echo,
 utf-8-compose) (utf-8-post-read-conversion, utf-8-pre-write-conversion): New
 function. (utf-8-subst-table): New variable. (mule-utf-8): Modify coding
 system definition to use post-read and pre-write functions.

---
 lisp/international/utf-8.el | 526 ++++++++++++++++++++++++------------
 1 file changed, 350 insertions(+), 176 deletions(-)

diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 33460848890..2ef5b05c963 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -1,8 +1,10 @@
-;;; utf-8.el --- Limited UTF-8 decoding/encoding support
+;;; utf-8.el --- Limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit-*-
 
 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
 ;; Licensed to the Free Software Foundation.
+;; Copyright (C) 2001 Free Software Foundation, Inc.
 
+;; Author: TAKAHASHI Naoto  <ntakahas@m17n.org>
 ;; Keywords: multilingual, Unicode, UTF-8, i18n
 
 ;; This file is part of GNU Emacs.
@@ -62,6 +64,7 @@
   ;;         ascii          |       1        |       1
   ;; -----------------------+----------------+---------------
   ;;    eight-bit-control   |       2        |       2
+  ;;    eight-bit-graphic   |       2        |       1
   ;;     latin-iso8859-1    |       2        |       2
   ;; -----------------------+----------------+---------------
   ;; mule-unicode-0100-24ff |       2        |       4
@@ -75,206 +78,353 @@
   ;; Thus magnification factor is two.
   ;;
   `(2
-    ((loop
+    ((r5 = ,(charset-id 'eight-bit-control))
+     (r6 = ,(charset-id 'eight-bit-graphic))
+     (loop
       (read r0)
 
       ;; 1byte encoding, i.e., ascii
       (if (r0 < #x80)
 	  (write r0)
 
-	;; 2byte encoding
+	;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
 	(if (r0 < #xe0)
 	    ((read r1)
-	     (r0 &= #x1f)
-	     (r0 <<= 6)
-	     (r1 &= #x3f)
-	     (r1 += r0)
-	     ;; now r1 holds scalar value
-
-	     ;; eight-bit-control
-	     (if (r1 < 160)
-		 ((r0 = ,(charset-id 'eight-bit-control))
-		  (write-multibyte-character r0 r1))
-
-	       ;; latin-iso8859-1
-	       (if (r1 < 256)
-		   ((r0 = ,(charset-id 'latin-iso8859-1))
-		    (r1 -= 128)
-		    (write-multibyte-character r0 r1))
-
-		 ;; mule-unicode-0100-24ff (< 0800)
-		 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-		  (r1 -= #x0100)
-		  (r2 = (((r1 / 96) + 32) << 7))
-		  (r1 %= 96)
-		  (r1 += (r2 + 32))
-		  (write-multibyte-character r0 r1)))))
+
+	     (if ((r1 & #b11000000) != #b10000000)
+		 ;; Invalid 2-byte sequence
+		 ((if (r0 < #xa0)
+		      (write-multibyte-character r5 r0)
+		    (write-multibyte-character r6 r0))
+		  (if (r1 < #x80)
+		      (write r1)
+		    (if (r1 < #xa0)
+			(write-multibyte-character r5 r1)
+		      (write-multibyte-character r6 r1))))
+
+	       ((r0 &= #x1f)
+		(r0 <<= 6)
+		(r1 &= #x3f)
+		(r1 += r0)
+		;; Now r1 holds scalar value
+
+		;; eight-bit-control
+		(if (r1 < 160)
+		    ((write-multibyte-character r5 r1))
+
+		  ;; latin-iso8859-1
+		  (if (r1 < 256)
+		      ((r0 = ,(charset-id 'latin-iso8859-1))
+		       (r1 -= 128)
+		       (write-multibyte-character r0 r1))
+
+		    ;; mule-unicode-0100-24ff (< 0800)
+		    ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+		     (r1 -= #x0100)
+		     (r2 = (((r1 / 96) + 32) << 7))
+		     (r1 %= 96)
+		     (r1 += (r2 + 32))
+		     (write-multibyte-character r0 r1)))))))
 
 	  ;; 3byte encoding
+	  ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
 	  (if (r0 < #xf0)
 	      ((read r1 r2)
-	       (r3 = ((r0 & #x0f) << 12))
-	       (r3 += ((r1 & #x3f) << 6))
-	       (r3 += (r2 & #x3f))
-	       ;; now r3 holds scalar value
-
-	       ;; mule-unicode-0100-24ff (>= 0800)
-	       (if (r3 < #x2500)
-		   ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-		    (r3 -= #x0100)
-		    (r3 //= 96)
-		    (r1 = (r7 + 32))
-		    (r1 += ((r3 + 32) << 7))
-		    (write-multibyte-character r0 r1))
-
-		 ;; mule-unicode-2500-33ff
-		 (if (r3 < #x3400)
-		     ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
-		      (r3 -= #x2500)
-		      (r3 //= 96)
-		      (r1 = (r7 + 32))
-		      (r1 += ((r3 + 32) << 7))
-		      (write-multibyte-character r0 r1))
-
-		   ;; U+3400 .. U+DFFF
-		   ;; keep those bytes as eight-bit-{control|graphic}
-		   (if (r3 < #xe000)
-		       (;; #xe0 < r0 < #xf0, so r0 is eight-bit-graphic
-			(r3 = ,(charset-id 'eight-bit-graphic))
-			(write-multibyte-character r3 r0)
-			(if (r1 < #xa0)
-			    (r3 = ,(charset-id 'eight-bit-control)))
-			(write-multibyte-character r3 r1)
-			(if (r2 < #xa0)
-			    (r3 = ,(charset-id 'eight-bit-control))
-			  (r3 = ,(charset-id 'eight-bit-graphic)))
-			(write-multibyte-character r3 r2))
-
-		     ;; mule-unicode-e000-ffff
-		     ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
-		      (r3 -= #xe000)
-		      (r3 //= 96)
-		      (r1 = (r7 + 32))
-		      (r1 += ((r3 + 32) << 7))
-		      (write-multibyte-character r0 r1))))))
+
+	       ;; This is set to 1 if the encoding is invalid.
+	       (r4 = 0)
+
+	       (r3 = (r1 & #b11000000))
+	       (r3 |= ((r2 >> 2) & #b00110000))
+	       (if (r3 != #b10100000)
+		   (r4 = 1)
+		 ((r3 = ((r0 & #x0f) << 12))
+		  (r3 += ((r1 & #x3f) << 6))
+		  (r3 += (r2 & #x3f))
+		  (if (r3 < #x0800)
+		      (r4 = 1))))
+
+	       (if (r4 != 0)
+		   ;; Invalid 3-byte sequence
+		   ((if (r0 < #xa0)
+			(write-multibyte-character r5 r0)
+		      (write-multibyte-character r6 r0))
+		    (if (r1 < #x80)
+			(write r1)
+		      (if (r1 < #xa0)
+			  (write-multibyte-character r5 r1)
+			(write-multibyte-character r6 r1)))
+		    (if (r2 < #x80)
+			(write r2)
+		      (if (r2 < #xa0)
+			  (write-multibyte-character r5 r2)
+			(write-multibyte-character r6 r2))))
+		 
+		 ;; mule-unicode-0100-24ff (>= 0800)
+		 ((if (r3 < #x2500)
+		      ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+		       (r3 -= #x0100)
+		       (r3 //= 96)
+		       (r1 = (r7 + 32))
+		       (r1 += ((r3 + 32) << 7))
+		       (write-multibyte-character r0 r1))
+		    
+		    ;; mule-unicode-2500-33ff
+		    (if (r3 < #x3400)
+			((r0 = ,(charset-id 'mule-unicode-2500-33ff))
+			 (r3 -= #x2500)
+			 (r3 //= 96)
+			 (r1 = (r7 + 32))
+			 (r1 += ((r3 + 32) << 7))
+			 (write-multibyte-character r0 r1))
+		      
+		      ;; U+3400 .. U+DFFF
+		    ;; keep those bytes as eight-bit-{control|graphic}
+		      (if (r3 < #xe000)
+			  ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
+			   (r3 = r6)
+			   (write-multibyte-character r3 r0)
+			   (if (r1 < #xa0)
+			       (r3 = r5))
+			   (write-multibyte-character r3 r1)
+			   (if (r2 < #xa0)
+			       (r3 = r5)
+			     (r3 = r6))
+			   (write-multibyte-character r3 r2))
+			
+			;; mule-unicode-e000-ffff
+			((r0 = ,(charset-id 'mule-unicode-e000-ffff))
+			 (r3 -= #xe000)
+			 (r3 //= 96)
+			 (r1 = (r7 + 32))
+			 (r1 += ((r3 + 32) << 7))
+			 (write-multibyte-character r0 r1))))))))
 
 	    ;; 4byte encoding
 	    ;; keep those bytes as eight-bit-{control|graphic}
 	    ((read r1 r2 r3)
 	     ;; r0 > #xf0, thus eight-bit-graphic
-	     (r4 = ,(charset-id 'eight-bit-graphic))
-	     (write-multibyte-character r4 r0)
+	     (write-multibyte-character r6 r0)
 	     (if (r1 < #xa0)
-		 (r4 = ,(charset-id 'eight-bit-control)))
-	     (write-multibyte-character r4 r1)
+		 (write-multibyte-character r5 r1)
+	       (write-multibyte-character r6 r1))
 	     (if (r2 < #xa0)
-		 (r4 = ,(charset-id 'eight-bit-control))
-	       (r4 = ,(charset-id 'eight-bit-graphic)))
-	     (write-multibyte-character r4 r2)
+		 (write-multibyte-character r5 r2)
+	       (write-multibyte-character r6 r2))
 	     (if (r3 < #xa0)
-		 (r4 = ,(charset-id 'eight-bit-control))
-	       (r4 = ,(charset-id 'eight-bit-graphic)))
-	     (write-multibyte-character r4 r3)))))
+		 (write-multibyte-character r5 r3)
+	       (write-multibyte-character r6 r3))))))
 
       (repeat))))
 
   "CCL program to decode UTF-8.
-Decoding is done into the charsets ascii, eight-bit-control,
-latin-iso8859-1 and mule-unicode-* only.")
+Basic decoding is done into the charsets ascii, latin-iso8859-1 and
+mule-unicode-*.  Encodings of un-representable Unicode characters are
+decoded asis into eight-bit-control and eight-bit-graphic
+characters.")
 
 (define-ccl-program ccl-encode-mule-utf-8
   `(1
-    (loop
-     (read-multibyte-character r0 r1)
-
-     (translate-character ucs-mule-8859-to-mule-unicode r0 r1)
-
-     (if (r0 == ,(charset-id 'ascii))
-	 (write r1)
-
-       (if (r0 == ,(charset-id 'latin-iso8859-1))
-	   ;; r1          scalar                  utf-8
-	   ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-	   ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
-	   ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
-	   ((r0 = (((r1 & #x40) >> 6) | #xc2))
-	    (r1 &= #x3f)
-	    (r1 |= #x80)
-	    (write r0 r1))
-
-	 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
-	     ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
-	      ;; #x3f80 == (0011 1111 1000 0000)b
-	      (r1 &= #x7f)
-	      (r1 += (r0 + 224))	; 240 == -32 + #x0100
-	      ;; now r1 holds scalar value
-	      (if (r1 < #x0800)
-		  ;; 2byte encoding
-		  ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
-		   ;; #x07c0 == (0000 0111 1100 0000)b
-		   (r1 &= #x3f)
-		   (r1 |= #x80)
-		   (write r0 r1))
-		;; 3byte encoding
-		((r0 = (((r1 & #xf000) >> 12) | #xe0))
-		 (r2 = ((r1 & #x3f) | #x80))
-		 (r1 &= #x0fc0)
-		 (r1 >>= 6)
-		 (r1 |= #x80)
-		 (write r0 r1 r2))))
-
-	   (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
-	       ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
-		(r1 &= #x7f)
-		(r1 += (r0 + 9440))	; 9440 == -32 + #x2500
-		(r0 = (((r1 & #xf000) >> 12) | #xe0))
-		(r2 = ((r1 & #x3f) | #x80))
-		(r1 &= #x0fc0)
-		(r1 >>= 6)
-		(r1 |= #x80)
-		(write r0 r1 r2))
-
-	     (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
-		 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
-		  (r1 &= #x7f)
-		  (r1 += (r0 + 57312))	; 57312 == -160 + #xe000
-		  (r0 = (((r1 & #xf000) >> 12) | #xe0))
+    ((r5 = -1)
+     (loop
+      (if (r5 < 0)
+	  ((r1 = -1)
+	   (read-multibyte-character r0 r1)
+	   (translate-character ucs-mule-8859-to-mule-unicode r0 r1))
+	(;; We have already done read-multibyte-character.
+	 (r0 = r5)
+	 (r1 = r6)
+	 (r5 = -1)))
+
+      (if (r0 == ,(charset-id 'ascii))
+	  (write r1)
+
+	(if (r0 == ,(charset-id 'latin-iso8859-1))
+	    ;; r1          scalar                  utf-8
+	    ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+	    ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
+	    ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
+	    ((r0 = (((r1 & #x40) >> 6) | #xc2))
+	     (r1 &= #x3f)
+	     (r1 |= #x80)
+	     (write r0 r1))
+
+	  (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
+	      ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+	       ;; #x3f80 == (0011 1111 1000 0000)b
+	       (r1 &= #x7f)
+	       (r1 += (r0 + 224))	; 240 == -32 + #x0100
+	       ;; now r1 holds scalar value
+	       (if (r1 < #x0800)
+		   ;; 2byte encoding
+		   ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
+		    ;; #x07c0 == (0000 0111 1100 0000)b
+		    (r1 &= #x3f)
+		    (r1 |= #x80)
+		    (write r0 r1))
+		 ;; 3byte encoding
+		 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
 		  (r2 = ((r1 & #x3f) | #x80))
 		  (r1 &= #x0fc0)
 		  (r1 >>= 6)
 		  (r1 |= #x80)
-		  (write r0 r1 r2))
-
-	       (if (r0 == ,(charset-id 'eight-bit-control))
-		   ;; r1          scalar                  utf-8
-		   ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-		   ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
-		   ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
-		   (write r1)
-
-		 (if (r0 == ,(charset-id 'eight-bit-graphic))
-		     ;; r1          scalar                  utf-8
-		     ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
-		     ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
-		     ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
-		     (write r1)
-
-		   ;; Unsupported character.
-		   ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
-		   ((write #xef)
-		    (write #xbf)
-		    (write #xbd)))))))))
-     (repeat)))
+		  (write r0 r1 r2))))
+
+	    (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
+		((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+		 (r1 &= #x7f)
+		 (r1 += (r0 + 9440))	; 9440 == -32 + #x2500
+		 (r0 = (((r1 & #xf000) >> 12) | #xe0))
+		 (r2 = ((r1 & #x3f) | #x80))
+		 (r1 &= #x0fc0)
+		 (r1 >>= 6)
+		 (r1 |= #x80)
+		 (write r0 r1 r2))
+
+	      (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
+		  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
+		   (r1 &= #x7f)
+		   (r1 += (r0 + 57312))	; 57312 == -160 + #xe000
+		   (r0 = (((r1 & #xf000) >> 12) | #xe0))
+		   (r2 = ((r1 & #x3f) | #x80))
+		   (r1 &= #x0fc0)
+		   (r1 >>= 6)
+		   (r1 |= #x80)
+		   (write r0 r1 r2))
+
+		(if (r0 == ,(charset-id 'eight-bit-control))
+		    ;; r1          scalar                  utf-8
+		    ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+		    ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
+		    ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
+		    ((write #xc2)
+		     (write r1))
+
+		  (if (r0 == ,(charset-id 'eight-bit-graphic))
+		      ;; r1          scalar                  utf-8
+		      ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
+		      ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
+		      ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
+		      ((write r1)
+		       (r1 = -1)
+		       (read-multibyte-character r0 r1)
+		       (if (r0 != ,(charset-id 'eight-bit-graphic))
+			   (if (r0 != ,(charset-id 'eight-bit-control))
+			       ((r5 = r0)
+				(r6 = r1))))
+		       (if (r5 < 0)
+			   ((read-multibyte-character r0 r2)
+			    (if (r0 != ,(charset-id 'eight-bit-graphic))
+				(if (r0 != ,(charset-id 'eight-bit-control))
+				    ((r5 = r0)
+				     (r6 = r2))))
+			    (if (r5 < 0)
+				(write r1 r2)
+			      (if (r1 < #xa0)
+				  (write r1)
+				((write #xc2)
+				 (write r1)))))))
+
+		    ;; Unsupported character.
+		    ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
+		    ((write #xef)
+		     (write #xbf)
+		     (write #xbd)))))))))
+      (repeat)))
+    (if (r1 >= #xa0)
+	(write r1)
+      (if (r1 >= #x80)
+	  ((write #xc2)
+	   (write r1)))))
 
   "CCL program to encode into UTF-8.
 Only characters from the charsets ascii, eight-bit-control,
-latin-iso8859-1 and mule-unicode-* are recognized.  Others are encoded
-as U+FFFD.")
+eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
+Others are encoded as U+FFFD.")
 
-;; Dummy definition needed by the CCL program.  The real data are
-;; loaded on demand.
+;; Dummy definition so that the CCL can be checked correctly; the
+;; actual data are loaded on demand.
 (define-translation-table 'ucs-mule-8859-to-mule-unicode)
 
+(defsubst utf-8-untranslated-to-ucs ()
+  (let ((b1 (char-after))
+	(b2 (char-after (1+ (point))))
+	(b3 (char-after (+ 2 (point))))
+	(b4 (char-after (+ 4 (point)))))
+    (if (and b1 b2 b3)
+	(cond ((< b1 ?\xf0)
+	       (setq b2 (lsh (logand b2 ?\x3f) 6))
+	       (setq b3 (logand b3 ?\x3f))
+	       (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
+	      (b4
+	       (setq b2 (lsh (logand b2 ?\x3f) 12))
+	       (setq b3 (lsh (logand b3 ?\x3f) 6))
+	       (setq b4 (logand b4 ?\x3f))
+	       (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
+						     18)))))))))
+
+(defun utf-8-help-echo (window object position)
+  (format "Untranslated Unicode U+%04X"
+	  (get-char-property position 'untranslated-utf-8 object)))
+
+(defvar utf-8-subst-table nil
+  "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
+
+;; We compose the untranslatable sequences into a single character.
+;; This is infelicitous for editing, because there's currently no
+;; mechanism for treating compositions as atomic, but is OK for
+;; display.  We try to compose an appropriate character from a hash
+;; table of CJK characters to display correctly.  Otherwise we use
+;; U+FFFD.  What we really should have is hash table lookup from CCL
+;; so that we could do this properly.
+(defsubst utf-8-compose ()
+  "Put a suitable composition on an untrnslatable sequence.
+Return the sequence's length."
+  (let* ((u (utf-8-untranslated-to-ucs))
+	 (l (and u (if (>= u ?\x10000)
+		       4
+		     3)))
+	 (subst (or (and utf-8-subst-table (gethash u utf-8-subst-table))
+		    ?$,3u=(B)))
+    (when u
+      (put-text-property (point) (min (point-max) (+ l (point)))
+			 'untranslated-utf-8 u)
+      (unless subst
+	(put-text-property (point) (min (point-max) (+ l (point)))
+			   'help-echo 'utf-8-help-echo))
+      (compose-region (point) (+ l (point)) subst)
+      l)))
+
+(defun utf-8-post-read-conversion (length)
+  "Compose untranslated utf-8 sequences into single characters."
+  (save-excursion
+    (while (and (skip-chars-forward (string-as-multibyte "^\341-\377"))
+		(not (eobp)))
+      (forward-char (utf-8-compose))))
+  length)
+
+(defun utf-8-pre-write-conversion (beg end)
+  (require 'ucs-tables)			; ensure translation table is loaded
+  (when (stringp beg)
+    (set-buffer (generate-new-buffer " *temp*"))
+    (insert beg)
+    (setq end (1+ (length beg)))
+    (setq beg 1))
+  ;; Look for 8-bit-graphic characters that haven't been marked as
+  ;; untranslated, and UTF-8-encode them.
+  (save-excursion
+    (save-restriction
+      (narrow-to-region beg end)
+      (goto-char beg)
+      (while (and (skip-chars-forward (string-as-multibyte "^\240-\377"))
+		  (not (eobp)))
+	(if (get-text-property (point) 'untranslated-utf-8)
+	    (forward-char)
+	  (let ((c (char-after)))
+	    (delete-char 1)
+	    (insert (make-char 'latin-iso8859-1 (- c 128))))))))
+  nil)
+
 (make-coding-system
  'mule-utf-8 4 ?u
  "UTF-8 encoding for Emacs-supported Unicode characters.
@@ -283,18 +433,24 @@ The supported Emacs character sets are:
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
+   latin-iso8859-2
+   latin-iso8859-3
+   latin-iso8859-4
+   cyrillic-iso8859-5
+   greek-iso8859-7
+   hebrew-iso8859-8
+   latin-iso8859-9
+   latin-iso8859-14
+   latin-iso8859-15
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff
 
 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
 are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences.  Emacs characters out of
-these ranges are encoded into U+FFFD.
-
-Note that, currently, characters in the mule-unicode charsets have no
-syntax and case information.  Thus, for instance, upper- and
-lower-casing commands won't work with them."
+characters to preserve their byte sequences and composed to behave as
+a single character when editing.  Emacs characters out of these ranges
+are encoded into U+FFFD."
 
  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
  '((safe-charsets
@@ -315,12 +471,30 @@ lower-casing commands won't work with them."
     mule-unicode-2500-33ff
     mule-unicode-e000-ffff)
    (mime-charset . utf-8)
+   (coding-category . coding-category-utf-8)
    (valid-codes (0 . 255))
-   ;; Kluge to get the real translation table loaded.
-   (pre-write-conversion . internal-require-ucs-tables)))
-
-(defun internal-require-ucs-tables (from to)
-  (require 'ucs-tables)
-  nil)
+   (post-read-conversion . utf-8-post-read-conversion)
+   (pre-write-conversion . utf-8-pre-write-conversion)))
 
 (define-coding-system-alias 'utf-8 'mule-utf-8)
+
+;; I think this needs special private charsets defined for the
+;; untranslated sequences, if it's going to work well.
+
+;; (defun utf-8-compose-function (pos to pattern &optional string)
+;;   (let* ((prop (get-char-property pos 'composition string))
+;; 	 (l (and prop (- (cadr prop) (car prop)))))
+;;     (cond ((and l (> l (- to pos)))
+;; 	   (delete-region pos to))
+;; 	  ((and (> (char-after pos) 224)
+;; 		(< (char-after pos) 256)
+;; 		(save-restriction
+;; 		  (narrow-to-region pos to)
+;; 		  (utf-8-compose)))
+;; 	   t))))
+
+;; (dotimes (i 96)
+;;   (aset composition-function-table
+;; 	(+ 128 i)
+;; 	`((,(string-as-multibyte "[\200-\237\240-\377]")
+;; 	   . utf-8-compose-function))))
-- 
2.39.5