Doc and commentary fixes.

author Dave Love <fx@gnu.org>
Tue, 20 Feb 2001 20:55:06 +0000 (20:55 +0000)
committer Dave Love <fx@gnu.org>
Tue, 20 Feb 2001 20:55:06 +0000 (20:55 +0000)
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 0d7bcb8692aceaa8b495b3ca37d8ae3984dc7981..c56e13c947531a79c15799c5e63512d08d580400 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -3,7 +3,7 @@
  ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
  ;; Licensed to the Free Software Foundation.
  
-;; Keywords: multilingual, Unicode, UTF-8
+;; Keywords: multilingual, Unicode, UTF-8, i18n
  
  ;; This file is part of GNU Emacs.
  
@@ -25,7 +25,7 @@
  ;;; Commentary:
  
  ;; The coding-system `mule-utf-8' supports encoding/decoding of the
-;; following character sets:
+;; following character sets to and from UTF-8:
  ;;
  ;;   ascii
  ;;   eight-bit-control
@@ -35,12 +35,16 @@
  ;;   mule-unicode-e000-ffff
  ;;
  ;; Characters of other character sets cannot be encoded with
-;; mule-utf-8.
+;; mule-utf-8.  Note that the mule-unicode charsets currently lack
+;; case and syntax information, so things like `downcase' will only
+;; work for characters from ASCII and Latin-1.
  ;;
-;; On decoding, Unicode characters that do not fit in above character
-;; sets are handled as `eight-bit-control' or `eight-bit-graphic'
-;; characters to retain original information (i.e. original byte
-;; sequence).
+;; On decoding, Unicode characters that do not fit into the above
+;; character sets are handled as `eight-bit-control' or
+;; `eight-bit-graphic' characters to retain the information about the
+;; original byte sequence.
+
+;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
  
  ;;        scalar       |               utf-8
  ;;        value        | 1st byte  | 2nd byte  | 3rd byte
@@ -174,7 +178,9 @@
  
        (repeat))))
  
-  "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.")
+  "CCL program to decode UTF-8.
+Decoding is done into the charsets ascii, eight-bit-control,
+latin-iso8859-1 and mule-unicode-* only.")
  
  (define-ccl-program ccl-encode-mule-utf-8
    `(1
@@ -251,20 +257,22 @@
                      ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
                      (write r1)
  
-                  ;; unsupported character.
-                  ;; output U+FFFD, which is `ef bf bd' in UTF-8
-                  ;; actually it never reach here
+                  ;; Unsupported character.
+                  ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
                    ((write #xef)
                     (write #xbf)
                     (write #xbd)))))))))
       (repeat)))
  
-  "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.")
+  "CCL program to encode into UTF-8.
+Only characters from the charsets ascii, eight-bit-control,
+latin-iso8859-1 and mule-unicode-* are recognized.  Others are encoded
+as U+FFFD.")
  
  (make-coding-system
   'mule-utf-8 4 ?u
   "UTF-8 encoding for Emacs-supported Unicode characters.
-Supported character sets are:
+The supported Emacs character sets are:
     ascii
     eight-bit-control
     eight-bit-graphic
@@ -273,8 +281,14 @@ Supported character sets are:
     mule-unicode-2500-33ff
     mule-unicode-e000-ffff
  
-Unicode characters out of these ranges are decoded
-into eight-bit-control or eight-bit-graphic."
+Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
+are decoded into sequences of eight-bit-control and eight-bit-graphic
+characters to preserve their byte sequences.  Emacs characters out of
+these ranges are encoded into U+FFFD.
+
+Note that, currently, characters in the mule-unicode charsets have no
+syntax and case information.  Thus, for instance, upper- and
+lower-casing commands won't work with them."
  
   '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
   '((safe-charsets
author	Dave Love <fx@gnu.org>
	Tue, 20 Feb 2001 20:55:06 +0000 (20:55 +0000)
committer	Dave Love <fx@gnu.org>
	Tue, 20 Feb 2001 20:55:06 +0000 (20:55 +0000)