(ctext-no-compositions, compound-text-with-extensions): New coding systems.

author Eli Zaretskii <eliz@gnu.org>

Fri, 22 Feb 2002 10:45:22 +0000 (10:45 +0000)

committer Eli Zaretskii <eliz@gnu.org>

Fri, 22 Feb 2002 10:45:22 +0000 (10:45 +0000)
author Eli Zaretskii <eliz@gnu.org>
Fri, 22 Feb 2002 10:45:22 +0000 (10:45 +0000)
committer Eli Zaretskii <eliz@gnu.org>
Fri, 22 Feb 2002 10:45:22 +0000 (10:45 +0000)
diff --git a/lisp/international/mule-conf.el b/lisp/international/mule-conf.el

index 456f74a2089ce595bad3b5228a73c5d5937813ff..6fed415115421d76809e9f6020ea9cb051c06852 100644 (file)
--- a/lisp/international/mule-conf.el
+++ b/lisp/international/mule-conf.el
@@ -405,6 +405,188 @@ is treated as a character."
  (define-coding-system-alias 'x-ctext 'compound-text)
  (define-coding-system-alias 'ctext 'compound-text)
  
+;; Same as compound-text, but doesn't produce composition escape
+;; sequences.  Used in post-read and pre-write conversions of
+;; ctext-with-extensions, below.
+(make-coding-system
+ 'ctext-no-compositions 2 ?x
+ "Compound text based generic encoding for decoding unknown messages.
+
+Like `compound-text', but does not produce escape sequences for compositions."
+ '((ascii t) (latin-iso8859-1 katakana-jisx0201 t) t t
+   nil ascii-eol ascii-cntl nil locking-shift single-shift nil nil nil
+   init-bol nil nil)
+ '((safe-charsets . t)
+   (mime-charset . x-ctext)))
+
+(defvar non-standard-icccm-encodings-alist
+  '(("ISO8859-15" . latin-iso8859-15)
+    ("ISO8859-14" . latin-iso8859-14)
+    ("KOI8-R" . koi8-r)
+    ("BIG5-0" . big5))
+  "Alist of font charset names defined by XLFD, and the corresponding Emacs
+charsets or coding systems.")
+
+;; Functions to support "Non-Standard Character Set Encodings" defined
+;; by the ICCCM spec.  We support that by converting the leading
+;; sequence of the ``extended segment'' to the corresponding ISO-2022
+;; sequences (if the leading sequence names an Emacs charset), or decode
+;; the segment (if it names a coding system).  Encoding does the reverse.
+(defun ctext-post-read-conversion (len)
+  "Decode LEN characters encoded as Compound Text with Extended Segments."
+  (buffer-disable-undo)        ; minimize consing due to insertions and deletions
+  (narrow-to-region (point) (+ (point) len))
+  (save-match-data
+    (let ((pt (point-marker))
+         (oldpt (point-marker))
+         (newpt (make-marker))
+         (modified-p (buffer-modified-p))
+         (case-fold-search nil)
+         last-coding-system-used
+         encoding textlen chset)
+      (while (re-search-forward
+             "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002"
+             nil 'move)
+       (set-marker newpt (point))
+       (set-marker pt (match-beginning 0))
+       (setq encoding (match-string 3))
+       (setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128)
+                           (- (aref (match-string 2) 1) 128))
+                        (1+ (length encoding))))
+       (setq
+        chset (cdr (assoc-ignore-case encoding
+                                      non-standard-icccm-encodings-alist)))
+       (cond ((null chset)
+              ;; This charset is not supported--leave this extended
+              ;; segment unaltered and skip over it.
+              (goto-char (+ (point) textlen)))
+             ((charsetp chset)
+            ;; If it's a charset, replace the leading escape sequence
+            ;; with a standard ISO-2022 sequence.  We will decode all
+             ;; such segments later, in one go, when we exit the loop
+              ;; or find an extended segment that names a coding
+              ;; system, not a charset.
+              (replace-match
+               (concat "\\1"
+                       (if (= 0 (charset-iso-graphic-plane chset))
+                           ;; GL charsets
+                           (if (= 1 (charset-dimension chset)) "(" "$(")
+                         ;; GR charsets
+                         (if (= 96 (charset-chars chset))
+                             "-"
+                           (if (= 1 (charset-dimension chset)) ")" "$)")))
+                       (string (charset-iso-final-char chset)))
+               t)
+              (goto-char (+ (point) textlen)))
+             ((coding-system-p chset)
+            ;; If it's a coding system, we need to decode the segment
+              ;; right away.  But first, decode what we've skipped
+              ;; across until now.
+              (when (> pt oldpt)
+                (decode-coding-region oldpt pt 'ctext-no-compositions))
+              (delete-region pt newpt)
+              (set-marker newpt (+ newpt textlen))
+              (decode-coding-region pt newpt chset)
+              (goto-char newpt)
+              (set-marker oldpt newpt))))
+      ;; Decode what's left.
+      (when (> (point) oldpt)
+       (decode-coding-region oldpt (point) 'ctext-no-compositions))
+     ;; This buffer started as unibyte, because the string we get from
+      ;; the X selection is a unibyte string.  We must now make it
+      ;; multibyte, so that the decoded text is inserted as multibyte
+      ;; into its buffer.
+      (set-buffer-multibyte t)
+      (set-buffer-modified-p modified-p)
+      (- (point-max) (point-min)))))
+
+(defvar non-standard-designations-alist
+  '(("$(0" . (big5 "big5-0" 2))
+    ("$(1" . (big5 "big5-0" 2))
+    ("-V"  . (t "iso8859-10" 1))
+    ("-Y"  . (t "iso8859-13" 1))
+    ("-_"  . (t "iso8859-14" 1))
+    ("-b"  . (t "iso8859-15" 1))
+    ("-f"  . (t "iso8859-16" 1)))
+  "Alist of ctext control sequences that introduce character sets which
+are not in the list of approved ICCCM encodings, and the corresponding
+coding system, identifier string, and number of octets per encoded
+character.
+
+Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)).  CTLSEQ
+is the control sequence (sans the leading ESC) that introduces the character
+set in the text encoded by compound-text.  ENCODING is a coding system
+symbol; if it is t, it means that the ctext coding system already encodes
+the text correctly, and only the leading control sequence needs to be altered.
+If ENCODING is a coding system, we need to re-encode the text with that
+coding system.  CHARSET is the ICCCM name of the charset we need to put into
+the leading control sequence.  NOCTETS is the number of octets (bytes) that
+encode each character in this charset.  NOCTETS can be 0 (meaning the number
+of octets per character is variable), 1, 2, 3, or 4.")
+
+(defun ctext-pre-write-conversion (from to)
+  "Encode characters between FROM and TO as Compound Text w/Extended Segments."
+  (buffer-disable-undo)        ; minimize consing due to insertions and deletions
+  (narrow-to-region from to)
+  (encode-coding-region from to 'ctext-no-compositions)
+  ;; Replace ISO-2022 charset designations with extended segments, for
+  ;; those charsets that are not part of the official X registry.
+  (save-match-data
+    (goto-char (point-min))
+    (let ((newpt (make-marker))
+         (case-fold-search nil)
+         pt desig encode-info encoding chset noctets textlen)
+      (set-buffer-multibyte nil)
+      (while (re-search-forward "\e\\(\$([01]\\|-[VY_bf]\\)" nil 'move)
+       (setq desig (match-string 1)
+             pt (point-marker)
+             encode-info (cdr (assoc desig non-standard-designations-alist))
+             encoding (car encode-info)
+             chset (cadr encode-info)
+             noctets (car (cddr encode-info)))
+       (skip-chars-forward "^\e")
+       (set-marker newpt (point))
+       (cond
+        ((eq encoding t)  ; only the leading sequence needs to be changed
+         (setq textlen (+ (- newpt pt) (length chset) 1))
+         (replace-match (format "\e%%/%d%c%c%s\ 2"
+                                noctets
+                                (+ (/ textlen 128) 128)
+                                (+ (% textlen 128) 128)
+                                chset)
+                        t t))
+        ((coding-system-p encoding) ; need to recode the entire segment...
+         (set-marker pt (match-beginning 0))
+         (decode-coding-region pt newpt 'ctext-no-compositions)
+         (set-buffer-multibyte t)
+         (encode-coding-region pt newpt encoding)
+         (set-buffer-multibyte nil)
+         (setq textlen (+ (- newpt pt) (length chset) 1))
+         (goto-char pt)
+         (insert (format "\e%%/%d%c%c%s\ 2"
+                         noctets
+                         (+ (/ textlen 128) 128)
+                         (+ (% textlen 128) 128)
+                         chset))))
+       (goto-char newpt))))
+  (set-buffer-multibyte t)
+  nil)
+
+(make-coding-system
+ 'compound-text-with-extensions 5 ?x
+ "Compound text encoding with ICCCM Extended Segment extensions.
+
+This coding system should be used only for X selections.  It is inappropriate
+for decoding and encoding files, process I/O, etc."
+ nil
+ '((post-read-conversion . ctext-post-read-conversion)
+   (pre-write-conversion . ctext-pre-write-conversion)))
+
+(define-coding-system-alias
+  'x-ctext-with-extensions 'compound-text-with-extensions)
+(define-coding-system-alias
+  'ctext-with-extensions 'compound-text-with-extensions)
+
  (make-coding-system
   'iso-safe 2 ?-
   "Convert all characters but ASCII to `?'."
author	Eli Zaretskii <eliz@gnu.org>
	Fri, 22 Feb 2002 10:45:22 +0000 (10:45 +0000)
committer	Eli Zaretskii <eliz@gnu.org>
	Fri, 22 Feb 2002 10:45:22 +0000 (10:45 +0000)