From: Eli Zaretskii <eliz@gnu.org>
Date: Fri, 22 Feb 2002 10:45:22 +0000 (+0000)
Subject: (ctext-no-compositions, compound-text-with-extensions): New coding systems.
X-Git-Tag: emacs-21.2~62
X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=ec2b93f7b56dd72ef1d532c3e26af458dc22e9bb;p=emacs.git

(ctext-no-compositions, compound-text-with-extensions): New coding systems.
(x-ctext-with-extensions, ctext-with-extensions): Aliases for
compound-text-with-extensions.
(non-standard-icccm-encodings-alist, non-standard-designations-alist): New
variables.
(ctext-post-read-conversion, ctext-pre-write-conversion): New functions.
---

diff --git a/lisp/international/mule-conf.el b/lisp/international/mule-conf.el
index 456f74a2089..6fed4151154 100644
--- a/lisp/international/mule-conf.el
+++ b/lisp/international/mule-conf.el
@@ -405,6 +405,188 @@ is treated as a character."
 (define-coding-system-alias 'x-ctext 'compound-text)
 (define-coding-system-alias 'ctext 'compound-text)
 
+;; Same as compound-text, but doesn't produce composition escape
+;; sequences.  Used in post-read and pre-write conversions of
+;; ctext-with-extensions, below.
+(make-coding-system
+ 'ctext-no-compositions 2 ?x
+ "Compound text based generic encoding for decoding unknown messages.
+
+Like `compound-text', but does not produce escape sequences for compositions."
+ '((ascii t) (latin-iso8859-1 katakana-jisx0201 t) t t
+   nil ascii-eol ascii-cntl nil locking-shift single-shift nil nil nil
+   init-bol nil nil)
+ '((safe-charsets . t)
+   (mime-charset . x-ctext)))
+
+(defvar non-standard-icccm-encodings-alist
+  '(("ISO8859-15" . latin-iso8859-15)
+    ("ISO8859-14" . latin-iso8859-14)
+    ("KOI8-R" . koi8-r)
+    ("BIG5-0" . big5))
+  "Alist of font charset names defined by XLFD, and the corresponding Emacs
+charsets or coding systems.")
+
+;; Functions to support "Non-Standard Character Set Encodings" defined
+;; by the ICCCM spec.  We support that by converting the leading
+;; sequence of the ``extended segment'' to the corresponding ISO-2022
+;; sequences (if the leading sequence names an Emacs charset), or decode
+;; the segment (if it names a coding system).  Encoding does the reverse.
+(defun ctext-post-read-conversion (len)
+  "Decode LEN characters encoded as Compound Text with Extended Segments."
+  (buffer-disable-undo)	; minimize consing due to insertions and deletions
+  (narrow-to-region (point) (+ (point) len))
+  (save-match-data
+    (let ((pt (point-marker))
+	  (oldpt (point-marker))
+	  (newpt (make-marker))
+	  (modified-p (buffer-modified-p))
+	  (case-fold-search nil)
+	  last-coding-system-used
+	  encoding textlen chset)
+      (while (re-search-forward
+	      "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002"
+	      nil 'move)
+	(set-marker newpt (point))
+	(set-marker pt (match-beginning 0))
+	(setq encoding (match-string 3))
+	(setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128)
+			    (- (aref (match-string 2) 1) 128))
+			 (1+ (length encoding))))
+	(setq
+	 chset (cdr (assoc-ignore-case encoding
+				       non-standard-icccm-encodings-alist)))
+	(cond ((null chset)
+	       ;; This charset is not supported--leave this extended
+	       ;; segment unaltered and skip over it.
+	       (goto-char (+ (point) textlen)))
+	      ((charsetp chset)
+	     ;; If it's a charset, replace the leading escape sequence
+	     ;; with a standard ISO-2022 sequence.  We will decode all
+	      ;; such segments later, in one go, when we exit the loop
+	       ;; or find an extended segment that names a coding
+	       ;; system, not a charset.
+	       (replace-match
+		(concat "\\1"
+			(if (= 0 (charset-iso-graphic-plane chset))
+			    ;; GL charsets
+			    (if (= 1 (charset-dimension chset)) "(" "$(")
+			  ;; GR charsets
+			  (if (= 96 (charset-chars chset))
+			      "-"
+			    (if (= 1 (charset-dimension chset)) ")" "$)")))
+			(string (charset-iso-final-char chset)))
+		t)
+	       (goto-char (+ (point) textlen)))
+	      ((coding-system-p chset)
+	     ;; If it's a coding system, we need to decode the segment
+	       ;; right away.  But first, decode what we've skipped
+	       ;; across until now.
+	       (when (> pt oldpt)
+		 (decode-coding-region oldpt pt 'ctext-no-compositions))
+	       (delete-region pt newpt)
+	       (set-marker newpt (+ newpt textlen))
+	       (decode-coding-region pt newpt chset)
+	       (goto-char newpt)
+	       (set-marker oldpt newpt))))
+      ;; Decode what's left.
+      (when (> (point) oldpt)
+	(decode-coding-region oldpt (point) 'ctext-no-compositions))
+     ;; This buffer started as unibyte, because the string we get from
+      ;; the X selection is a unibyte string.  We must now make it
+      ;; multibyte, so that the decoded text is inserted as multibyte
+      ;; into its buffer.
+      (set-buffer-multibyte t)
+      (set-buffer-modified-p modified-p)
+      (- (point-max) (point-min)))))
+
+(defvar non-standard-designations-alist
+  '(("$(0" . (big5 "big5-0" 2))
+    ("$(1" . (big5 "big5-0" 2))
+    ("-V"  . (t "iso8859-10" 1))
+    ("-Y"  . (t "iso8859-13" 1))
+    ("-_"  . (t "iso8859-14" 1))
+    ("-b"  . (t "iso8859-15" 1))
+    ("-f"  . (t "iso8859-16" 1)))
+  "Alist of ctext control sequences that introduce character sets which
+are not in the list of approved ICCCM encodings, and the corresponding
+coding system, identifier string, and number of octets per encoded
+character.
+
+Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)).  CTLSEQ
+is the control sequence (sans the leading ESC) that introduces the character
+set in the text encoded by compound-text.  ENCODING is a coding system
+symbol; if it is t, it means that the ctext coding system already encodes
+the text correctly, and only the leading control sequence needs to be altered.
+If ENCODING is a coding system, we need to re-encode the text with that
+coding system.  CHARSET is the ICCCM name of the charset we need to put into
+the leading control sequence.  NOCTETS is the number of octets (bytes) that
+encode each character in this charset.  NOCTETS can be 0 (meaning the number
+of octets per character is variable), 1, 2, 3, or 4.")
+
+(defun ctext-pre-write-conversion (from to)
+  "Encode characters between FROM and TO as Compound Text w/Extended Segments."
+  (buffer-disable-undo)	; minimize consing due to insertions and deletions
+  (narrow-to-region from to)
+  (encode-coding-region from to 'ctext-no-compositions)
+  ;; Replace ISO-2022 charset designations with extended segments, for
+  ;; those charsets that are not part of the official X registry.
+  (save-match-data
+    (goto-char (point-min))
+    (let ((newpt (make-marker))
+	  (case-fold-search nil)
+	  pt desig encode-info encoding chset noctets textlen)
+      (set-buffer-multibyte nil)
+      (while (re-search-forward "\e\\(\$([01]\\|-[VY_bf]\\)" nil 'move)
+	(setq desig (match-string 1)
+	      pt (point-marker)
+	      encode-info (cdr (assoc desig non-standard-designations-alist))
+	      encoding (car encode-info)
+	      chset (cadr encode-info)
+	      noctets (car (cddr encode-info)))
+	(skip-chars-forward "^\e")
+	(set-marker newpt (point))
+	(cond
+	 ((eq encoding t)  ; only the leading sequence needs to be changed
+	  (setq textlen (+ (- newpt pt) (length chset) 1))
+	  (replace-match (format "\e%%/%d%c%c%s"
+				 noctets
+				 (+ (/ textlen 128) 128)
+				 (+ (% textlen 128) 128)
+				 chset)
+			 t t))
+	 ((coding-system-p encoding) ; need to recode the entire segment...
+	  (set-marker pt (match-beginning 0))
+	  (decode-coding-region pt newpt 'ctext-no-compositions)
+	  (set-buffer-multibyte t)
+	  (encode-coding-region pt newpt encoding)
+	  (set-buffer-multibyte nil)
+	  (setq textlen (+ (- newpt pt) (length chset) 1))
+	  (goto-char pt)
+	  (insert (format "\e%%/%d%c%c%s"
+			  noctets
+			  (+ (/ textlen 128) 128)
+			  (+ (% textlen 128) 128)
+			  chset))))
+	(goto-char newpt))))
+  (set-buffer-multibyte t)
+  nil)
+
+(make-coding-system
+ 'compound-text-with-extensions 5 ?x
+ "Compound text encoding with ICCCM Extended Segment extensions.
+
+This coding system should be used only for X selections.  It is inappropriate
+for decoding and encoding files, process I/O, etc."
+ nil
+ '((post-read-conversion . ctext-post-read-conversion)
+   (pre-write-conversion . ctext-pre-write-conversion)))
+
+(define-coding-system-alias
+  'x-ctext-with-extensions 'compound-text-with-extensions)
+(define-coding-system-alias
+  'ctext-with-extensions 'compound-text-with-extensions)
+
 (make-coding-system
  'iso-safe 2 ?-
  "Convert all characters but ASCII to `?'."