From: Eli Zaretskii Date: Fri, 22 Feb 2002 10:45:22 +0000 (+0000) Subject: (ctext-no-compositions, compound-text-with-extensions): New coding systems. X-Git-Tag: emacs-21.2~62 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=ec2b93f7b56dd72ef1d532c3e26af458dc22e9bb;p=emacs.git (ctext-no-compositions, compound-text-with-extensions): New coding systems. (x-ctext-with-extensions, ctext-with-extensions): Aliases for compound-text-with-extensions. (non-standard-icccm-encodings-alist, non-standard-designations-alist): New variables. (ctext-post-read-conversion, ctext-pre-write-conversion): New functions. --- diff --git a/lisp/international/mule-conf.el b/lisp/international/mule-conf.el index 456f74a2089..6fed4151154 100644 --- a/lisp/international/mule-conf.el +++ b/lisp/international/mule-conf.el @@ -405,6 +405,188 @@ is treated as a character." (define-coding-system-alias 'x-ctext 'compound-text) (define-coding-system-alias 'ctext 'compound-text) +;; Same as compound-text, but doesn't produce composition escape +;; sequences. Used in post-read and pre-write conversions of +;; ctext-with-extensions, below. +(make-coding-system + 'ctext-no-compositions 2 ?x + "Compound text based generic encoding for decoding unknown messages. + +Like `compound-text', but does not produce escape sequences for compositions." + '((ascii t) (latin-iso8859-1 katakana-jisx0201 t) t t + nil ascii-eol ascii-cntl nil locking-shift single-shift nil nil nil + init-bol nil nil) + '((safe-charsets . t) + (mime-charset . x-ctext))) + +(defvar non-standard-icccm-encodings-alist + '(("ISO8859-15" . latin-iso8859-15) + ("ISO8859-14" . latin-iso8859-14) + ("KOI8-R" . koi8-r) + ("BIG5-0" . big5)) + "Alist of font charset names defined by XLFD, and the corresponding Emacs +charsets or coding systems.") + +;; Functions to support "Non-Standard Character Set Encodings" defined +;; by the ICCCM spec. We support that by converting the leading +;; sequence of the ``extended segment'' to the corresponding ISO-2022 +;; sequences (if the leading sequence names an Emacs charset), or decode +;; the segment (if it names a coding system). Encoding does the reverse. +(defun ctext-post-read-conversion (len) + "Decode LEN characters encoded as Compound Text with Extended Segments." + (buffer-disable-undo) ; minimize consing due to insertions and deletions + (narrow-to-region (point) (+ (point) len)) + (save-match-data + (let ((pt (point-marker)) + (oldpt (point-marker)) + (newpt (make-marker)) + (modified-p (buffer-modified-p)) + (case-fold-search nil) + last-coding-system-used + encoding textlen chset) + (while (re-search-forward + "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002" + nil 'move) + (set-marker newpt (point)) + (set-marker pt (match-beginning 0)) + (setq encoding (match-string 3)) + (setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128) + (- (aref (match-string 2) 1) 128)) + (1+ (length encoding)))) + (setq + chset (cdr (assoc-ignore-case encoding + non-standard-icccm-encodings-alist))) + (cond ((null chset) + ;; This charset is not supported--leave this extended + ;; segment unaltered and skip over it. + (goto-char (+ (point) textlen))) + ((charsetp chset) + ;; If it's a charset, replace the leading escape sequence + ;; with a standard ISO-2022 sequence. We will decode all + ;; such segments later, in one go, when we exit the loop + ;; or find an extended segment that names a coding + ;; system, not a charset. + (replace-match + (concat "\\1" + (if (= 0 (charset-iso-graphic-plane chset)) + ;; GL charsets + (if (= 1 (charset-dimension chset)) "(" "$(") + ;; GR charsets + (if (= 96 (charset-chars chset)) + "-" + (if (= 1 (charset-dimension chset)) ")" "$)"))) + (string (charset-iso-final-char chset))) + t) + (goto-char (+ (point) textlen))) + ((coding-system-p chset) + ;; If it's a coding system, we need to decode the segment + ;; right away. But first, decode what we've skipped + ;; across until now. + (when (> pt oldpt) + (decode-coding-region oldpt pt 'ctext-no-compositions)) + (delete-region pt newpt) + (set-marker newpt (+ newpt textlen)) + (decode-coding-region pt newpt chset) + (goto-char newpt) + (set-marker oldpt newpt)))) + ;; Decode what's left. + (when (> (point) oldpt) + (decode-coding-region oldpt (point) 'ctext-no-compositions)) + ;; This buffer started as unibyte, because the string we get from + ;; the X selection is a unibyte string. We must now make it + ;; multibyte, so that the decoded text is inserted as multibyte + ;; into its buffer. + (set-buffer-multibyte t) + (set-buffer-modified-p modified-p) + (- (point-max) (point-min))))) + +(defvar non-standard-designations-alist + '(("$(0" . (big5 "big5-0" 2)) + ("$(1" . (big5 "big5-0" 2)) + ("-V" . (t "iso8859-10" 1)) + ("-Y" . (t "iso8859-13" 1)) + ("-_" . (t "iso8859-14" 1)) + ("-b" . (t "iso8859-15" 1)) + ("-f" . (t "iso8859-16" 1))) + "Alist of ctext control sequences that introduce character sets which +are not in the list of approved ICCCM encodings, and the corresponding +coding system, identifier string, and number of octets per encoded +character. + +Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)). CTLSEQ +is the control sequence (sans the leading ESC) that introduces the character +set in the text encoded by compound-text. ENCODING is a coding system +symbol; if it is t, it means that the ctext coding system already encodes +the text correctly, and only the leading control sequence needs to be altered. +If ENCODING is a coding system, we need to re-encode the text with that +coding system. CHARSET is the ICCCM name of the charset we need to put into +the leading control sequence. NOCTETS is the number of octets (bytes) that +encode each character in this charset. NOCTETS can be 0 (meaning the number +of octets per character is variable), 1, 2, 3, or 4.") + +(defun ctext-pre-write-conversion (from to) + "Encode characters between FROM and TO as Compound Text w/Extended Segments." + (buffer-disable-undo) ; minimize consing due to insertions and deletions + (narrow-to-region from to) + (encode-coding-region from to 'ctext-no-compositions) + ;; Replace ISO-2022 charset designations with extended segments, for + ;; those charsets that are not part of the official X registry. + (save-match-data + (goto-char (point-min)) + (let ((newpt (make-marker)) + (case-fold-search nil) + pt desig encode-info encoding chset noctets textlen) + (set-buffer-multibyte nil) + (while (re-search-forward "\e\\(\$([01]\\|-[VY_bf]\\)" nil 'move) + (setq desig (match-string 1) + pt (point-marker) + encode-info (cdr (assoc desig non-standard-designations-alist)) + encoding (car encode-info) + chset (cadr encode-info) + noctets (car (cddr encode-info))) + (skip-chars-forward "^\e") + (set-marker newpt (point)) + (cond + ((eq encoding t) ; only the leading sequence needs to be changed + (setq textlen (+ (- newpt pt) (length chset) 1)) + (replace-match (format "\e%%/%d%c%c%s" + noctets + (+ (/ textlen 128) 128) + (+ (% textlen 128) 128) + chset) + t t)) + ((coding-system-p encoding) ; need to recode the entire segment... + (set-marker pt (match-beginning 0)) + (decode-coding-region pt newpt 'ctext-no-compositions) + (set-buffer-multibyte t) + (encode-coding-region pt newpt encoding) + (set-buffer-multibyte nil) + (setq textlen (+ (- newpt pt) (length chset) 1)) + (goto-char pt) + (insert (format "\e%%/%d%c%c%s" + noctets + (+ (/ textlen 128) 128) + (+ (% textlen 128) 128) + chset)))) + (goto-char newpt)))) + (set-buffer-multibyte t) + nil) + +(make-coding-system + 'compound-text-with-extensions 5 ?x + "Compound text encoding with ICCCM Extended Segment extensions. + +This coding system should be used only for X selections. It is inappropriate +for decoding and encoding files, process I/O, etc." + nil + '((post-read-conversion . ctext-post-read-conversion) + (pre-write-conversion . ctext-pre-write-conversion))) + +(define-coding-system-alias + 'x-ctext-with-extensions 'compound-text-with-extensions) +(define-coding-system-alias + 'ctext-with-extensions 'compound-text-with-extensions) + (make-coding-system 'iso-safe 2 ?- "Convert all characters but ASCII to `?'."