From 05e6f5dcf672b220bd759beded0cae38ceb8a10b Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Thu, 27 Jul 2000 06:01:19 +0000 Subject: [PATCH] (Qsafe_charsets): This variable deleted. (Qsafe_chars, Vchar_coding_system_table, Qchar_coding_system): New variables. (coding_safe_chars): New function. (CODING_SAFE_CHAR_P): New macro. (CHARSET_OK): New arg C. Call CODING_SAFE_CHAR_P instead of checking safe_charsets member of the coding system. Caller changed. (detect_coding_iso2022): New local variable safe_chars. (DECODE_DESIGNATION): Call CODING_SAFE_CHAR_P instead of checking safe_charsets member of the coding system. (decode_coding_iso2022): New local variable safe_chars. (ENCODE_ISO_CHARACTER_DIMENSION1): Don't check unsafe chars here. (ENCODE_ISO_CHARACTER_DIMENSION2): Likewise. (ENCODE_ISO_CHARACTER): Arguments changed. Caller changed. (ENCODE_UNSAFE_CHARACTER): New macro. (encode_coding_iso2022): New local variable safe_chars. Check unsafe chars. (setup_coding_system): Delete the code to initialize coding->safe_charsets. (intersection, find_safe_codings): New functions. (Ffind_coding_systems_region_internal): New function. (syms_of_coding): Defsubr it. Initialize Qsafe_chars, Qchar_coding_system. Make Vchar_coding_system_table a Lisp variable and initialize it. --- src/coding.c | 354 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 267 insertions(+), 87 deletions(-) diff --git a/src/coding.c b/src/coding.c index cc636af0ba0..d67e07687cd 100644 --- a/src/coding.c +++ b/src/coding.c @@ -337,7 +337,7 @@ Lisp_Object Qbuffer_file_coding_system; Lisp_Object Qpost_read_conversion, Qpre_write_conversion; Lisp_Object Qno_conversion, Qundecided; Lisp_Object Qcoding_system_history; -Lisp_Object Qsafe_charsets; +Lisp_Object Qsafe_chars; Lisp_Object Qvalid_codes; extern Lisp_Object Qinsert_file_contents, Qwrite_region; @@ -471,6 +471,28 @@ Lisp_Object Vdefault_process_coding_system; to avoid infinite recursive call. 
*/ static int inhibit_pre_post_conversion; +/* Char-table containing safe coding systems of each character. */ +Lisp_Object Vchar_coding_system_table; +Lisp_Object Qchar_coding_system; + +/* Return `safe-chars' property of coding system CODING. Don't check + validity of CODING. */ + +Lisp_Object +coding_safe_chars (coding) + struct coding_system *coding; +{ + Lisp_Object coding_spec, plist, safe_chars; + + coding_spec = Fget (coding->symbol, Qcoding_system); + plist = XVECTOR (coding_spec)->contents[3]; + safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars); + return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt); +} + +#define CODING_SAFE_CHAR_P(safe_chars, c) \ + (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c))) + /*** 2. Emacs internal format (emacs-mule) handlers ***/ @@ -797,12 +819,14 @@ decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes) enum iso_code_class_type iso_code_class[256]; -#define CHARSET_OK(idx, charset) \ - (coding_system_table[idx] \ - && (coding_system_table[idx]->safe_charsets[charset] \ - || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \ - (coding_system_table[idx], charset) \ - != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))) +#define CHARSET_OK(idx, charset, c) \ + (coding_system_table[idx] \ + && (charset == CHARSET_ASCII \ + || (safe_chars = coding_safe_chars (coding_system_table[idx]), \ + CODING_SAFE_CHAR_P (safe_chars, c))) \ + && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \ + charset) \ + != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)) #define SHIFT_OUT_OK(idx) \ (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0) @@ -830,6 +854,7 @@ detect_coding_iso2022 (src, src_end) /* Dummy for ONE_MORE_BYTE. 
*/ struct coding_system dummy_coding; struct coding_system *coding = &dummy_coding; + Lisp_Object safe_chars; reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; while (mask && src < src_end) @@ -890,19 +915,20 @@ detect_coding_iso2022 (src, src_end) /* We found a valid designation sequence for CHARSET. */ mask &= ~CODING_CATEGORY_MASK_ISO_8BIT; - if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset)) + c = MAKE_CHAR (charset, 0, 0); + if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c)) mask_found |= CODING_CATEGORY_MASK_ISO_7; else mask &= ~CODING_CATEGORY_MASK_ISO_7; - if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset)) + if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c)) mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT; else mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT; - if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset)) + if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c)) mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE; else mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE; - if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset)) + if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c)) mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE; else mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE; @@ -1042,16 +1068,17 @@ detect_coding_iso2022 (src, src_end) /* Set designation state into CODING. 
*/ #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \ do { \ - int charset; \ + int charset, c; \ \ if (final_char < '0' || final_char >= 128) \ goto label_invalid_code; \ charset = ISO_CHARSET_TABLE (make_number (dimension), \ make_number (chars), \ make_number (final_char)); \ + c = MAKE_CHAR (charset, 0, 0); \ if (charset >= 0 \ && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \ - || coding->safe_charsets[charset])) \ + || CODING_SAFE_CHAR_P (safe_chars, c))) \ { \ if (coding->spec.iso2022.last_invalid_designation_register == 0 \ && reg == 0 \ @@ -1238,6 +1265,9 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) unsigned char *src_base; int c, charset; Lisp_Object translation_table; + Lisp_Object safe_chars; + + safe_chars = coding_safe_chars (coding); if (NILP (Venable_character_translation)) translation_table = Qnil; @@ -1684,16 +1714,6 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) *dst++ = c1 | 0x80; \ break; \ } \ - else if (coding->flags & CODING_FLAG_ISO_SAFE \ - && !coding->safe_charsets[charset]) \ - { \ - /* We should not encode this character, instead produce one or \ - two `?'s. */ \ - *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \ - if (CHARSET_WIDTH (charset) == 2) \ - *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \ - break; \ - } \ else \ /* Since CHARSET is not yet invoked to any graphic planes, we \ must invoke it, or, at first, designate it to some graphic \ @@ -1727,16 +1747,6 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \ break; \ } \ - else if (coding->flags & CODING_FLAG_ISO_SAFE \ - && !coding->safe_charsets[charset]) \ - { \ - /* We should not encode this character, instead produce one or \ - two `?'s. 
*/ \ - *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \ - if (CHARSET_WIDTH (charset) == 2) \ - *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \ - break; \ - } \ else \ /* Since CHARSET is not yet invoked to any graphic planes, we \ must invoke it, or, at first, designate it to some graphic \ @@ -1745,35 +1755,47 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) dst = encode_invocation_designation (charset, coding, dst); \ } while (1) -#define ENCODE_ISO_CHARACTER(charset, c1, c2) \ +#define ENCODE_ISO_CHARACTER(c) \ + do { \ + int charset, c1, c2; \ + \ + SPLIT_CHAR (c, charset, c1, c2); \ + if (CHARSET_DEFINED_P (charset)) \ + { \ + if (CHARSET_DIMENSION (charset) == 1) \ + { \ + if (charset == CHARSET_ASCII \ + && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \ + charset = charset_latin_jisx0201; \ + ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1); \ + } \ + else \ + { \ + if (charset == charset_jisx0208 \ + && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \ + charset = charset_jisx0208_1978; \ + ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2); \ + } \ + } \ + else \ + { \ + *dst++ = c1; \ + if (c2 >= 0) \ + *dst++ = c2; \ + } \ + } while (0) + + +/* Instead of encoding character C, produce one or two `?'s. 
*/ + +#define ENCODE_UNSAFE_CHARACTER(c) \ do { \ - int alt_charset = charset; \ - \ - if (CHARSET_DEFINED_P (charset)) \ - { \ - if (CHARSET_DIMENSION (charset) == 1) \ - { \ - if (charset == CHARSET_ASCII \ - && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \ - alt_charset = charset_latin_jisx0201; \ - ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1); \ - } \ - else \ - { \ - if (charset == charset_jisx0208 \ - && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \ - alt_charset = charset_jisx0208_1978; \ - ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2); \ - } \ - } \ - else \ - { \ - *dst++ = c1; \ - if (c2 >= 0) \ - *dst++ = c2; \ - } \ + ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \ + if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \ + ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \ } while (0) + /* Produce designation and invocation codes at a place pointed by DST to use CHARSET. The element `spec.iso2022' of *CODING is updated. Return new DST. */ @@ -1997,6 +2019,9 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) unsigned char *src_base; int c; Lisp_Object translation_table; + Lisp_Object safe_chars; + + safe_chars = coding_safe_chars (coding); if (NILP (Venable_character_translation)) translation_table = Qnil; @@ -2011,8 +2036,6 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) coding->errors = 0; while (1) { - int charset, c1, c2; - src_base = src; if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19))) @@ -2065,8 +2088,11 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) } else { - SPLIT_CHAR (c, charset, c1, c2); - ENCODE_ISO_CHARACTER (charset, c1, c2); + if (coding->flags & CODING_FLAG_ISO_SAFE + && ! 
CODING_SAFE_CHAR_P (safe_chars, c)) + ENCODE_UNSAFE_CHARACTER (c); + else + ENCODE_ISO_CHARACTER (c); if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) coding->composition_rule_follows = 1; } @@ -2125,17 +2151,17 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) } } else if (ASCII_BYTE_P (c)) - ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1); + ENCODE_ISO_CHARACTER (c); else if (SINGLE_BYTE_CHAR_P (c)) { *dst++ = c; coding->errors++; } + else if (coding->flags & CODING_FLAG_ISO_SAFE + && ! CODING_SAFE_CHAR_P (safe_chars, c)) + ENCODE_UNSAFE_CHARACTER (c); else - { - SPLIT_CHAR (c, charset, c1, c2); - ENCODE_ISO_CHARACTER (charset, c1, c2); - } + ENCODE_ISO_CHARACTER (c); coding->consumed_char++; } @@ -2970,23 +2996,6 @@ setup_coding_system (coding_system, coding) else goto label_invalid_coding_system; - val = Fplist_get (plist, Qsafe_charsets); - if (EQ (val, Qt)) - { - for (i = 0; i <= MAX_CHARSET; i++) - coding->safe_charsets[i] = 1; - } - else - { - bzero (coding->safe_charsets, MAX_CHARSET + 1); - while (CONSP (val)) - { - if ((i = get_charset_id (XCAR (val))) >= 0) - coding->safe_charsets[i] = 1; - val = XCDR (val); - } - } - /* If the coding system has non-nil `composition' property, enable composition handling. */ val = Fplist_get (plist, Qcomposition); @@ -5542,6 +5551,160 @@ highest priority.") !NILP (highest)); } +/* Return an intersection of lists L1 and L2. */ + +static Lisp_Object +intersection (l1, l2) + Lisp_Object l1, l2; +{ + Lisp_Object val; + + for (val = Qnil; CONSP (l1); l1 = XCDR (l1)) + { + if (!NILP (Fmemq (XCAR (l1), l2))) + val = Fcons (XCAR (l1), val); + } + return val; +} + + +/* Subroutine for Ffind_coding_systems_region_internal. + + Return a list of coding systems that safely encode the multibyte + text between P and PEND. SAFE_CODINGS is t if no character has + restricted the candidates yet; otherwise it is the list of coding + systems still possible, and nil means that none is left. 
+ + WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An + element of WORK_TABLE is set to t once the element is looked up. + + If a non-ASCII single byte char is found, set + *single_byte_char_found to 1. */ + +static Lisp_Object +find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found) + unsigned char *p, *pend; + Lisp_Object safe_codings, work_table; + int *single_byte_char_found; +{ + int c, len, idx; + Lisp_Object val; + + while (p < pend) + { + c = STRING_CHAR_AND_LENGTH (p, pend - p, len); + p += len; + if (ASCII_BYTE_P (c)) + /* We can ignore ASCII characters here. */ + continue; + if (SINGLE_BYTE_CHAR_P (c)) + *single_byte_char_found = 1; + if (NILP (safe_codings)) + continue; + /* Check the safe coding systems for C. */ + val = char_table_ref_and_index (work_table, c, &idx); + if (EQ (val, Qt)) + /* This element was already checked. Ignore it. */ + continue; + /* Remember that we checked this element. */ + CHAR_TABLE_SET (work_table, idx, Qt); + + /* If there are some safe coding systems for C and we have + already found the other set of coding systems for the + different characters, get the intersection of them. */ + if (!EQ (safe_codings, Qt) && !NILP (val)) + val = intersection (safe_codings, val); + safe_codings = val; + } + return safe_codings; +} + + +/* Return a list of coding systems that safely encode the text between + START and END. If the text contains only ASCII or is unibyte, + return t. 
*/ + +DEFUN ("find-coding-systems-region-internal", + Ffind_coding_systems_region_internal, + Sfind_coding_systems_region_internal, 2, 2, 0, + "Internal use only.") + (start, end) + Lisp_Object start, end; +{ + Lisp_Object work_table, safe_codings; + int non_ascii_p = 0; + int single_byte_char_found = 0; + unsigned char *p1, *p1end, *p2, *p2end, *p; + Lisp_Object args[2]; + + if (STRINGP (start)) + { + if (!STRING_MULTIBYTE (start)) + return Qt; + p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start)); + p2 = p2end = p1end; + if (XSTRING (start)->size != STRING_BYTES (XSTRING (start))) + non_ascii_p = 1; + } + else + { + int from, to, stop; + + CHECK_NUMBER_COERCE_MARKER (start, 0); + CHECK_NUMBER_COERCE_MARKER (end, 1); + if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end)) + args_out_of_range (start, end); + if (NILP (current_buffer->enable_multibyte_characters)) + return Qt; + from = CHAR_TO_BYTE (XINT (start)); + to = CHAR_TO_BYTE (XINT (end)); + stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to; + p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from); + if (stop == to) + p2 = p2end = p1end; + else + p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop); + if (XINT (end) - XINT (start) != to - from) + non_ascii_p = 1; + } + + if (!non_ascii_p) + { + /* We are sure that the text contains no multibyte character. + Check if it contains eight-bit-graphic. */ + p = p1; + for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++); + if (p == p1end) + { + for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++); + if (p == p2end) + return Qt; + } + } + + /* The text contains non-ASCII characters. */ + work_table = Fcopy_sequence (Vchar_coding_system_table); + safe_codings = find_safe_codings (p1, p1end, Qt, work_table, + &single_byte_char_found); + if (p2 < p2end) + safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table, + &single_byte_char_found); + + if (!single_byte_char_found) + { + /* Append generic coding systems. 
*/ + Lisp_Object args[2]; + args[0] = safe_codings; + args[1] = Fchar_table_extra_slot (Vchar_coding_system_table, + make_number (0)); + safe_codings = Fappend (make_number (2), args); + } + else + safe_codings = Fcons (Qraw_text, Fcons (Qemacs_mule, safe_codings)); + return safe_codings; +} + + Lisp_Object code_convert_region1 (start, end, coding_system, encodep) Lisp_Object start, end, coding_system; @@ -6196,8 +6359,18 @@ syms_of_coding () Qtranslation_table_for_encode = intern ("translation-table-for-encode"); staticpro (&Qtranslation_table_for_encode); - Qsafe_charsets = intern ("safe-charsets"); - staticpro (&Qsafe_charsets); + Qsafe_chars = intern ("safe-chars"); + staticpro (&Qsafe_chars); + + Qchar_coding_system = intern ("char-coding-system"); + staticpro (&Qchar_coding_system); + + /* Intern this now in case it isn't already done. + Setting this variable twice is harmless. + But don't staticpro it here--that is done in alloc.c. */ + Qchar_table_extra_slots = intern ("char-table-extra-slots"); + Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0)); + Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (1)); Qvalid_codes = intern ("valid-codes"); staticpro (&Qvalid_codes); @@ -6214,6 +6387,7 @@ syms_of_coding () defsubr (&Scheck_coding_system); defsubr (&Sdetect_coding_region); defsubr (&Sdetect_coding_string); + defsubr (&Sfind_coding_systems_region_internal); defsubr (&Sdecode_coding_region); defsubr (&Sencode_coding_region); defsubr (&Sdecode_coding_string); @@ -6417,6 +6591,12 @@ coding system used in each operation can't encode the text.\n\ The default value is `select-safe-coding-system' (which see)."); Vselect_safe_coding_system_function = Qnil; + DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table, + "Char-table containing safe coding systems of each characters.\n\ +Each element doesn't include such generic coding systems that can\n\ +encode any characters. 
They are in the first extra slot."); + Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil); + DEFVAR_BOOL ("inhibit-iso-escape-detection", &inhibit_iso_escape_detection, "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\ -- 2.39.2