/* Coding system handler (conversion, detection, and etc).
Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
Licensed to the Free Software Foundation.
+ Copyright (C) 2001 Free Software Foundation, Inc.
This file is part of GNU Emacs.
} while (0)
-/* Produce a multibyte form of characater C to `dst'. Jump to
+/* Produce a multibyte form of character C to `dst'. Jump to
`label_end_of_loop' if there's not enough space at `dst'.
If we are now in the middle of a composition sequence, the decoded
struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
/* Table of coding category masks. Nth element is a mask for a coding
- cateogry of which priority is Nth. */
+ category of which priority is Nth. */
static
int coding_priorities[CODING_CATEGORY_IDX_MAX];
In that case, a sequence of one-byte codes has a slightly different
form.
- At first, all characters in eight-bit-control are represented by
+ Firstly, all characters in eight-bit-control are represented by
one-byte sequences which are their 8-bit code.
Next, character composition data are represented by the byte
METHOD is 0xF0 plus one of composition method (enum
composition_method),
- BYTES is 0x20 plus a byte length of this composition data,
+ BYTES is 0xA0 plus the byte length of these composition data,
- CHARS is 0x20 plus a number of characters composed by this
+ CHARS is 0xA0 plus the number of characters composed by these
data,
- COMPONENTs are characters of multibye form or composition
+ COMPONENTs are characters of multibyte form or composition
rules encoded by two-byte of ASCII codes.
In addition, for backward compatibility, the following formats are
/* Get one byte from a data pointed by SRC and increment SRC. If SRC
- is not less than SRC_END, return -1 without inccrementing Src. */
+ is not less than SRC_END, return -1 without incrementing SRC. */
#define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
unsigned char *dst = *destination;
int method, data_len, nchars;
unsigned char *src_base = src++;
- /* Store compoments of composition. */
+ /* Store components of composition. */
int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
int ncomponent;
/* Store multibyte form of characters to be composed. This is for
is encoded using bytes less than 128. This may make the encoded
text a little bit longer, but the text passes more easily through
several types of gateway, some of which strip off the MSB (Most
- Signigant Bit).
+ Significant Bit).
There are two kinds of character sets: control character sets and
graphic character sets. The former contain control characters such
Now you may notice that there are a lot of ways of encoding the
same multilingual text in ISO2022. Actually, there exist many
coding systems such as Compound Text (used in X11's inter client
- communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
- (used in Korean internet), EUC (Extended UNIX Code, used in Asian
+ communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
+ (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
localized platforms), and all of these are variants of ISO2022.
In addition to the above, Emacs handles two more kinds of escape
COMPOSITION_RELATIVE:
ESC 0 CHAR [ CHAR ] ESC 1
- COMPOSITOIN_WITH_RULE:
+ COMPOSITION_WITH_RULE:
ESC 2 CHAR [ RULE CHAR ] ESC 1
COMPOSITION_WITH_ALTCHARS:
ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
that coding->cmp_data has enough space to store the \
information about the composition. If not, terminate the \
current decoding loop, allocate one more memory block for \
- coding->cmp_data in the calller, then start the decoding \
+ coding->cmp_data in the caller, then start the decoding \
loop again. We can't allocate memory here directly because \
it may cause buffer/string relocation. */ \
if (!coding->cmp_data \
} \
} while (0)
-/* Handle compositoin end sequence ESC 1. */
+/* Handle composition end sequence ESC 1. */
#define DECODE_COMPOSITION_END(c1) \
do { \
goto label_invalid_code;
/* For the moment, nested direction is not supported.
So, `coding->mode & CODING_MODE_DIRECTION' zero means
- left-to-right, and nozero means right-to-left. */
+ left-to-right, and nonzero means right-to-left. */
ONE_MORE_BYTE (c1);
switch (c1)
{
It is not enough to say just "ISO2022" on encoding, we have to
specify more details. In Emacs, each ISO2022 coding system
variant has the following specifications:
- 1. Initial designation to G0 thru G3.
+ 1. Initial designation to G0 through G3.
2. Allows short-form designation?
3. ASCII should be designated to G0 before control characters?
4. ASCII should be designated to G0 at end of line?
/* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
if (coding->cmp_data_index == coding->cmp_data_start + data[0])
/* We have consumed components of the composition.
- What follows in SRC is the compositions's base
+ What follows in SRC is the composition's base
text. */
ENCODE_COMPOSITION_FAKE_START (coding);
else
--- CODE RANGE of SJIS ---
(character set) (range)
ASCII 0x00 .. 0x7F
- KATAKANA-JISX0201 0xA0 .. 0xDF
+ KATAKANA-JISX0201 0xA1 .. 0xDF
JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
(2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
-------------------------------
while (1)
{
ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
- if (c >= 0x81)
+ if (c < 0x80)
+ continue;
+ if (c == 0x80 || c == 0xA0 || c > 0xEF)
+ return 0;
+ if (c <= 0x9F || c >= 0xE0)
{
- if (c <= 0x9F || (c >= 0xE0 && c <= 0xEF))
- {
- ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
- if (c < 0x40 || c == 0x7F || c > 0xFC)
- return 0;
- }
- else if (c > 0xDF)
+ ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
+ if (c < 0x40 || c == 0x7F || c > 0xFC)
return 0;
}
}
while (1)
{
ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
- if (c >= 0xA1)
- {
- ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
- if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
- return 0;
- }
+ if (c < 0x80)
+ continue;
+ if (c < 0xA1 || c > 0xFE)
+ return 0;
+ ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
+ if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
+ return 0;
}
label_end_of_loop:
return CODING_CATEGORY_MASK_BIG5;
{
if (sjis_p)
{
- if (c1 >= 0xF0)
+ if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
goto label_invalid_code;
- if (c1 < 0xA0 || c1 >= 0xE0)
+ if (c1 <= 0x9F || c1 >= 0xE0)
{
/* SJIS -> JISX0208 */
ONE_MORE_BYTE (c2);
else
{
/* BIG5 -> Big5 */
- if (c1 < 0xA1 || c1 > 0xFE)
+ if (c1 < 0xA0 || c1 > 0xFE)
goto label_invalid_code;
ONE_MORE_BYTE (c2);
if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
format of end-of-line according to `coding->eol_type'. It also
- convert multibyte form 8-bit characers to unibyte if
+ converts multibyte form 8-bit characters to unibyte if
CODING->src_multibyte is nonzero. If `coding->mode &
CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
also means end-of-line. */
if (NILP (coding_system))
goto label_invalid_coding_system;
+ label_retry:
coding_spec = Fget (coding_system, Qcoding_system);
if (!VECTORP (coding_spec)
|| XVECTOR (coding_spec)->size != 5
|| !CONSP (XVECTOR (coding_spec)->contents[3]))
- goto label_invalid_coding_system;
+ {
+ /* Should we do an autoload? Autoload function forms for coding
+ systems have Qcoding_system as their fifth element. */
+ coding_spec = XSYMBOL (coding_spec)->function;
+ if (CONSP (coding_spec)
+ && EQ (XCAR (coding_spec), Qautoload)
+ && EQ (Fnth (make_number (4), coding_spec), Qcoding_system))
+ {
+ struct gcpro gcpro1, gcpro2;
+
+ GCPRO2 (coding_spec, coding_system);
+ do_autoload (coding_spec, coding_system);
+ UNGCPRO;
+
+ goto label_retry;
+ }
+ else
+ goto label_invalid_coding_system;
+ }
eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
if (VECTORP (eol_type))
`translation-table-for-decode', `translation-table-for-encode'. */
plist = XVECTOR (coding_spec)->contents[3];
/* Pre & post conversion functions should be disabled if
- inhibit_eol_conversion is nozero. This is the case that a code
+ inhibit_pre_post_conversion is nonzero. This is the case that a code
conversion function is called while those functions are running. */
if (! inhibit_pre_post_conversion)
{
o coding-category-iso-7-else
The category for a coding system which has the same code range
- as ISO2022 of 7-bit environemnt but uses locking shift or
+ as ISO2022 of 7-bit environment but uses locking shift or
single shift functions. Assigned the coding-system (Lisp
symbol) `iso-2022-7bit-lock' by default.
o coding-category-iso-8-else
The category for a coding system which has the same code range
- as ISO2022 of 8-bit environemnt but uses locking shift or
+ as ISO2022 of 8-bit environment but uses locking shift or
single shift functions. Assigned the coding-system (Lisp
symbol) `iso-2022-8bit-ss2' by default.
int try;
if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
- c = *src++ - 0x20;
+ c = src[1] - 0x20;
if (c < 0xA0)
{
coding->produced_char = coding->produced;
coding->spec.ccl.cr_carryover = ccl->cr_consumed;
}
+ else if (!ccl->eight_bit_control)
+ {
+ /* The produced bytes form a valid multibyte sequence. */
+ coding->produced_char
+ = multibyte_chars_in_text (destination, coding->produced);
+ coding->spec.ccl.eight_bit_carryover[0] = 0;
+ }
else
{
/* On decoding, the destination should always multibyte. But,
{
detect_eol (coding, source, src_bytes);
/* We had better recover the original eol format if we
- encounter an inconsitent eol format while decoding. */
+ encounter an inconsistent eol format while decoding. */
coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
}
case CODING_CATEGORY_IDX_ISO_7:
case CODING_CATEGORY_IDX_ISO_7_TIGHT:
{
- /* We can skip all charactes at the tail except for 8-bit
+ /* We can skip all characters at the tail except for 8-bit
codes and ESC and the following 2-byte at the tail. */
unsigned char *eight_bit = NULL;
if (!NILP (CHAR_TABLE_REF (translation_table, i)))
break;
if (i < 128)
- /* Some ASCII character should be tranlsated. We give up
+ /* Some ASCII character should be translated. We give up
shrinking. */
return;
}
}
/* Reflect the saved information about compositions to OBJ.
- CODING->cmp_data points to a memory block for the informaiton. OBJ
+ CODING->cmp_data points to a memory block for the information. OBJ
is a buffer or a string, defaults to the current buffer. */
void
replace_range (insdel.c) to know what we are doing.
If REPLACE is zero, it is assumed that the source text is unibyte.
- Otherwize, it is assumed that the source text is multibyte. */
+ Otherwise, it is assumed that the source text is multibyte. */
int
code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
encodings again in vain. */
coding->type = coding_type_emacs_mule;
coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
+ /* As emacs-mule decoder will handle composition, we
+ need this setting to allocate coding->cmp_data
+ later. */
+ coding->composing = COMPOSITION_NO;
}
}
if (coding->eol_type == CODING_EOL_UNDECIDED
if (coding->eol_type == CODING_EOL_UNDECIDED)
coding->eol_type = CODING_EOL_LF;
/* We had better recover the original eol format if we
- encounter an inconsitent eol format while decoding. */
+ encounter an inconsistent eol format while decoding. */
coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
}
}
len -= total_skip; len_byte -= total_skip;
}
- /* For converion, we must put the gap before the text in addition to
+ /* For conversion, we must put the gap before the text in addition to
making the gap larger for efficient decoding. The required gap
size starts from 2000 which is the magic number used in make_gap.
But, after one batch of conversion, it will be incremented if we
if (first)
{
/* We have just done the first batch of conversion which was
- stoped because of insufficient gap. Let's reconsider the
+ stopped because of insufficient gap. Let's reconsider the
required gap size (i.e. SRT - DST) now.
We have converted ORIG bytes (== coding->consumed) into
inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
}
- /* If we have shrinked the conversion area, adjust it now. */
+ /* If we shrank the conversion area, adjust it now. */
if (total_skip > 0)
{
if (tail_skip > 0)
{
detect_coding (coding, XSTRING (str)->data, to_byte);
if (coding->type == coding_type_undecided)
- coding->type = coding_type_emacs_mule;
+ {
+ coding->type = coding_type_emacs_mule;
+ coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
+ /* As emacs-mule decoder will handle composition, we
+ need this setting to allocate coding->cmp_data
+ later. */
+ coding->composing = COMPOSITION_NO;
+ }
}
if (coding->eol_type == CODING_EOL_UNDECIDED
&& coding->type != coding_type_ccl)
if (coding->eol_type == CODING_EOL_UNDECIDED)
coding->eol_type = CODING_EOL_LF;
/* We had better recover the original eol format if we
- encounter an inconsitent eol format while decoding. */
+ encounter an inconsistent eol format while decoding. */
coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
}
}
(obj)
Lisp_Object obj;
{
+ Lisp_Object prop;
+
if (NILP (obj))
return Qt;
if (!SYMBOLP (obj))
return Qnil;
/* Get coding-spec vector for OBJ. */
- obj = Fget (obj, Qcoding_system);
- return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
- ? Qt : Qnil);
+ prop = Fget (obj, Qcoding_system);
+ if (VECTORP (prop) && XVECTOR (prop)->size == 5)
+ return Qt;
+ prop = XSYMBOL (obj)->function;
+ if (CONSP (prop)
+ && EQ (XCAR (prop), Qautoload)
+ && EQ (Fnth (make_number (4), prop), Qcoding_system))
+ return Qt;
+ else
+ return Qnil;
}
DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
{
CHECK_SYMBOL (coding_system, 0);
if (!NILP (Fcoding_system_p (coding_system)))
- return coding_system;
+ {
+ if (!NILP (coding_system)
+ && NILP (Fget (coding_system, Qcoding_system)))
+ {
+ struct gcpro gcpro1;
+
+ GCPRO1 (coding_system);
+ do_autoload (XSYMBOL (coding_system)->function, coding_system);
+ UNGCPRO;
+ }
+ return coding_system;
+ }
while (1)
Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
}
{
int from, to;
int from_byte, to_byte;
+ int include_anchor_byte = 0;
CHECK_NUMBER_COERCE_MARKER (start, 0);
CHECK_NUMBER_COERCE_MARKER (end, 1);
if (from < GPT && to >= GPT)
move_gap_both (to, to_byte);
+ /* If an anchor byte `\0' follows the region, we include it in
+ the detecting source. Then code detectors can handle the trailing
+ byte sequence more accurately.
+ Fix me: This is not a perfect solution. It is better that we
+ add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
+ */
+ if (to == Z || (to == GPT && GAP_SIZE > 0))
+ include_anchor_byte = 1;
return detect_coding_system (BYTE_POS_ADDR (from_byte),
- to_byte - from_byte,
+ to_byte - from_byte + include_anchor_byte,
!NILP (highest),
!NILP (current_buffer
->enable_multibyte_characters));
CHECK_STRING (string, 0);
return detect_coding_system (XSTRING (string)->data,
- STRING_BYTES (XSTRING (string)),
+ /* "+ 1" is to include the anchor byte
+ `\0'. With this, code detectors can
+ handle the trailing bytes more
+ accurately. */
+ STRING_BYTES (XSTRING (string)) + 1,
!NILP (highest),
STRING_MULTIBYTE (string));
}
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
2, 3, 0,
"Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
-Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
+Optional arg NOCOPY non-nil means it is OK to return STRING itself\n\
if the decoding operation is trivial.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
2, 3, 0,
"Encode STRING to CODING-SYSTEM, and return the result.\n\
-Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
+Optional arg NOCOPY non-nil means it is OK to return STRING itself\n\
if the encoding operation is trivial.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
/* We had better not send unsafe characters to terminal. */
terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
- /* Characer composition should be disabled. */
+ /* Character composition should be disabled. */
terminal_coding.composing = COMPOSITION_DISABLED;
/* Error notification should be suppressed. */
terminal_coding.suppress_error = 1;
CHECK_SYMBOL (coding_system, 0);
setup_coding_system (Fcheck_coding_system (coding_system),
&safe_terminal_coding);
- /* Characer composition should be disabled. */
+ /* Character composition should be disabled. */
safe_terminal_coding.composing = COMPOSITION_DISABLED;
/* Error notification should be suppressed. */
terminal_coding.suppress_error = 1;
{
CHECK_SYMBOL (coding_system, 0);
setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
- /* Characer composition should be disabled. */
+ /* Character composition should be disabled. */
keyboard_coding.composing = COMPOSITION_DISABLED;
return Qnil;
}
operation = args[0];
if (!SYMBOLP (operation)
|| !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
- error ("Invalid first arguement");
+ error ("Invalid first argument");
if (nargs < 1 + XINT (target_idx))
error ("Too few arguments for operation: %s",
XSYMBOL (operation)->name->data);
target = args[XINT (target_idx) + 1];
if (!(STRINGP (target)
|| (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
- error ("Invalid %dth argument", XINT (target_idx) + 1);
+ error ("Invalid argument %d", XINT (target_idx) + 1);
chain = ((EQ (operation, Qinsert_file_contents)
|| EQ (operation, Qwrite_region))
Vcoding_system_alist = Qnil;
DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
- "List of coding-categories (symbols) ordered by priority.");
+ "List of coding-categories (symbols) ordered by priority.\n\
+\n\
+On detecting a coding system, Emacs tries code detection algorithms\n\
+associated with each coding-category one by one in this order. When\n\
+one algorithm agrees with a byte sequence of source text, the coding\n\
+system bound to the corresponding coding-category is selected.");
{
int i;
DEFVAR_LISP ("standard-translation-table-for-encode",
&Vstandard_translation_table_for_encode,
- "Table for translationg characters while encoding.");
+ "Table for translating characters while encoding.");
Vstandard_translation_table_for_encode = Qnil;
DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
"Alist of charsets vs revision numbers.\n\
While encoding, if a charset (car part of an element) is found,\n\
-designate it with the escape sequence identifing revision (cdr part of the element).");
+designate it with the escape sequence identifying revision (cdr part of the element).");
Vcharset_revision_alist = Qnil;
DEFVAR_LISP ("default-process-coding-system",