int multibytep = coding->src_multibyte;
int consumed_chars = 0;
int found = 0;
+ int incomplete;
/* A coding system of this category is always ASCII compatible. */
src += coding->head_ascii;
{
int c, c1, c2, c3, c4;
+ incomplete = 0;
ONE_MORE_BYTE (c);
if (UTF_8_1_OCTET_P (c))
continue;
+ incomplete = 1;
ONE_MORE_BYTE (c1);
if (! UTF_8_EXTRA_OCTET_P (c1))
break;
return 0;
no_more_source:
- if (! found)
- return 0;
- *mask &= CATEGORY_MASK_UTF_8;
- return 1;
+ if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
+ {
+ *mask &= ~CATEGORY_MASK_UTF_8;
+ return 0;
+ }
+ return found;
}
int consumed_chars = 0;
int c1, c2;
+ *mask &= ~CATEGORY_MASK_UTF_16;
+
ONE_MORE_BYTE (c1);
ONE_MORE_BYTE (c2);
if ((c1 == 0xFF) && (c2 == 0xFE))
- {
- *mask &= CATEGORY_MASK_UTF_16_LE;
- return 1;
- }
+ *mask |= CATEGORY_MASK_UTF_16_LE;
else if ((c1 == 0xFE) && (c2 == 0xFF))
- {
- *mask &= CATEGORY_MASK_UTF_16_BE;
- return 1;
- }
+ *mask |= CATEGORY_MASK_UTF_16_BE;
+ else
+ *mask |= CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG;
+ return 1;
+
no_more_source:
return 0;
}
int consumed_chars = 0;
int c;
int found = 0;
+ int incomplete;
/* A coding system of this category is always ASCII compatible. */
src += coding->head_ascii;
while (1)
{
+ incomplete = 0;
ONE_MORE_BYTE (c);
+ incomplete = 1;
if (c == 0x80)
{
return 0;
no_more_source:
- if (!found)
- return 0;
- *mask &= CATEGORY_MASK_EMACS_MULE;
- return 1;
+ if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
+ {
+ *mask &= ~CATEGORY_MASK_EMACS_MULE;
+ return 0;
+ }
+ return found;
}
{
int newmask = CATEGORY_MASK_ISO_8_ELSE;
+ mask_8bit_found = 1;
if (inhibit_iso_escape_detection)
break;
if (c != ISO_CODE_CSI)
}
if (!mask_found)
return 0;
- *mask &= mask_iso & mask_found;
+ *mask &= ~CATEGORY_MASK_ISO;
+ *mask |= mask_iso & mask_found;
if (! mask_8bit_found)
*mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
return 1;
int consumed_chars = 0;
int found = 0;
int c;
+ int incomplete;
/* A coding system of this category is always ASCII compatible. */
src += coding->head_ascii;
while (1)
{
+ incomplete = 0;
ONE_MORE_BYTE (c);
+ incomplete = 1;
if (c < 0x80)
continue;
if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
return 0;
no_more_source:
- if (!found)
- return 0;
- *mask &= CATEGORY_MASK_SJIS;
- return 1;
+ if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
+ {
+ *mask &= ~CATEGORY_MASK_SJIS;
+ return 0;
+ }
+ return found;
}
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
int consumed_chars = 0;
int found = 0;
int c;
+ int incomplete;
/* A coding system of this category is always ASCII compatible. */
src += coding->head_ascii;
while (1)
{
+ incomplete = 0;
ONE_MORE_BYTE (c);
+ incomplete = 1;
if (c < 0x80)
continue;
if (c >= 0xA1)
return 0;
no_more_source:
- if (!found)
- return 0;
- *mask &= CATEGORY_MASK_BIG5;
- return 1;
+ if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
+ {
+ *mask &= ~CATEGORY_MASK_BIG5;
+ return 0;
+ }
+ return found;
}
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
val = charset_list;
charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
- charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
- charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
+ charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
+ charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
while (1)
{
charset = charset_kanji;
}
else
- /* SJIS -> JISX0201-Kana */
- charset = charset_kana;
+ {
+ /* SJIS -> JISX0201-Kana */
+ c &= 0x7F;
+ charset = charset_kana;
+ }
}
CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
}
return 0;
no_more_source:
- if (!found)
- return 0;
- *mask &= CATEGORY_MASK_CCL;
- return 1;
+ return found;
}
static void
return 0;
no_more_source:
- *mask &= CATEGORY_MASK_CHARSET;
return 1;
}
#define MAX_EOL_CHECK_COUNT 3
static int
-detect_eol (coding, source, src_bytes)
- struct coding_system *coding;
+detect_eol (source, src_bytes, category)
unsigned char *source;
EMACS_INT src_bytes;
+ enum coding_category category;
{
- Lisp_Object attrs, coding_type;
unsigned char *src = source, *src_end = src + src_bytes;
unsigned char c;
int total = 0;
int eol_seen = EOL_SEEN_NONE;
- attrs = CODING_ID_ATTRS (coding->id);
- coding_type = CODING_ATTR_TYPE (attrs);
-
- if (EQ (coding_type, Qccl))
+ if ((1 << category) & CATEGORY_MASK_UTF_16)
{
int msb, lsb;
- msb = coding->spec.utf_16.endian == utf_16_little_endian;
+ msb = category == (coding_category_utf_16_le
+ | coding_category_utf_16_le_nosig);
lsb = 1 - msb;
while (src + 1 < src_end)
enum coding_category category = coding_priorities[i];
struct coding_system *this = coding_categories + category;
- if (category >= coding_category_raw_text
- || detected & (1 << category))
- continue;
-
if (this->id < 0)
{
/* No coding system of this category is defined. */
mask &= ~(1 << category);
}
+ else if (category >= coding_category_raw_text
+ || detected & (1 << category))
+ continue;
else
{
detected |= detected_mask[category];
- if ((*(this->detector)) (coding, &mask))
+ if ((*(this->detector)) (coding, &mask)
+ && (mask & (1 << category)))
break;
}
}
if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
&& ! EQ (coding_type, Qccl))
{
- int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
+ int eol_seen = detect_eol (coding->source, coding->src_bytes,
+ XINT (CODING_ATTR_CATEGORY (attrs)));
if (eol_seen != EOL_SEEN_NONE)
adjust_coding_eol_type (coding, eol_seen);
}
\f
+/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
+ HIGHEST is nonzero, return the coding system of the highest
+ priority among the detected coding systems. Otherwise return a
+ list of detected coding systems sorted by their priorities. If
+ MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
+ multibyte form but contain only ASCII and eight-bit chars.
+ Otherwise, the bytes are raw bytes.
+
+ CODING-SYSTEM controls the detection as below:
+
+ If it is nil, detect both text-format and eol-format. If the
+ text-format part of CODING-SYSTEM is already specified
+ (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
+ part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
+ detect only text-format. */
+
Lisp_Object
detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
unsigned char *src;
Lisp_Object attrs, eol_type;
Lisp_Object val;
struct coding_system coding;
+ int id;
if (NILP (coding_system))
coding_system = Qundecided;
setup_coding_system (coding_system, &coding);
attrs = CODING_ID_ATTRS (coding.id);
eol_type = CODING_ID_EOL_TYPE (coding.id);
+ coding_system = CODING_ATTR_BASE_NAME (attrs);
coding.source = src;
coding.src_bytes = src_bytes;
coding.src_multibyte = multibytep;
coding.consumed = 0;
+ coding.mode |= CODING_MODE_LAST_BLOCK;
- if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
- {
- mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
- }
- else
+ /* At first, detect text-format if necessary. */
+ if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
{
- coding_system = Qnil;
for (; src < src_end; src++)
{
c = *src;
- if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
- || c == ISO_CODE_SI
- || c == ISO_CODE_SO)))
+ if (c & 0x80
+ || (c < 0x20 && (c == ISO_CODE_ESC
+ || c == ISO_CODE_SI
+ || c == ISO_CODE_SO
+ /* Most UTF-16 text contains '\0'. */
+ || !c)))
break;
}
coding.head_ascii = src - coding.source;
enum coding_category category = coding_priorities[i];
struct coding_system *this = coding_categories + category;
- if (category >= coding_category_raw_text
- || detected & (1 << category))
- continue;
-
if (this->id < 0)
{
/* No coding system of this category is defined. */
mask &= ~(1 << category);
}
+ else if (category >= coding_category_raw_text
+ || detected & (1 << category))
+ continue;
else
{
detected |= detected_mask[category];
if ((*(coding_categories[category].detector)) (&coding, &mask)
- && highest)
+ && highest
+ && (mask & (1 << category)))
{
- mask &= detected_mask[category];
+ mask = 1 << category;
break;
}
}
}
- }
- if (!mask)
- val = Fcons (make_number (coding_category_raw_text), Qnil);
- else if (mask == CATEGORY_MASK_ANY)
- val = Fcons (make_number (coding_category_undecided), Qnil);
- else if (highest)
- {
- for (i = 0; i < coding_category_raw_text; i++)
- if (mask & (1 << coding_priorities[i]))
- {
- val = Fcons (make_number (coding_priorities[i]), Qnil);
- break;
- }
- }
+ if (!mask)
+ {
+ id = coding_categories[coding_category_raw_text].id;
+ val = Fcons (make_number (id), Qnil);
+ }
+ else if (mask == CATEGORY_MASK_ANY)
+ {
+ id = coding_categories[coding_category_undecided].id;
+ val = Fcons (make_number (id), Qnil);
+ }
+ else if (highest)
+ {
+ for (i = 0; i < coding_category_raw_text; i++)
+ if (mask & (1 << coding_priorities[i]))
+ {
+ id = coding_categories[coding_priorities[i]].id;
+ val = Fcons (make_number (id), Qnil);
+ break;
+ }
+ }
+ else
+ {
+ val = Qnil;
+ for (i = coding_category_raw_text - 1; i >= 0; i--)
+ if (mask & (1 << coding_priorities[i]))
+ {
+ id = coding_categories[coding_priorities[i]].id;
+ val = Fcons (make_number (id), val);
+ }
+ }
+ }
else
{
- val = Qnil;
- for (i = coding_category_raw_text - 1; i >= 0; i--)
- if (mask & (1 << coding_priorities[i]))
- val = Fcons (make_number (coding_priorities[i]), val);
+ mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
+ val = Fcons (make_number (coding.id), Qnil);
}
+ /* Then, detect eol-format if necessary. */
{
- int one_byte_eol = -1, two_byte_eol = -1;
+ int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
Lisp_Object tail;
+ if (VECTORP (eol_type))
+ {
+ if (mask & ~CATEGORY_MASK_UTF_16)
+ normal_eol = detect_eol (coding.source, src_bytes,
+ coding_category_raw_text);
+ if (mask & (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_BE_NOSIG))
+ utf_16_be_eol = detect_eol (coding.source, src_bytes,
+ coding_category_utf_16_be);
+ if (mask & (CATEGORY_MASK_UTF_16_LE | CATEGORY_MASK_UTF_16_LE_NOSIG))
+ utf_16_le_eol = detect_eol (coding.source, src_bytes,
+ coding_category_utf_16_le);
+ }
+ else
+ {
+ if (EQ (eol_type, Qunix))
+ normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
+ else if (EQ (eol_type, Qdos))
+ normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
+ else
+ normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
+ }
+
for (tail = val; CONSP (tail); tail = XCDR (tail))
{
- struct coding_system *this
- = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
- : &coding);
+ enum coding_category category;
int this_eol;
-
- attrs = CODING_ID_ATTRS (this->id);
- eol_type = CODING_ID_EOL_TYPE (this->id);
- XSETCAR (tail, CODING_ID_NAME (this->id));
+
+ id = XINT (XCAR (tail));
+ attrs = CODING_ID_ATTRS (id);
+ category = XINT (CODING_ATTR_CATEGORY (attrs));
+ eol_type = CODING_ID_EOL_TYPE (id);
if (VECTORP (eol_type))
{
- if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
- {
- if (two_byte_eol < 0)
- two_byte_eol = detect_eol (this, coding.source, src_bytes);
- this_eol = two_byte_eol;
- }
+ if (category == coding_category_utf_16_be
+ || category == coding_category_utf_16_be_nosig)
+ this_eol = utf_16_be_eol;
+ else if (category == coding_category_utf_16_le
+ || category == coding_category_utf_16_le_nosig)
+ this_eol = utf_16_le_eol;
else
- {
- if (one_byte_eol < 0)
- one_byte_eol =detect_eol (this, coding.source, src_bytes);
- this_eol = one_byte_eol;
- }
+ this_eol = normal_eol;
+
if (this_eol == EOL_SEEN_LF)
XSETCAR (tail, AREF (eol_type, 0));
else if (this_eol == EOL_SEEN_CRLF)
XSETCAR (tail, AREF (eol_type, 1));
else if (this_eol == EOL_SEEN_CR)
XSETCAR (tail, AREF (eol_type, 2));
+ else
+ XSETCAR (tail, CODING_ID_NAME (id));
}
+ else
+ XSETCAR (tail, CODING_ID_NAME (id));
}
}