(detect_coding_iso2022): New arg

author Kenichi Handa <handa@m17n.org>

Sat, 19 Jan 2008 05:55:50 +0000 (05:55 +0000)

committer Kenichi Handa <handa@m17n.org>

Sat, 19 Jan 2008 05:55:50 +0000 (05:55 +0000)
author Kenichi Handa <handa@m17n.org>
Sat, 19 Jan 2008 05:55:50 +0000 (05:55 +0000)
committer Kenichi Handa <handa@m17n.org>
Sat, 19 Jan 2008 05:55:50 +0000 (05:55 +0000)
diff --git a/src/ChangeLog b/src/ChangeLog

index b28c1d9ac05fe98dd9871e58ab2a3cd097bdd547..034315cf7cc39542735206ec9398c0465db54a9a 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,13 @@
+2008-01-19  Kenichi Handa  <handa@m17n.org>
+
+       * coding.c (detect_coding_iso2022): New arg
+       latin_extra_code_state.  Allow Latin extra codes only
+       when *latin_extra_code_state is nonzero.
+       (detect_coding_mask): If there is a NULL byte, detect the encoding
+       as UTF-16 or binary.  If Latin extra codes exist, detect the
+       encoding as ISO-2022 only when there's no other proper encoding is
+       found.
+
  2008-01-17  Jason Rumney  <jasonr@gnu.org>
  
         * xterm.c (handle_one_xevent): Revert to counting chars not bytes.
diff --git a/src/coding.c b/src/coding.c

index 716eb98a87faed70122e48a1ff3090458331625e..dd201ae61d636ab385fb2ed09b0053f9981a89a0 100644 (file)
--- a/src/coding.c
+++ b/src/coding.c
@@ -1410,12 +1410,17 @@ enum iso_code_class_type iso_code_class[256];
         CODING_CATEGORY_MASK_ISO_7_ELSE
         CODING_CATEGORY_MASK_ISO_8_ELSE
     are set.  If a code which should never appear in ISO2022 is found,
-   returns 0.  */
+   returns 0.
+
+   If *latin_extra_code_state is zero and Latin extra codes are found,
+   set *latin_extra_code_state to 1 and return 0.  If it is nonzero,
+   accept Latin extra codes.  */
  
  static int
-detect_coding_iso2022 (src, src_end, multibytep)
+detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
       unsigned char *src, *src_end;
       int multibytep;
+     int *latin_extra_code_state;
  {
    int mask = CODING_CATEGORY_MASK_ISO;
    int mask_found = 0;
@@ -1578,6 +1583,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
             if (VECTORP (Vlatin_extra_code_table)
                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
               {
+               if (! *latin_extra_code_state)
+                 {
+                   *latin_extra_code_state = 1;
+                   return 0;
+                 }
                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
                     & CODING_FLAG_ISO_LATIN_EXTRA)
                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -1604,6 +1614,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
                 {
                   int newmask = 0;
  
+                 if (! *latin_extra_code_state)
+                   {
+                     *latin_extra_code_state = 1;
+                     return 0;
+                   }
                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
                       & CODING_FLAG_ISO_LATIN_EXTRA)
                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -4131,6 +4146,8 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
    unsigned char *src = source, *src_end = source + src_bytes;
    unsigned int mask, utf16_examined_p, iso2022_examined_p;
    int i;
+  int null_byte_found;
+  int latin_extra_code_state = 1;
  
    /* At first, skip all ASCII characters and control characters except
       for three ISO2022 specific control characters.  */
@@ -4139,21 +4156,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
    ascii_skip_code[ISO_CODE_ESC] = 0;
  
   label_loop_detect_coding:
-  while (src < src_end && ascii_skip_code[*src]) src++;
+  null_byte_found = 0;
+  /* We stop this loop before the last byte because it may be a NULL
+     anchor byte.  */
+  while (src < src_end - 1 && ascii_skip_code[*src])
+    null_byte_found |= (! *src++);
+  if (ascii_skip_code[*src])
+    src++;
+  else if (! null_byte_found)
+    {
+      unsigned char *p = src + 1;
+      while (p < src_end - 1)
+       null_byte_found |= (! *p++);
+    }
    *skip = src - source;
  
    if (src >= src_end)
-    /* We found nothing other than ASCII.  There's nothing to do.  */
+    /* We found nothing other than ASCII (and NULL byte).  There's
+       nothing to do.  */
      return 0;
  
    c = *src;
    /* The text seems to be encoded in some multilingual coding system.
       Now, try to find in which coding system the text is encoded.  */
-  if (c < 0x80)
+  if (! null_byte_found && c < 0x80)
      {
        /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
        /* C is an ISO2022 specific control code of C0.  */
-      mask = detect_coding_iso2022 (src, src_end, multibytep);
+      latin_extra_code_state = 1;
+      mask = detect_coding_iso2022 (src, src_end, multibytep,
+                                   &latin_extra_code_state);
        if (mask == 0)
         {
           /* No valid ISO2022 code follows C.  Try again.  */
@@ -4181,21 +4213,27 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
        if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
         c = src[1] - 0x20;
  
-      if (c < 0xA0)
+      if (null_byte_found)
+       {
+         try = (CODING_CATEGORY_MASK_UTF_16_BE
+                | CODING_CATEGORY_MASK_UTF_16_LE);
+       }
+      else if (c < 0xA0)
         {
           /* C is the first byte of SJIS character code,
              or a leading-code of Emacs' internal format (emacs-mule),
              or the first byte of UTF-16.  */
           try = (CODING_CATEGORY_MASK_SJIS
-                 | CODING_CATEGORY_MASK_EMACS_MULE
-                 | CODING_CATEGORY_MASK_UTF_16_BE
-                 | CODING_CATEGORY_MASK_UTF_16_LE);
+                | CODING_CATEGORY_MASK_EMACS_MULE
+                | CODING_CATEGORY_MASK_UTF_16_BE
+                | CODING_CATEGORY_MASK_UTF_16_LE);
  
           /* Or, if C is a special latin extra code,
              or is an ISO2022 specific control code of C1 (SS2 or SS3),
              or is an ISO2022 control-sequence-introducer (CSI),
              we should also consider the possibility of ISO2022 codings.  */
-         if ((VECTORP (Vlatin_extra_code_table)
+         if ((latin_extra_code_state
+              && VECTORP (Vlatin_extra_code_table)
                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
               || (c == ISO_CODE_CSI
@@ -4205,7 +4243,7 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
                               && src + 1 < src_end
                               && src[1] == ']')))))
             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
-                    | CODING_CATEGORY_MASK_ISO_8BIT);
+                   | CODING_CATEGORY_MASK_ISO_8BIT);
         }
        else
         /* C is a character of ISO2022 in graphic plane right,
@@ -4213,29 +4251,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
            or the first byte of BIG5's 2-byte code,
            or the first byte of UTF-8/16.  */
         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
-               | CODING_CATEGORY_MASK_ISO_8BIT
-               | CODING_CATEGORY_MASK_SJIS
-               | CODING_CATEGORY_MASK_BIG5
-               | CODING_CATEGORY_MASK_UTF_8
-               | CODING_CATEGORY_MASK_UTF_16_BE
-               | CODING_CATEGORY_MASK_UTF_16_LE);
+              | CODING_CATEGORY_MASK_ISO_8BIT
+              | CODING_CATEGORY_MASK_SJIS
+              | CODING_CATEGORY_MASK_BIG5
+              | CODING_CATEGORY_MASK_UTF_8
+              | CODING_CATEGORY_MASK_UTF_16_BE
+              | CODING_CATEGORY_MASK_UTF_16_LE);
  
        /* Or, we may have to consider the possibility of CCL.  */
-      if (coding_system_table[CODING_CATEGORY_IDX_CCL]
+      if (! null_byte_found
+         && coding_system_table[CODING_CATEGORY_IDX_CCL]
           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
               ->spec.ccl.valid_codes)[c])
         try |= CODING_CATEGORY_MASK_CCL;
  
        mask = 0;
-      utf16_examined_p = iso2022_examined_p = 0;
        if (priorities)
         {
+         /* At first try detection with Latin extra codes not-allowed.
+            If no proper coding system is found because of Latin extra
+            codes, try detection with Latin extra codes allowed.  */
+         latin_extra_code_state = 0;
+       label_retry:
+         utf16_examined_p = iso2022_examined_p = 0;
           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
             {
               if (!iso2022_examined_p
                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
                 {
-                 mask |= detect_coding_iso2022 (src, src_end, multibytep);
+                 mask |= detect_coding_iso2022 (src, src_end, multibytep,
+                                                &latin_extra_code_state);
                   iso2022_examined_p = 1;
                 }
               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
@@ -4256,16 +4301,40 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
                 mask |= detect_coding_ccl (src, src_end, multibytep);
               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
-               mask |= CODING_CATEGORY_MASK_RAW_TEXT;
+               {
+                 if (latin_extra_code_state == 1)
+                   {
+                     /* Detection of ISO-2022 based coding system
+                        failed because of Latin extra codes.  Before
+                        falling back to raw-text, try again with
+                        Latin extra codes allowed.  */
+                     latin_extra_code_state = 2;
+                     try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
+                            | CODING_CATEGORY_MASK_ISO_8BIT);
+                     goto label_retry;
+                   }
+                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
+               }
               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
-               mask |= CODING_CATEGORY_MASK_BINARY;
+               {
+                 if (latin_extra_code_state == 1)
+                   {
+                     /* See the above comment.  */
+                     latin_extra_code_state = 2;
+                     try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
+                            | CODING_CATEGORY_MASK_ISO_8BIT);
+                     goto label_retry;
+                   }
+                 mask |= CODING_CATEGORY_MASK_BINARY;
+               }
               if (mask & priorities[i])
                 return priorities[i];
             }
           return CODING_CATEGORY_MASK_RAW_TEXT;
         }
        if (try & CODING_CATEGORY_MASK_ISO)
-       mask |= detect_coding_iso2022 (src, src_end, multibytep);
+       mask |= detect_coding_iso2022 (src, src_end, multibytep,
+                                      &latin_extra_code_state);
        if (try & CODING_CATEGORY_MASK_SJIS)
         mask |= detect_coding_sjis (src, src_end, multibytep);
        if (try & CODING_CATEGORY_MASK_BIG5)
author	Kenichi Handa <handa@m17n.org>
	Sat, 19 Jan 2008 05:55:50 +0000 (05:55 +0000)
committer	Kenichi Handa <handa@m17n.org>
	Sat, 19 Jan 2008 05:55:50 +0000 (05:55 +0000)
src/ChangeLog		patch \| blob \| history
src/coding.c		patch \| blob \| history