From: Kenichi Handa <handa@m17n.org>
Date: Wed, 14 Jan 2009 12:19:44 +0000 (+0000)
Subject: (TWO_MORE_BYTES): New macro.
X-Git-Tag: emacs-pretest-23.0.90~466
X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=f56a4450912fa06401b13e6631313fe17bed006f;p=emacs.git

(TWO_MORE_BYTES): New macro.
(detect_coding_utf_16): Use TWO_MORE_BYTES instead of
ONE_MORE_BYTE.
---

diff --git a/src/ChangeLog b/src/ChangeLog
index 5d048f7413b..4f11a1269cc 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,9 @@
+2009-01-14  Kenichi Handa  <handa@m17n.org>
+
+	* coding.c (TWO_MORE_BYTES): New macro.
+	(detect_coding_utf_16): Use TWO_MORE_BYTES instead of
+	ONE_MORE_BYTE.
+
 2009-01-13  Chong Yidong  <cyd@stupidchicken.com>
 
 	* font.c (font_clear_prop): If clearing the family, clear the font
@@ -90,7 +96,7 @@
 2009-01-07  Kenichi Handa  <handa@m17n.org>
 
 	* fileio.c (Finsert_file_contents): In the case of replace,
-	remeber the coding system used for decoding in
+	remember the coding system used for decoding in
 	coding_system (Bug#1039).
 
 	* coding.c (decode_coding_utf_8): Check byte_after_cr before
diff --git a/src/coding.c b/src/coding.c
index 01878a37b5c..9a94bc6fb2a 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -743,6 +743,47 @@ static struct coding_system coding_categories[coding_category_max];
     consumed_chars++;					\
   } while (0)
 
+/* Safely get two bytes from the source text pointed to by SRC, which
+   ends at SRC_END, and set C1 and C2 to those bytes.  If no byte is
+   left for C1, jump to `no_more_source'.  If no byte is left for C2,
+   set C2 to -1.  If multibytep is nonzero and a byte has its high bit
+   set, a two-byte sequence led by 0xC0 or 0xC1 is folded into one
+   value (its trailing byte is read without a bounds check); any other
+   such byte yields -1 (for C1, both C1 and C2 become -1).  The caller
+   must declare and set these variables in advance:
+	src, src_end, multibytep
+   It is intended that this macro is used in detect_coding_utf_16.  */
+
+#define TWO_MORE_BYTES(c1, c2)			\
+  do {						\
+    if (src == src_end)				\
+      goto no_more_source;			\
+    c1 = *src++;				\
+    if (multibytep && (c1 & 0x80))		\
+      {						\
+	if ((c1 & 0xFE) == 0xC0)		\
+	  c1 = ((c1 & 1) << 6) | *src++;	\
+	else					\
+	  {					\
+	    c1 = c2 = -1;			\
+	    break;				\
+	  }					\
+      }						\
+    if (src == src_end)				\
+      c2 = -1;					\
+    else					\
+      {						\
+	c2 = *src++;				\
+	if (multibytep && (c2 & 0x80))		\
+	  {					\
+	    if ((c2 & 0xFE) == 0xC0)		\
+	      c2 = ((c2 & 1) << 6) | *src++;	\
+	    else				\
+	      c2 = -1;				\
+	  }					\
+      }						\
+  } while (0)
+
 
 #define ONE_MORE_BYTE_NO_CHECK(c)			\
   do {							\
@@ -1575,8 +1616,7 @@ detect_coding_utf_16 (coding, detect_info)
       return 0;
     }
 
-  ONE_MORE_BYTE (c1);
-  ONE_MORE_BYTE (c2);
+  TWO_MORE_BYTES (c1, c2);
   if ((c1 == 0xFF) && (c2 == 0xFE))
     {
       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
@@ -1593,6 +1633,11 @@ detect_coding_utf_16 (coding, detect_info)
 				| CATEGORY_MASK_UTF_16_BE_NOSIG
 				| CATEGORY_MASK_UTF_16_LE_NOSIG);
     }
+  else if (c1 < 0 || c2 < 0)
+    {
+      detect_info->rejected |= CATEGORY_MASK_UTF_16;
+      return 0;
+    }
   else
     {
       /* We check the dispersion of Eth and Oth bytes where E is even and
@@ -1610,8 +1655,9 @@ detect_coding_utf_16 (coding, detect_info)
 
       while (1)
 	{
-	  ONE_MORE_BYTE (c1);
-	  ONE_MORE_BYTE (c2);
+	  TWO_MORE_BYTES (c1, c2);
+	  if (c1 < 0 || c2 < 0)
+	    break;
 	  if (! e[c1])
 	    {
 	      e[c1] = 1;