From 3c7649c1859d6252444044fd64c7b27d8e487f68 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sat, 24 Sep 2011 18:22:30 -0700
Subject: [PATCH] * charset.c: Integer overflow fixes.

Don't rely on undefined behavior with signed left shift overflow.
Don't assume unsigned int fits into fixnum, or that fixnum fits
into unsigned int.  Don't require max_code to be a valid fixnum;
that's not true for gb18030 4-byte on a 32-bit host.  Allow
invalid_code to be a cons, for the same reason.  Require code_offset
to be a character.  Avoid int overflow if max_char is close
to INT_MAX.
(CODE_POINT_TO_INDEX): On 32-bit hosts, return int, not unsigned;
this is intended anyway and avoids some undefined behavior.
(load_charset_map): Pass unsigned, not int, as 2nd arg of
INDEX_TO_CODE_POINT, as that's what it expects.
(Funify_charset, encode_char): Don't stuff unsigned vals into int vars.
---
 src/ChangeLog | 16 ++++++++++++--
 src/charset.c | 61 ++++++++++++++++++++++++++-------------------------
 2 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index f67d1b72bf2..7973cc277e2 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,4 +1,4 @@
-2011-09-21  Paul Eggert  <eggert@cs.ucla.edu>
+2011-09-25  Paul Eggert  <eggert@cs.ucla.edu>
 
 	* alloc.c (pure_bytes_used_lisp, pure_bytes_used_non_lisp):
 	(allocate_vectorlike, buffer_memory_full, struct sdata, SDATA_SIZE)
@@ -103,10 +103,22 @@
 	Use ptrdiff_t, not int, to avoid needless 32-bit limit on 64-bit hosts.
 	(load_charset_map_from_file): Redo idx calculation to avoid overflow.
 	(load_charset_map_from_vector, Fdefine_charset_internal):
-	Don't assume fixnum fits in int or unsigned int.
+	Don't assume fixnum fits in int.
 	(load_charset_map_from_vector, Fmap_charset_chars):
 	Remove now-unnecessary CHECK_NATNUMs.
 	(Fdefine_charset_internal): Check ranges here, more carefully.
+	Don't rely on undefined behavior with signed left shift overflow.
+	Don't assume unsigned int fits into fixnum, or that fixnum fits
+	into unsigned int.  Don't require max_code to be a valid fixnum;
+	that's not true for gb18030 4-byte on a 32-bit host.  Allow
+	invalid_code to be a cons, for the same reason.  Require code_offset
+	to be a character.  Avoid int overflow if max_char is close
+	to INT_MAX.
+	(CODE_POINT_TO_INDEX): On 32-bit hosts, return int, not unsigned;
+	this is intended anyway and avoids some undefined behavior.
+	(load_charset_map): Pass unsigned, not int, as 2nd arg of
+	INDEX_TO_CODE_POINT, as that's what it expects.
+	(Funify_charset, encode_char): Don't stuff unsigned vals into int vars.
 	* chartab.c (Fmake_char_table, Fset_char_table_range)
 	(uniprop_get_decoder, uniprop_get_encoder):
 	Don't assume fixnum fits in int.
diff --git a/src/charset.c b/src/charset.c
index 9d58d29d05c..2451c55e92a 100644
--- a/src/charset.c
+++ b/src/charset.c
@@ -118,24 +118,25 @@ int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
 
 #define CODE_POINT_TO_INDEX(charset, code)				\
   ((charset)->code_linear_p						\
-   ? (code) - (charset)->min_code					\
+   ? (int) ((code) - (charset)->min_code)				\
    : (((charset)->code_space_mask[(code) >> 24] & 0x8)			\
       && ((charset)->code_space_mask[((code) >> 16) & 0xFF] & 0x4)	\
       && ((charset)->code_space_mask[((code) >> 8) & 0xFF] & 0x2)	\
       && ((charset)->code_space_mask[(code) & 0xFF] & 0x1))		\
-   ? (((((code) >> 24) - (charset)->code_space[12])			\
-       * (charset)->code_space[11])					\
-      + (((((code) >> 16) & 0xFF) - (charset)->code_space[8])		\
-	 * (charset)->code_space[7])					\
-      + (((((code) >> 8) & 0xFF) - (charset)->code_space[4])		\
-	 * (charset)->code_space[3])					\
-      + (((code) & 0xFF) - (charset)->code_space[0])			\
-      - ((charset)->char_index_offset))					\
+   ? (int) (((((code) >> 24) - (charset)->code_space[12])		\
+	     * (charset)->code_space[11])				\
+	    + (((((code) >> 16) & 0xFF) - (charset)->code_space[8])	\
+	       * (charset)->code_space[7])				\
+	    + (((((code) >> 8) & 0xFF) - (charset)->code_space[4])	\
+	       * (charset)->code_space[3])				\
+	    + (((code) & 0xFF) - (charset)->code_space[0])		\
+	    - ((charset)->char_index_offset))				\
    : -1)
 
 
-/* Convert the character index IDX to code-point CODE for CHARSET.
-   It is assumed that IDX is in a valid range.  */
+/* Return the code-point for the character index IDX in CHARSET.
+   IDX should be an unsigned int variable in a valid range (which is
+   always in nonnegative int range too).  IDX contains garbage afterwards.  */
 
 #define INDEX_TO_CODE_POINT(charset, idx)				     \
   ((charset)->code_linear_p						     \
@@ -363,7 +364,8 @@ load_charset_map (struct charset *charset, struct charset_map_entries *entries,
 	      && CHARSET_COMPACT_CODES_P (charset))
 	    for (; from_index < lim_index; from_index++, from_c++)
 	      {
-		unsigned code = INDEX_TO_CODE_POINT (charset, from_index);
+		unsigned code = from_index;
+		code = INDEX_TO_CODE_POINT (charset, code);
 
 		if (NILP (CHAR_TABLE_REF (table, from_c)))
 		  CHAR_TABLE_SET (table, from_c, make_number (code));
@@ -923,11 +925,11 @@ usage: (define-charset-internal ...)  */)
   charset.min_code = (charset.code_space[0]
 		      | (charset.code_space[4] << 8)
 		      | (charset.code_space[8] << 16)
-		      | (charset.code_space[12] << 24));
+		      | ((unsigned) charset.code_space[12] << 24));
   charset.max_code = (charset.code_space[1]
 		      | (charset.code_space[5] << 8)
 		      | (charset.code_space[9] << 16)
-		      | (charset.code_space[13] << 24));
+		      | ((unsigned) charset.code_space[13] << 24));
   charset.char_index_offset = 0;
 
   val = args[charset_arg_min_code];
@@ -937,8 +939,8 @@ usage: (define-charset-internal ...)  */)
 
       if (code < charset.min_code
 	  || code > charset.max_code)
-	args_out_of_range_3 (make_number (charset.min_code),
-			     make_number (charset.max_code), val);
+	args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
+			     make_fixnum_or_float (charset.max_code), val);
       charset.char_index_offset = CODE_POINT_TO_INDEX (&charset, code);
       charset.min_code = code;
     }
@@ -950,8 +952,8 @@ usage: (define-charset-internal ...)  */)
 
       if (code < charset.min_code
 	  || code > charset.max_code)
-	args_out_of_range_3 (make_number (charset.min_code),
-			     make_number (charset.max_code), val);
+	args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
+			     make_fixnum_or_float (charset.max_code), val);
       charset.max_code = code;
     }
 
@@ -964,14 +966,14 @@ usage: (define-charset-internal ...)  */)
 	charset.invalid_code = 0;
       else
 	{
-	  if (charset.max_code < min (UINT_MAX, MOST_POSITIVE_FIXNUM))
+	  if (charset.max_code < UINT_MAX)
 	    charset.invalid_code = charset.max_code + 1;
 	  else
 	    error ("Attribute :invalid-code must be specified");
 	}
     }
   else
-    charset.invalid_code = XFASTINT (val);
+    charset.invalid_code = cons_to_unsigned (val, UINT_MAX);
 
   val = args[charset_arg_iso_final];
   if (NILP (val))
@@ -1015,17 +1017,17 @@ usage: (define-charset-internal ...)  */)
   if (! NILP (args[charset_arg_code_offset]))
     {
       val = args[charset_arg_code_offset];
-      CHECK_TYPE_RANGED_INTEGER (int, val);
+      CHECK_CHARACTER (val);
 
       charset.method = CHARSET_METHOD_OFFSET;
       charset.code_offset = XINT (val);
 
-      i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
-      charset.min_char = i + charset.code_offset;
       i = CODE_POINT_TO_INDEX (&charset, charset.max_code);
-      charset.max_char = i + charset.code_offset;
-      if (charset.max_char > MAX_CHAR)
+      if (MAX_CHAR - charset.code_offset < i)
 	error ("Unsupported max char: %d", charset.max_char);
+      charset.max_char = i + charset.code_offset;
+      i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
+      charset.min_char = i + charset.code_offset;
 
       i = (charset.min_char >> 7) << 7;
       for (; i < 0x10000 && i <= charset.max_char; i += 128)
@@ -1385,8 +1387,8 @@ Optional third argument DEUNIFY, if non-nil, means to de-unify CHARSET.  */)
     }
   else if (CHAR_TABLE_P (Vchar_unify_table))
     {
-      int min_code = CHARSET_MIN_CODE (cs);
-      int max_code = CHARSET_MAX_CODE (cs);
+      unsigned min_code = CHARSET_MIN_CODE (cs);
+      unsigned max_code = CHARSET_MAX_CODE (cs);
       int min_char = DECODE_CHAR (cs, min_code);
       int max_char = DECODE_CHAR (cs, max_code);
 
@@ -1830,9 +1832,8 @@ encode_char (struct charset *charset, int c)
     }
   else				/* method == CHARSET_METHOD_OFFSET */
     {
-      int code_index = c - CHARSET_CODE_OFFSET (charset);
-
-      code = INDEX_TO_CODE_POINT (charset, code_index);
+      code = c - CHARSET_CODE_OFFSET (charset);
+      code = INDEX_TO_CODE_POINT (charset, code);
     }
 
   return code;
-- 
2.39.5