* charset.c: Integer overflow fixes.

author Paul Eggert <eggert@cs.ucla.edu>

Sun, 25 Sep 2011 01:22:30 +0000 (18:22 -0700)

committer Paul Eggert <eggert@cs.ucla.edu>

Sun, 25 Sep 2011 01:22:30 +0000 (18:22 -0700)
author Paul Eggert <eggert@cs.ucla.edu>
Sun, 25 Sep 2011 01:22:30 +0000 (18:22 -0700)
committer Paul Eggert <eggert@cs.ucla.edu>
Sun, 25 Sep 2011 01:22:30 +0000 (18:22 -0700)
diff --git a/src/ChangeLog b/src/ChangeLog

index f67d1b72bf2e1dc1f21af3ba5906faaabfffffd2..7973cc277e2d37c97f5353cce39333b61c3ebcb5 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,4 +1,4 @@
-2011-09-21  Paul Eggert  <eggert@cs.ucla.edu>
+2011-09-25  Paul Eggert  <eggert@cs.ucla.edu>
  
         * alloc.c (pure_bytes_used_lisp, pure_bytes_used_non_lisp):
         (allocate_vectorlike, buffer_memory_full, struct sdata, SDATA_SIZE)
@@ -103,10 +103,22 @@
         Use ptrdiff_t, not int, to avoid needless 32-bit limit on 64-bit hosts.
         (load_charset_map_from_file): Redo idx calculation to avoid overflow.
         (load_charset_map_from_vector, Fdefine_charset_internal):
-       Don't assume fixnum fits in int or unsigned int.
+       Don't assume fixnum fits in int.
         (load_charset_map_from_vector, Fmap_charset_chars):
         Remove now-unnecessary CHECK_NATNUMs.
         (Fdefine_charset_internal): Check ranges here, more carefully.
+       Don't rely on undefined behavior with signed left shift overflow.
+       Don't assume unsigned int fits into fixnum, or that fixnum fits
+       into unsigned int.  Don't require max_code to be a valid fixnum;
+       that's not true for gb18030 4-byte on a 32-bit host.  Allow
+       invalid_code to be a cons, for the same reason.  Require code_offset
+       to be a character.  Avoid int overflow if max_char is close
+       to INT_MAX.
+       (CODE_POINT_TO_INDEX): On 32-bit hosts, return int, not unsigned;
+       this is intended anyway and avoids some undefined behavior.
+       (load_charset_map): Pass unsigned, not int, as 2nd arg of
+       INDEX_TO_CODE_POINT, as that's what it expects.
+       (Funify_charset, encode_char): Don't stuff unsigned vals into int vars.
         * chartab.c (Fmake_char_table, Fset_char_table_range)
         (uniprop_get_decoder, uniprop_get_encoder):
         Don't assume fixnum fits in int.
diff --git a/src/charset.c b/src/charset.c

index 9d58d29d05c14171f7eedc2825b799dd7c9cd3c7..2451c55e92ada161d078000a3bfb89a169150b9c 100644 (file)
--- a/src/charset.c
+++ b/src/charset.c
@@ -118,24 +118,25 @@ int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
  
  #define CODE_POINT_TO_INDEX(charset, code)                             \
    ((charset)->code_linear_p                                            \
-   ? (code) - (charset)->min_code                                      \
+   ? (int) ((code) - (charset)->min_code)                              \
     : (((charset)->code_space_mask[(code) >> 24] & 0x8)                 \
        && ((charset)->code_space_mask[((code) >> 16) & 0xFF] & 0x4)     \
        && ((charset)->code_space_mask[((code) >> 8) & 0xFF] & 0x2)      \
        && ((charset)->code_space_mask[(code) & 0xFF] & 0x1))            \
-   ? (((((code) >> 24) - (charset)->code_space[12])                    \
-       * (charset)->code_space[11])                                    \
-      + (((((code) >> 16) & 0xFF) - (charset)->code_space[8])          \
-        * (charset)->code_space[7])                                    \
-      + (((((code) >> 8) & 0xFF) - (charset)->code_space[4])           \
-        * (charset)->code_space[3])                                    \
-      + (((code) & 0xFF) - (charset)->code_space[0])                   \
-      - ((charset)->char_index_offset))                                        \
+   ? (int) (((((code) >> 24) - (charset)->code_space[12])              \
+            * (charset)->code_space[11])                               \
+           + (((((code) >> 16) & 0xFF) - (charset)->code_space[8])     \
+              * (charset)->code_space[7])                              \
+           + (((((code) >> 8) & 0xFF) - (charset)->code_space[4])      \
+              * (charset)->code_space[3])                              \
+           + (((code) & 0xFF) - (charset)->code_space[0])              \
+           - ((charset)->char_index_offset))                           \
     : -1)
  
  
-/* Convert the character index IDX to code-point CODE for CHARSET.
-   It is assumed that IDX is in a valid range.  */
+/* Return the code-point for the character index IDX in CHARSET.
+   IDX should be an unsigned int variable in a valid range (which is
+   always in nonnegative int range too).  IDX contains garbage afterwards.  */
  
  #define INDEX_TO_CODE_POINT(charset, idx)                                   \
    ((charset)->code_linear_p                                                 \
@@ -363,7 +364,8 @@ load_charset_map (struct charset *charset, struct charset_map_entries *entries,
               && CHARSET_COMPACT_CODES_P (charset))
             for (; from_index < lim_index; from_index++, from_c++)
               {
-               unsigned code = INDEX_TO_CODE_POINT (charset, from_index);
+               unsigned code = from_index;
+               code = INDEX_TO_CODE_POINT (charset, code);
  
                 if (NILP (CHAR_TABLE_REF (table, from_c)))
                   CHAR_TABLE_SET (table, from_c, make_number (code));
@@ -923,11 +925,11 @@ usage: (define-charset-internal ...)  */)
    charset.min_code = (charset.code_space[0]
                       | (charset.code_space[4] << 8)
                       | (charset.code_space[8] << 16)
-                     | (charset.code_space[12] << 24));
+                     | ((unsigned) charset.code_space[12] << 24));
    charset.max_code = (charset.code_space[1]
                       | (charset.code_space[5] << 8)
                       | (charset.code_space[9] << 16)
-                     | (charset.code_space[13] << 24));
+                     | ((unsigned) charset.code_space[13] << 24));
    charset.char_index_offset = 0;
  
    val = args[charset_arg_min_code];
@@ -937,8 +939,8 @@ usage: (define-charset-internal ...)  */)
  
        if (code < charset.min_code
           || code > charset.max_code)
-       args_out_of_range_3 (make_number (charset.min_code),
-                            make_number (charset.max_code), val);
+       args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
+                            make_fixnum_or_float (charset.max_code), val);
        charset.char_index_offset = CODE_POINT_TO_INDEX (&charset, code);
        charset.min_code = code;
      }
@@ -950,8 +952,8 @@ usage: (define-charset-internal ...)  */)
  
        if (code < charset.min_code
           || code > charset.max_code)
-       args_out_of_range_3 (make_number (charset.min_code),
-                            make_number (charset.max_code), val);
+       args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
+                            make_fixnum_or_float (charset.max_code), val);
        charset.max_code = code;
      }
  
@@ -964,14 +966,14 @@ usage: (define-charset-internal ...)  */)
         charset.invalid_code = 0;
        else
         {
-         if (charset.max_code < min (UINT_MAX, MOST_POSITIVE_FIXNUM))
+         if (charset.max_code < UINT_MAX)
             charset.invalid_code = charset.max_code + 1;
           else
             error ("Attribute :invalid-code must be specified");
         }
      }
    else
-    charset.invalid_code = XFASTINT (val);
+    charset.invalid_code = cons_to_unsigned (val, UINT_MAX);
  
    val = args[charset_arg_iso_final];
    if (NILP (val))
@@ -1015,17 +1017,17 @@ usage: (define-charset-internal ...)  */)
    if (! NILP (args[charset_arg_code_offset]))
      {
        val = args[charset_arg_code_offset];
-      CHECK_TYPE_RANGED_INTEGER (int, val);
+      CHECK_CHARACTER (val);
  
        charset.method = CHARSET_METHOD_OFFSET;
        charset.code_offset = XINT (val);
  
-      i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
-      charset.min_char = i + charset.code_offset;
        i = CODE_POINT_TO_INDEX (&charset, charset.max_code);
-      charset.max_char = i + charset.code_offset;
-      if (charset.max_char > MAX_CHAR)
+      if (MAX_CHAR - charset.code_offset < i)
         error ("Unsupported max char: %d", charset.max_char);
+      charset.max_char = i + charset.code_offset;
+      i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
+      charset.min_char = i + charset.code_offset;
  
        i = (charset.min_char >> 7) << 7;
        for (; i < 0x10000 && i <= charset.max_char; i += 128)
@@ -1385,8 +1387,8 @@ Optional third argument DEUNIFY, if non-nil, means to de-unify CHARSET.  */)
      }
    else if (CHAR_TABLE_P (Vchar_unify_table))
      {
-      int min_code = CHARSET_MIN_CODE (cs);
-      int max_code = CHARSET_MAX_CODE (cs);
+      unsigned min_code = CHARSET_MIN_CODE (cs);
+      unsigned max_code = CHARSET_MAX_CODE (cs);
        int min_char = DECODE_CHAR (cs, min_code);
        int max_char = DECODE_CHAR (cs, max_code);
  
@@ -1830,9 +1832,8 @@ encode_char (struct charset *charset, int c)
      }
    else                         /* method == CHARSET_METHOD_OFFSET */
      {
-      int code_index = c - CHARSET_CODE_OFFSET (charset);
-
-      code = INDEX_TO_CODE_POINT (charset, code_index);
+      code = c - CHARSET_CODE_OFFSET (charset);
+      code = INDEX_TO_CODE_POINT (charset, code);
      }
  
    return code;
author	Paul Eggert <eggert@cs.ucla.edu>
	Sun, 25 Sep 2011 01:22:30 +0000 (18:22 -0700)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Sun, 25 Sep 2011 01:22:30 +0000 (18:22 -0700)
src/ChangeLog		patch \| blob \| history
src/charset.c		patch \| blob \| history