while (ptr < endp)
{
- int len = MULTIBYTE_LENGTH (ptr, endp);
+ int len = multibyte_length (ptr, endp, true, true);
if (len == 0)
emacs_abort ();
ptrdiff_t *nchars, ptrdiff_t *nbytes)
{
const unsigned char *endp = str + len;
- int n;
ptrdiff_t chars = 0, bytes = 0;
if (len >= MAX_MULTIBYTE_LENGTH)
const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
while (str < adjusted_endp)
{
- if (! CHAR_BYTE8_HEAD_P (*str)
- && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
+ int n = multibyte_length (str, NULL, false, false);
+ if (0 < n)
str += n, bytes += n;
else
str++, bytes += 2;
}
while (str < endp)
{
- if (! CHAR_BYTE8_HEAD_P (*str)
- && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
+ int n = multibyte_length (str, endp, true, false);
+ if (0 < n)
str += n, bytes += n;
else
str++, bytes += 2;
unsigned char *p = str, *endp = str + nbytes;
unsigned char *to;
ptrdiff_t chars = 0;
- int n;
if (nbytes >= MAX_MULTIBYTE_LENGTH)
{
unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
- while (p < adjusted_endp
- && ! CHAR_BYTE8_HEAD_P (*p)
- && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
- p += n, chars++;
+ while (p < adjusted_endp)
+ {
+ int n = multibyte_length (p, NULL, false, false);
+ if (n <= 0)
+ break;
+ p += n, chars++;
+ }
+ }
+ while (true)
+ {
+ int n = multibyte_length (p, endp, true, false);
+ if (n <= 0)
+ break;
+ p += n, chars++;
}
- while (p < endp
- && ! CHAR_BYTE8_HEAD_P (*p)
- && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
- p += n, chars++;
if (nchars)
*nchars = chars;
if (p == endp)
unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
while (p < adjusted_endp)
{
- if (! CHAR_BYTE8_HEAD_P (*p)
- && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
+ int n = multibyte_length (p, NULL, false, false);
+ if (0 < n)
{
while (n--)
*to++ = *p++;
}
while (p < endp)
{
- if (! CHAR_BYTE8_HEAD_P (*p)
- && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
+ int n = multibyte_length (p, endp, true, false);
+ if (0 < n)
{
while (n--)
*to++ = *p++;
/* character code 1st byte byte sequence
-------------- -------- -------------
0-7F 00..7F 0xxxxxxx
- 80-7FF C2..DF 110xxxxx 10xxxxxx
- 800-FFFF E0..EF 1110xxxx 10xxxxxx 10xxxxxx
- 10000-1FFFFF F0..F7 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- 200000-3FFF7F F8 11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 80-7FF C2..DF 110yyyyx 10xxxxxx
+ 800-FFFF E0..EF 1110yyyy 10yxxxxx 10xxxxxx
+ 10000-1FFFFF F0..F7 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
+ 200000-3FFF7F F8 11111000 1000yxxx 10xxxxxx 10xxxxxx 10xxxxxx
3FFF80-3FFFFF C0..C1 1100000x 10xxxxxx (for eight-bit-char)
400000-... invalid
invalid 1st byte 80..BF 10xxxxxx
- F9..FF 11111xxx (xxx != 000)
+ F9..FF 11111yyy
+
+ In each bit pattern, 'x' and 'y' each represent a single bit of the
+ character code payload, and least one 'y' must be a 1 bit.
+ In the 5-byte sequence, the 22-bit payload cannot exceed 3FFF7F.
*/
/* Maximum character code ((1 << CHARACTERBITS) - 1). */
}
/* How many bytes a character that starts with BYTE occupies in a
- multibyte form. Unlike MULTIBYTE_LENGTH below, this function does not
+ multibyte form. Unlike multibyte_length, this function does not
validate the multibyte form, but looks only at its first byte. */
INLINE int
BYTES_BY_CHAR_HEAD (int byte)
}
-/* The byte length of multibyte form at unibyte string P ending at
- PEND. If the string doesn't point to a valid multibyte form,
- return 0. Unlike BYTES_BY_CHAR_HEAD, this macro validates the
- multibyte form. */
+/* The byte length of the multibyte form at the unibyte string P,
+ ending at PEND if CHECK, and without a length check if !CHECK.
+ If ALLOW_8BIT, allow multibyte forms of eight-bit characters.
+ If the string doesn't point to a valid multibyte form, return 0.
+ Unlike BYTES_BY_CHAR_HEAD, this function validates the multibyte form. */
INLINE int
-MULTIBYTE_LENGTH (unsigned char const *p, unsigned char const *pend)
-{
- return (! (p < pend) ? 0
- : ! (p[0] & 0x80) ? 1
- : ! (p + 1 < pend && (p[1] & 0xC0) == 0x80) ? 0
- : (p[0] & 0xE0) == 0xC0 ? 2
- : ! (p + 2 < pend && (p[2] & 0xC0) == 0x80) ? 0
- : (p[0] & 0xF0) == 0xE0 ? 3
- : ! (p + 3 < pend && (p[3] & 0xC0) == 0x80) ? 0
- : (p[0] & 0xF8) == 0xF0 ? 4
- : ! (p + 4 < pend && (p[4] & 0xC0) == 0x80) ? 0
- : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
- : 0);
-}
-
-
-/* Like MULTIBYTE_LENGTH, but don't check the ending address. The
- multibyte form is still validated, unlike BYTES_BY_CHAR_HEAD. */
+multibyte_length (unsigned char const *p, unsigned char const *pend,
+ bool check, bool allow_8bit)
+{
+ if (!check || p < pend)
+ {
+ unsigned char c = p[0];
+ if (c < 0x80)
+ return 1;
+ if (!check || p + 1 < pend)
+ {
+ /* The 'unsigned int' avoids int overflow in the 5-byte case. */
+ unsigned int d = p[1];
+
+ if (TRAILING_CODE_P (d))
+ {
+ if (allow_8bit ? (c & 0xE0) == 0xC0 : 0xC2 <= c && c <= 0xDF)
+ return 2;
+ if ((!check || p + 2 < pend)
+ && TRAILING_CODE_P (p[2]))
+ {
+ if ((c & 0xF0) == 0xE0 && ((c & 0x0F) | (d & 0x20)))
+ return 3;
+ if ((!check || p + 3 < pend) && TRAILING_CODE_P (p[3]))
+ {
+ if ((c & 0xF8) == 0xF0 && ((c & 0x07) | (d & 0x30)))
+ return 4;
+ if (c == 0xF8 && (!check || p + 4 < pend)
+ && TRAILING_CODE_P (p[4]))
+ {
+ unsigned int w = ((d << 24) + (p[2] << 16)
+ + (p[3] << 8) + p[4]);
+ if (0x88808080 <= w && w <= 0x8FBFBDBF)
+ return 5;
+ }
+ }
+ }
+ }
+ }
+ }
-INLINE int
-MULTIBYTE_LENGTH_NO_CHECK (unsigned char const *p)
-{
- return (!(p[0] & 0x80) ? 1
- : (p[1] & 0xC0) != 0x80 ? 0
- : (p[0] & 0xE0) == 0xC0 ? 2
- : (p[2] & 0xC0) != 0x80 ? 0
- : (p[0] & 0xF0) == 0xE0 ? 3
- : (p[3] & 0xC0) != 0x80 ? 0
- : (p[0] & 0xF8) == 0xF0 ? 4
- : (p[4] & 0xC0) != 0x80 ? 0
- : p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
- : 0);
+ return 0;
}
if (! multibytep)
{
- int bytes;
-
if (coding->encoder == encode_coding_raw_text
|| coding->encoder == encode_coding_ccl)
c = *src++, pos++;
- else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
- c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
else
- c = BYTE8_TO_CHAR (*src), src++, pos++;
+ {
+ int bytes = multibyte_length (src, src_end, true, true);
+ if (0 < bytes)
+ c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
+ else
+ c = BYTE8_TO_CHAR (*src), src++, pos++;
+ }
}
else
c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
for (i = 1; i < to_nchars; i++)
*buf++ = XFIXNUM (AREF (trans, i));
for (i = 1; i < from_nchars; i++, pos++)
- src += MULTIBYTE_LENGTH_NO_CHECK (src);
+ src += multibyte_length (src, NULL, false, true);
}
}