}
-/* Return a character whose multibyte form is at P. Set *LEN to the
- byte length of the multibyte form. */
-
-int
-string_char (const unsigned char *p, int *len)
-{
- int c;
- const unsigned char *saved_p = p;
-
- if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
- {
- /* 1-, 2-, and 3-byte sequences can be handled by the macro. */
- c = string_char_advance (&p);
- }
- else if (! (*p & 0x08))
- {
- /* A 4-byte sequence of this form:
- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
- c = ((((p)[0] & 0x7) << 18)
- | (((p)[1] & 0x3F) << 12)
- | (((p)[2] & 0x3F) << 6)
- | ((p)[3] & 0x3F));
- p += 4;
- }
- else
- {
- /* A 5-byte sequence of this form:
-
- 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-
- Note that the top 4 `x's are always 0, so shifting p[1] can
- never exceed the maximum valid character codepoint. */
- c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
- (((p)[1] & 0x3F) << 18)
- | (((p)[2] & 0x3F) << 12)
- | (((p)[3] & 0x3F) << 6)
- | ((p)[4] & 0x3F));
- p += 5;
- }
-
- *len = p - saved_p;
- return c;
-}
-
-
/* Translate character C by translation table TABLE. If no translation is
found in TABLE, return the untranslated character. If TABLE is a list,
elements are char tables. In that case, recursively translate C by all the
};
extern int char_string (unsigned, unsigned char *);
-extern int string_char (const unsigned char *, int *);
/* UTF-8 encodings. Use \x escapes, so they are portable to pre-C11
compilers and can be concatenated with ordinary string literals. */
INLINE int
string_char_and_length (unsigned char const *p, int *length)
{
- int c, len;
+ int c = p[0]; /* Lead byte of the multibyte form at P.  */
+ if (! (c & 0x80)) /* High bit clear: a 1-byte sequence.  */
+ {
+ *length = 1;
+ return c;
+ }
+ eassume (0xC0 <= c); /* Any other lead byte is assumed to be >= 0xC0.  */
- if (! (p[0] & 0x80))
+ int d = (c << 6) + p[1] - ((0xC0 << 6) + 0x80); /* Append P[1] to C.  The
+ subtrahend removes C's 0xC0 marker bits (shifted along with C) and an
+ 0x80 marker from P[1]; this assumes P[1] has the form 10xxxxxx.  */
+ if (! (c & 0x20)) /* Lead byte 110xxxxx: a 2-byte sequence.  */
{
- len = 1;
- c = p[0];
+ *length = 2;
+ return d + (c < 0xC2 ? 0x3FFF80 : 0); /* Lead bytes 0xC0 and 0xC1 yield
+ D + 0x3FFF80 -- presumably the raw 8-bit byte range; TODO confirm.  */
}
- else if (! (p[0] & 0x20))
+
+ d = (d << 6) + p[2] - ((0x20 << 12) + 0x80); /* Append P[2].  The lead
+ byte's 0x20 bit is set here and now sits at 0x20 << 12; remove it along
+ with the 0x80 marker assumed on P[2].  */
+ if (! (c & 0x10)) /* Lead byte 1110xxxx: a 3-byte sequence.  */
{
- len = 2;
- c = ((((p[0] & 0x1F) << 6)
- | (p[1] & 0x3F))
- + (p[0] < 0xC2 ? 0x3FFF80 : 0));
+ *length = 3;
+ eassume (MAX_2_BYTE_CHAR < d && d <= MAX_3_BYTE_CHAR);
+ return d;
}
- else if (! (p[0] & 0x10))
+
+ d = (d << 6) + p[3] - ((0x10 << 18) + 0x80); /* Append P[3].  The lead
+ byte's 0x10 bit is set here and now sits at 0x10 << 18; remove it along
+ with the 0x80 marker assumed on P[3].  */
+ if (! (c & 0x08)) /* Lead byte 11110xxx: a 4-byte sequence.  */
{
- len = 3;
- c = (((p[0] & 0x0F) << 12)
- | ((p[1] & 0x3F) << 6)
- | (p[2] & 0x3F));
+ *length = 4;
+ eassume (MAX_3_BYTE_CHAR < d && d <= MAX_4_BYTE_CHAR);
+ return d;
}
- else
- c = string_char (p, &len);
- eassume (0 < len && len <= MAX_MULTIBYTE_LENGTH);
- *length = len;
- return c;
+ d = (d << 6) + p[4] - ((0x08 << 24) + 0x80); /* Remaining case: append
+ P[4].  The lead byte's 0x08 bit is set here and now sits at 0x08 << 24;
+ remove it along with the 0x80 marker assumed on P[4].  */
+ *length = 5; /* Everything that reaches here is a 5-byte sequence.  */
+ eassume (MAX_4_BYTE_CHAR < d && d <= MAX_5_BYTE_CHAR);
+ return d;
}
/* Return the character code of character whose multibyte form is at P. */