From: Kenichi Handa Date: Fri, 5 Apr 2013 14:08:56 +0000 (+0900) Subject: Optimize the code for reading UTF-8 files. X-Git-Tag: emacs-24.3.90~173^2^2~42^2~45^2~387^2~2026^2~527 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=251e91474c91e16b101502c2ed7c05fc13e4ecea;p=emacs.git Optimize the code for reading UTF-8 files. --- diff --git a/src/ChangeLog b/src/ChangeLog index 0634ec7cc1d..6b3ca9d3ff3 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,17 @@ +2013-04-03 Kenichi Handa + + The following changes is to optimize the code for reading UTF-8 + files. + + * coding.c (check_ascii): Renamed from detect_ascii. Return value + changed. Check EOL format. Do not call adjust_coding_eol_type + here. + (check_utf_8): New function. + (adjust_coding_eol_type): Do nothing if already adjusted. + (detect_coding): Compare the return value of check_ascii with + coding->src_bytes. Call adjust_coding_eol_type if necessary. + (decode_coding_gap): Optimize for valid UTF-8. + 2013-03-21 Kenichi Handa * coding.c (syms_of_coding): Cancel previous change. @@ -89,7 +103,7 @@ * coding.c (decode_coding_gap): Fix typo caught by static checking. -2013-03-15 handa +2013-03-15 Kenichi Handa * insdel.c (insert_from_gap): New arg text_at_gap_tail. (adjust_after_replace): Make it back to static. Delete the third diff --git a/src/coding.c b/src/coding.c index 8a09cd67859..735af25502d 100644 --- a/src/coding.c +++ b/src/coding.c @@ -6072,17 +6072,18 @@ complement_process_encoding_system (Lisp_Object coding_system) #define EOL_SEEN_CRLF 4 -static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); +static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, + int eol_seen); -/* Return true iff all the source bytes are ASCII. +/* Return the number of ASCII characters at the head of the source. By side effects, set coding->head_ascii and coding->eol_seen. The value of coding->eol_seen is "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when all the source bytes are ASCII. */ -static bool -detect_ascii (struct coding_system *coding) +static int +check_ascii (struct coding_system *coding) { const unsigned char *src, *end; Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); @@ -6096,21 +6097,20 @@ detect_ascii (struct coding_system *coding) src = coding->source; end = src + coding->src_bytes; - if (inhibit_eol_conversion) + if (inhibit_eol_conversion + || eol_seen != EOL_SEEN_NONE) { /* We don't have to check EOL format. */ while (src < end && !( *src & 0x80)) src++; - eol_seen = EOL_SEEN_LF; - adjust_coding_eol_type (coding, eol_seen); - } - else if (eol_seen != EOL_SEEN_NONE) - { - /* We don't have to check EOL format either. */ - while (src < end && !(*src & 0x80)) src++; + if (inhibit_eol_conversion) + { + eol_seen = EOL_SEEN_LF; + adjust_coding_eol_type (coding, eol_seen); + } } else { - end--; /* We look ahead one byte. */ + end--; /* We look ahead one byte for "CR LF". */ while (src < end) { int c = *src; @@ -6118,6 +6118,69 @@ detect_ascii (struct coding_system *coding) if (c & 0x80) break; src++; + if (c == '\r') + { + if (*src == '\n') + { + eol_seen |= EOL_SEEN_CRLF; + src++; + } + else + eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + eol_seen |= EOL_SEEN_LF; + } + if (src == end) + { + int c = *src; + + /* All bytes but the last one C are ASCII. */ + if (! (c & 0x80)) + { + if (c == '\r') + eol_seen |= EOL_SEEN_CR; + else if (c == '\n') + eol_seen |= EOL_SEEN_LF; + src++; + } + } + } + coding->head_ascii = src - coding->source; + coding->eol_seen = eol_seen; + return (coding->head_ascii); +} + + +/* Return the number of charcters at the source if all the bytes are + valid UTF-8 (of Unicode range). Otherwise, return -1. By side + effects, update coding->eol_seen. The value of coding->eol_seen is + "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but + the value is reliable only when all the source bytes are valid + UTF-8. */ + +static int +check_utf_8 (struct coding_system *coding) +{ + const unsigned char *src, *end; + int eol_seen = coding->eol_seen; + int nchars = coding->head_ascii; + + if (coding->head_ascii < 0) + check_ascii (coding); + else + coding_set_source (coding); + src = coding->source + coding->head_ascii; + /* We look ahead one byte for CR LF. */ + end = coding->source + coding->src_bytes - 1; + + while (src < end) + { + int c = *src; + + if (UTF_8_1_OCTET_P (*src)) + { + src++; if (c < 0x20) { if (c == '\r') @@ -6126,6 +6189,7 @@ detect_ascii (struct coding_system *coding) { eol_seen |= EOL_SEEN_CRLF; src++; + nchars++; } else eol_seen |= EOL_SEEN_CR; @@ -6134,27 +6198,58 @@ detect_ascii (struct coding_system *coding) eol_seen |= EOL_SEEN_LF; } } - if (src > end) - /* The last two bytes are CR LF, which means that we have - scanned all bytes. */ - end++; - else if (src == end) + else if (UTF_8_2_OCTET_LEADING_P (c)) { - end++; - if (! (*src & 0x80)) - { - if (*src == '\r') - eol_seen |= EOL_SEEN_CR; - else if (*src == '\n') - eol_seen |= EOL_SEEN_LF; - src++; - } + if (c < 0xC2 /* overlong sequence */ + || src + 1 >= end + || ! UTF_8_EXTRA_OCTET_P (src[1])) + return -1; + src += 2; } - adjust_coding_eol_type (coding, eol_seen); + else if (UTF_8_3_OCTET_LEADING_P (c)) + { + if (src + 2 >= end + || ! (UTF_8_EXTRA_OCTET_P (src[1]) + && UTF_8_EXTRA_OCTET_P (src[2]))) + return -1; + c = (((c & 0xF) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + if (c < 0x800 /* overlong sequence */ + || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */ + return -1; + src += 3; + } + else if (UTF_8_4_OCTET_LEADING_P (c)) + { + if (src + 3 >= end + || ! (UTF_8_EXTRA_OCTET_P (src[1]) + && UTF_8_EXTRA_OCTET_P (src[2]) + && UTF_8_EXTRA_OCTET_P (src[3]))) + return -1; + c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); + if (c < 0x10000 /* overlong sequence */ + || c >= 0x110000) /* non-Unicode character */ + return -1; + src += 4; + } + else + return -1; + nchars++; + } + + if (src == end) + { + if (! UTF_8_1_OCTET_P (*src)) + return -1; + nchars++; + if (*src == '\r') + eol_seen |= EOL_SEEN_CR; + else if (*src == '\n') + eol_seen |= EOL_SEEN_LF; } - coding->head_ascii = src - coding->source; coding->eol_seen = eol_seen; - return (src == end); + return nchars; } @@ -6269,6 +6364,9 @@ adjust_coding_eol_type (struct coding_system *coding, int eol_seen) Lisp_Object eol_type; eol_type = CODING_ID_EOL_TYPE (coding->id); + if (! VECTORP (eol_type)) + /* Already adjusted. */ + return eol_type; if (eol_seen & EOL_SEEN_LF) { coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); @@ -6360,7 +6458,8 @@ detect_coding (struct coding_system *coding) { coding->eol_seen |= EOL_SEEN_CRLF; src++; - coding->head_ascii++; + if (! eight_bit_found) + coding->head_ascii++; } else coding->eol_seen |= EOL_SEEN_CR; @@ -6461,9 +6560,14 @@ detect_coding (struct coding_system *coding) coding_systems = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; - if (detect_ascii (coding)) + if (check_ascii (coding) == coding->src_bytes) { + int head_ascii = coding->head_ascii; + + if (coding->eol_seen != EOL_SEEN_NONE) + adjust_coding_eol_type (coding, coding->eol_seen); setup_coding_system (XCDR (coding_systems), coding); + coding->head_ascii = head_ascii; } else { @@ -7620,15 +7724,27 @@ decode_coding_gap (struct coding_system *coding, if (CODING_REQUIRE_DETECTION (coding)) detect_coding (coding); attrs = CODING_ID_ATTRS (coding->id); - if (! disable_ascii_optimization) - { - if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) - && NILP (CODING_ATTR_POST_READ (attrs)) - && NILP (get_translation_table (attrs, 0, NULL)) - && (coding->head_ascii >= 0 /* We've already called detect_coding */ - ? coding->head_ascii == bytes - : detect_ascii (coding))) + if (! disable_ascii_optimization + && ! coding->src_multibyte + && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) + && NILP (CODING_ATTR_POST_READ (attrs)) + && NILP (get_translation_table (attrs, 0, NULL))) + { + chars = coding->head_ascii; + if (chars < 0) + chars = check_ascii (coding); + if (chars != bytes) + { + if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)) + chars = check_utf_8 (coding); + else + chars = -1; + } + if (chars >= 0) { + if (coding->eol_seen != EOL_SEEN_NONE) + adjust_coding_eol_type (coding, coding->eol_seen); + if (coding->eol_seen == EOL_SEEN_CR) { unsigned char *src_end = GAP_END_ADDR; @@ -7645,6 +7761,7 @@ decode_coding_gap (struct coding_system *coding, unsigned char *src = GAP_END_ADDR; unsigned char *src_beg = src - coding->src_bytes; unsigned char *dst = src; + ptrdiff_t diff; while (src_beg < src) { @@ -7652,10 +7769,13 @@ decode_coding_gap (struct coding_system *coding, if (*src == '\n') src--; } - bytes -= dst - src; + diff = dst - src; + bytes -= diff; + chars -= diff; } - coding->produced_char = coding->produced = bytes; - insert_from_gap (bytes, bytes, 1); + coding->produced = bytes; + coding->produced_char = chars; + insert_from_gap (chars, bytes, 1); return; } }