From 8a44e6d176989d8eef140314098c76a70248ba61 Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Sat, 16 Mar 2013 01:03:54 +0900 Subject: [PATCH] Optimize ASCII file reading with EOL format detection and decoding. --- src/ChangeLog | 28 +++++++ src/coding.c | 197 ++++++++++++++++++++++++++++++++++++++++---------- src/coding.h | 6 +- src/insdel.c | 33 ++++----- src/lisp.h | 4 +- 5 files changed, 209 insertions(+), 59 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 8ae25e6e612..44e2ff1a1f1 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,31 @@ +2013-03-15 handa + + * insdel.c (insert_from_gap): New arg text_at_gap_tail. + (adjust_after_replace): Make it back to static. Delete the last + arg text_at_gap_tail. Cancel the code for handling it. + + * coding.h (struct coding_system): New member eol_seen. + + * coding.c (detect_ascii): New function. + (detect_coding): Set coding->head_ascii and coding->eol_seen only + when the source bytes are actually scanned. On detecting for + coding_category_utf_8_auto, call detect_ascii instead of scanning + source bytes directly. + (produce_chars): Call insert_from_gap with the new arg 0. + (encode_coding): Likewise. + (decode_coding_gap): Control ASCII optimization by the variable + disable_ascii_optimization instead of #ifndef .. #endif. + Decode EOL format according to coding->eol_seen. + (syms_of_coding): Declare disable-ascii-optimization as a Lisp + variable. + + * global.h (struct emacs_globals): New member + f_disable_ascii_optimization. + (disable_ascii_optimization): New macro. + + * lisp.h (adjust_after_replace): Cancel externing it. + (insert_from_gap): Adjust prototype. + 2013-03-11 Paul Eggert * insdel.c (adjust_after_replace): Use bool for boolean. 
diff --git a/src/coding.c b/src/coding.c index c18632f301b..5047e1149bc 100644 --- a/src/coding.c +++ b/src/coding.c @@ -6071,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system) #define EOL_SEEN_CR 2 #define EOL_SEEN_CRLF 4 + +static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen); + + +/* Return 1 if all the source bytes are ASCII, and return 0 otherwise. + By side effects, set coding->head_ascii and coding->eol_seen. The + value of coding->eol_seen is the bitwise OR of EOL_SEEN_LF, + EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when + all the source bytes are ASCII. */ + +static bool +detect_ascii (struct coding_system *coding) +{ + const unsigned char *src, *end; + Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id); + int eol_seen; + + eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE + : EQ (eol_type, Qunix) ? EOL_SEEN_LF + : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF + : EOL_SEEN_CR); + coding_set_source (coding); + src = coding->source; + end = src + coding->src_bytes; + + if (inhibit_eol_conversion) + { + /* We don't have to check EOL format. */ + while (src < end && !( *src & 0x80)) src++; + eol_seen = EOL_SEEN_LF; + adjust_coding_eol_type (coding, eol_seen); + } + else if (eol_seen != EOL_SEEN_NONE) + { + /* We don't have to check EOL format either. */ + while (src < end && !(*src & 0x80)) src++; + } + else + { + end--; /* We look ahead one byte. */ + while (src < end) + { + int c = *src; + + if (c & 0x80) + break; + src++; + if (c < 0x20) + { + if (c == '\r') + { + if (*src == '\n') + { + eol_seen |= EOL_SEEN_CRLF; + src++; + } + else + eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + eol_seen |= EOL_SEEN_LF; + } + } + if (src > end) + /* The last two bytes are CR LF, which means that we have + scanned all bytes. */ + end++; + else if (src == end) + { + end++; + if (! 
(*src & 0x80)) + { + if (*src == '\r') + eol_seen |= EOL_SEEN_CR; + else if (*src == '\n') + eol_seen |= EOL_SEEN_LF; + src++; + } + } + adjust_coding_eol_type (coding, eol_seen); + } + coding->head_ascii = src - coding->source; + coding->eol_seen = eol_seen; + return (src == end); +} + + /* Detect how end-of-line of a text of length SRC_BYTES pointed by SOURCE is encoded. If CATEGORY is one of coding_category_utf_16_XXXX, assume that CR and LF are encoded by @@ -6215,7 +6302,6 @@ detect_coding (struct coding_system *coding) coding_set_source (coding); src_end = coding->source + coding->src_bytes; - coding->head_ascii = 0; /* If we have not yet decided the text encoding type, detect it now. */ @@ -6225,6 +6311,8 @@ detect_coding (struct coding_system *coding) struct coding_detection_info detect_info; bool null_byte_found = 0, eight_bit_found = 0; + coding->head_ascii = 0; + coding->eol_seen = EOL_SEEN_NONE; detect_info.checked = detect_info.found = detect_info.rejected = 0; for (src = coding->source; src < src_end; src++) { @@ -6263,6 +6351,26 @@ detect_coding (struct coding_system *coding) if (eight_bit_found) break; } + else if (! disable_ascii_optimization + && ! inhibit_eol_conversion) + { + if (c == '\r') + { + if (src + 1 < src_end /* peek only at a byte inside the source */ && src[1] == '\n') + { + coding->eol_seen |= EOL_SEEN_CRLF; + src++; + coding->head_ascii++; + } + else + coding->eol_seen |= EOL_SEEN_CR; + } + else if (c == '\n') + { + coding->eol_seen |= EOL_SEEN_LF; + } + } + if (! 
eight_bit_found) coding->head_ascii++; } @@ -6353,19 +6461,20 @@ detect_coding (struct coding_system *coding) coding_systems = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; - for (src = coding->source; src < src_end; src++) + if (detect_ascii (coding)) { - if (*src & 0x80) - break; + if (CONSP (coding_systems)) /* guard XCDR as the non-ASCII branch below does */ setup_coding_system (XCDR (coding_systems), coding); } - coding->head_ascii = src - coding->source; - if (CONSP (coding_systems) - && detect_coding_utf_8 (coding, &detect_info)) + else { - if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) - setup_coding_system (XCAR (coding_systems), coding); - else - setup_coding_system (XCDR (coding_systems), coding); + if (CONSP (coding_systems) + && detect_coding_utf_8 (coding, &detect_info)) + { + if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) + setup_coding_system (XCAR (coding_systems), coding); + else + setup_coding_system (XCDR (coding_systems), coding); + } } } else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) @@ -6378,6 +6487,7 @@ detect_coding (struct coding_system *coding) = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; coding->head_ascii = 0; + coding->eol_seen = EOL_SEEN_NONE; if (CONSP (coding_systems) && detect_coding_utf_16 (coding, &detect_info)) { @@ -6815,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table, produced = dst - (coding->destination + coding->produced); if (BUFFERP (coding->dst_object) && produced_chars > 0) - insert_from_gap (produced_chars, produced); + insert_from_gap (produced_chars, produced, 0); coding->produced += produced; coding->produced_char += produced_chars; return carryover; @@ -7400,7 +7510,7 @@ encode_coding (struct coding_system *coding) } while (coding->consumed_char < coding->src_chars); if (BUFFERP (coding->dst_object) && coding->produced_char > 0) - insert_from_gap (coding->produced_char, coding->produced); + insert_from_gap 
(coding->produced_char, coding->produced, 0); SAFE_FREE (); } @@ -7510,39 +7620,45 @@ decode_coding_gap (struct coding_system *coding, if (CODING_REQUIRE_DETECTION (coding)) detect_coding (coding); attrs = CODING_ID_ATTRS (coding->id); -#ifndef CODING_DISABLE_ASCII_OPTIMIZATION - if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) - && NILP (CODING_ATTR_POST_READ (attrs)) - && NILP (get_translation_table (attrs, 0, NULL)) - && (inhibit_eol_conversion - || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))) + if (! disable_ascii_optimization) { - /* We can skip the conversion if all source bytes are ASCII. */ - if (coding->head_ascii < 0) + if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)) + && NILP (CODING_ATTR_POST_READ (attrs)) + && NILP (get_translation_table (attrs, 0, NULL)) + && (coding->head_ascii >= 0 /* We've already called detect_coding */ + ? coding->head_ascii == bytes + : detect_ascii (coding))) { - /* We have not yet counted the number of ASCII bytes at the - head of the source. Do it now. */ - const unsigned char *src, *src_end; + if (coding->eol_seen == EOL_SEEN_CR) + { + unsigned char *src_end = GAP_END_ADDR; + unsigned char *src = src_end - coding->src_bytes; /* first source byte, SRC_BYTES before the gap end */ - coding_set_source (coding); - src_end = coding->source + coding->src_bytes; - for (src = coding->source; src < src_end; src++) + while (src < src_end) + { + if (*src++ == '\r') + src[-1] = '\n'; + } + } + else if (coding->eol_seen == EOL_SEEN_CRLF) { - if (*src & 0x80) - break; + unsigned char *src = GAP_END_ADDR; + unsigned char *src_beg = src - coding->src_bytes; + unsigned char *dst = src; + + while (src_beg < src) + { + *--dst = *--src; + if (*src == '\n' && src > src_beg && src[-1] == '\r') /* drop only a CR that really precedes this LF */ + src--; + } + bytes -= dst - src; } - coding->head_ascii = src - coding->source; - } - if (coding->src_bytes == coding->head_ascii) - { - /* No need of conversion. Use the data in the gap as is. 
*/ - coding->produced_char = chars; - coding->produced = bytes; - adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1); + coding->produced_char = coding->produced = bytes; + insert_from_gap (bytes, bytes, 1); return; } } -#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */ code_conversion_save (0, 0); coding->mode |= CODING_MODE_LAST_BLOCK; @@ -10758,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and decode text as usual. */); inhibit_null_byte_detection = 0; + DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization, + doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files. +Internal use only. Removed after the experimental optimizer gets stable. */); + disable_ascii_optimization = 0; + DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, doc: /* Char table for translating self-inserting characters. This is applied to the result of input methods, not their input. diff --git a/src/coding.h b/src/coding.h index c13567c3d53..d40209be68f 100644 --- a/src/coding.h +++ b/src/coding.h @@ -440,9 +440,13 @@ struct coding_system /* How may heading bytes we can skip for decoding. This is set to -1 in setup_coding_system, and updated by detect_coding. So, when this is equal to the byte length of the text being - converted, we can skip the actual conversion process. */ + converted, we can skip the actual conversion process except for + the eol format. */ ptrdiff_t head_ascii; + /* Used internally in coding.c. See the comment of detect_ascii. */ + int eol_seen; + /* The following members are set by encoding/decoding routine. */ ptrdiff_t produced, produced_char, consumed, consumed_char; diff --git a/src/insdel.c b/src/insdel.c index c0afa80d5e8..a60fed0c32e 100644 --- a/src/insdel.c +++ b/src/insdel.c @@ -977,10 +977,11 @@ insert_from_string_1 (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte, } /* Insert a sequence of NCHARS chars which occupy NBYTES bytes - starting at GPT_ADDR. 
*/ + starting at GAP_END_ADDR - NBYTES (if text_at_gap_tail) and at + GPT_ADDR (if not text_at_gap_tail). */ void -insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes) +insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes, bool text_at_gap_tail) { if (NILP (BVAR (current_buffer, enable_multibyte_characters))) nchars = nbytes; @@ -989,10 +990,13 @@ insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes) MODIFF++; GAP_SIZE -= nbytes; - GPT += nchars; + if (! text_at_gap_tail) + { + GPT += nchars; + GPT_BYTE += nbytes; + } ZV += nchars; Z += nchars; - GPT_BYTE += nbytes; ZV_BYTE += nbytes; Z_BYTE += nbytes; if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */ @@ -1010,7 +1014,7 @@ insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes) current_buffer, 0); } - if (GPT - nchars < PT) + if (! text_at_gap_tail && GPT - nchars < PT) adjust_point (nchars, nbytes); check_markers (); @@ -1162,16 +1166,14 @@ insert_from_buffer_1 (struct buffer *buf, /* Record undo information and adjust markers and position keepers for a replacement of a text PREV_TEXT at FROM to a new text of LEN - chars (LEN_BYTE bytes). If TEXT_AT_GAP_TAIL, the new text - resides at the gap tail; i.e. at (GAP_END_ADDR - LEN_BYTE) - Otherwise, the text resides in the gap just after GPT_BYTE. + chars (LEN_BYTE bytes) which resides in the gap just after + GPT_ADDR. PREV_TEXT nil means the new text was just inserted. */ -void +static void adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte, - Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte, - bool text_at_gap_tail) + Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte) { ptrdiff_t nchars_del = 0, nbytes_del = 0; @@ -1191,11 +1193,8 @@ adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte, GAP_SIZE -= len_byte; ZV += len; Z+= len; ZV_BYTE += len_byte; Z_BYTE += len_byte; - if (! text_at_gap_tail) - { - GPT += len; GPT_BYTE += len_byte; - if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. 
*/ - } + GPT += len; GPT_BYTE += len_byte; + if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */ if (nchars_del > 0) adjust_markers_for_replace (from, from_byte, nchars_del, nbytes_del, @@ -1250,7 +1249,7 @@ adjust_after_insert (ptrdiff_t from, ptrdiff_t from_byte, GPT -= len; GPT_BYTE -= len_byte; ZV -= len; ZV_BYTE -= len_byte; Z -= len; Z_BYTE -= len_byte; - adjust_after_replace (from, from_byte, Qnil, newlen, len_byte, 0); + adjust_after_replace (from, from_byte, Qnil, newlen, len_byte); } /* Replace the text from character positions FROM to TO with NEW, diff --git a/src/lisp.h b/src/lisp.h index bb9f60b29f9..b2ab5684d4d 100644 --- a/src/lisp.h +++ b/src/lisp.h @@ -2880,7 +2880,7 @@ extern void insert (const char *, ptrdiff_t); extern void insert_and_inherit (const char *, ptrdiff_t); extern void insert_1_both (const char *, ptrdiff_t, ptrdiff_t, bool, bool, bool); -extern void insert_from_gap (ptrdiff_t, ptrdiff_t); +extern void insert_from_gap (ptrdiff_t, ptrdiff_t, bool text_at_gap_tail); extern void insert_from_string (Lisp_Object, ptrdiff_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, bool); extern void insert_from_buffer (struct buffer *, ptrdiff_t, ptrdiff_t, bool); @@ -2900,8 +2900,6 @@ extern Lisp_Object del_range_2 (ptrdiff_t, ptrdiff_t, extern void modify_region_1 (ptrdiff_t, ptrdiff_t, bool); extern void prepare_to_modify_buffer (ptrdiff_t, ptrdiff_t, ptrdiff_t *); extern void signal_after_change (ptrdiff_t, ptrdiff_t, ptrdiff_t); -extern void adjust_after_replace (ptrdiff_t, ptrdiff_t, Lisp_Object, - ptrdiff_t, ptrdiff_t, bool); extern void adjust_after_insert (ptrdiff_t, ptrdiff_t, ptrdiff_t, ptrdiff_t, ptrdiff_t); extern void adjust_markers_for_delete (ptrdiff_t, ptrdiff_t, -- 2.39.2