From 119852e72b941a90312b943c2edd467a53f599d7 Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Sun, 2 Mar 2008 08:05:14 +0000 Subject: [PATCH] (decode_coding_utf_8): When eol-type of CODING is `dos', don't decode '\r' if that is the last in the source. (decode_coding_utf_16, decode_coding_emacs_mule) (decode_coding_iso_2022, decode_coding_sjis, decode_coding_big5) (decode_coding_raw_text, decode_coding_charset): Likewise. (produce_chars): Don't decode EOL here. Use EMACS_INT. --- src/ChangeLog | 9 +++ src/coding.c | 204 +++++++++++++++++++++++++++++--------------------- 2 files changed, 128 insertions(+), 85 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index 0a087cbc85e..04eb2201b77 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,12 @@ +2008-03-02 Kenichi Handa + + * coding.c (decode_coding_utf_8): When eol-type of CODING is + `dos', don't decode '\r' if that is the last in the source. + (decode_coding_utf_16, decode_coding_emacs_mule) + (decode_coding_iso_2022, decode_coding_sjis, decode_coding_big5) + (decode_coding_raw_text, decode_coding_charset): Likewise. + (produce_chars): Don't decode EOL here. Use EMACS_INT. + 2008-03-01 Jason Rumney * w32font.c (w32font_full_name): Report point size for scalable fonts. diff --git a/src/coding.c b/src/coding.c index a43b5cb8187..525764d693d 100644 --- a/src/coding.c +++ b/src/coding.c @@ -955,6 +955,11 @@ record_conversion_result (struct coding_system *coding, } while (0) +/* If there are at least BYTES length of room at dst, allocate memory + for coding->destination and update dst and dst_end. We don't have + to take care of coding->source which will be relocated. It is + handled by calling coding_set_source in encode_coding. */ + #define ASSURE_DESTINATION(bytes) \ do { \ if (dst + (bytes) >= dst_end) \ @@ -1225,6 +1230,8 @@ decode_coding_utf_8 (coding) int consumed_chars = 0, consumed_chars_base; int multibytep = coding->src_multibyte; Lisp_Object attr, charset_list; + int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int byte_after_cr = -1; CODING_GET_INFO (coding, attr, charset_list); @@ -1238,13 +1245,18 @@ decode_coding_utf_8 (coding) if (charbuf >= charbuf_end) break; - ONE_MORE_BYTE (c1); + if (byte_after_cr >= 0) + c1 = byte_after_cr, byte_after_cr = -1; + else + ONE_MORE_BYTE (c1); if (c1 < 0) { c = - c1; } else if (UTF_8_1_OCTET_P(c1)) { + if (eol_crlf && c1 == '\r') + ONE_MORE_BYTE (byte_after_cr); c = c1; } else @@ -1458,6 +1470,8 @@ decode_coding_utf_16 (coding) enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); int surrogate = CODING_UTF_16_SURROGATE (coding); Lisp_Object attr, charset_list; + int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int byte_after_cr1 = -1, byte_after_cr2 = -1; CODING_GET_INFO (coding, attr, charset_list); @@ -1497,13 +1511,19 @@ decode_coding_utf_16 (coding) if (charbuf + 2 >= charbuf_end) break; - ONE_MORE_BYTE (c1); + if (byte_after_cr1 >= 0) + c1 = byte_after_cr1, byte_after_cr1 = -1; + else + ONE_MORE_BYTE (c1); if (c1 < 0) { *charbuf++ = -c1; continue; } - ONE_MORE_BYTE (c2); + if (byte_after_cr2 >= 0) + c2 = byte_after_cr2, byte_after_cr2 = -1; + else + ONE_MORE_BYTE (c2); if (c2 < 0) { *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); @@ -1512,6 +1532,7 @@ decode_coding_utf_16 (coding) } c = (endian == utf_16_big_endian ? ((c1 << 8) | c2) : ((c2 << 8) | c1)); + if (surrogate) { if (! UTF_16_LOW_SURROGATE_P (c)) @@ -1540,7 +1561,14 @@ decode_coding_utf_16 (coding) if (UTF_16_HIGH_SURROGATE_P (c)) CODING_UTF_16_SURROGATE (coding) = surrogate = c; else - *charbuf++ = c; + { + if (eol_crlf && c == '\r') + { + ONE_MORE_BYTE (byte_after_cr1); + ONE_MORE_BYTE (byte_after_cr2); + } + *charbuf++ = c; + } } } @@ -2072,6 +2100,8 @@ decode_coding_emacs_mule (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; + int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); @@ -2085,7 +2115,10 @@ decode_coding_emacs_mule (coding) if (charbuf >= charbuf_end) break; - ONE_MORE_BYTE (c); + if (byte_after_cr >= 0) + c = byte_after_cr, byte_after_cr = -1; + else + ONE_MORE_BYTE (c); if (c < 0) { *charbuf++ = -c; @@ -2093,6 +2126,8 @@ decode_coding_emacs_mule (coding) } else if (c < 0x80) { + if (eol_crlf && c == '\r') + ONE_MORE_BYTE (byte_after_cr); *charbuf++ = c; char_offset++; } @@ -2945,6 +2980,8 @@ decode_coding_iso_2022 (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; + int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); setup_iso_safe_charsets (attrs); @@ -2962,7 +2999,10 @@ decode_coding_iso_2022 (coding) if (charbuf >= charbuf_end) break; - ONE_MORE_BYTE (c1); + if (byte_after_cr >= 0) + c1 = byte_after_cr, byte_after_cr = -1; + else + ONE_MORE_BYTE (c1); if (c1 < 0) goto invalid_code; @@ -3021,6 +3061,8 @@ decode_coding_iso_2022 (coding) break; case ISO_control_0: + if (eol_crlf && c1 == '\r') + ONE_MORE_BYTE (byte_after_cr); MAYBE_FINISH_COMPOSITION (); charset = CHARSET_FROM_ID (charset_ascii); break; @@ -4091,6 +4133,8 @@ decode_coding_sjis (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; + int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); @@ -4111,11 +4155,18 @@ decode_coding_sjis (coding) if (charbuf >= charbuf_end) break; - ONE_MORE_BYTE (c); + if (byte_after_cr >= 0) + c = byte_after_cr, byte_after_cr = -1; + else + ONE_MORE_BYTE (c); if (c < 0) goto invalid_code; if (c < 0x80) - charset = charset_roman; + { + if (eol_crlf && c == '\r') + ONE_MORE_BYTE (byte_after_cr); + charset = charset_roman; + } else if (c == 0x80 || c == 0xA0) goto invalid_code; else if (c >= 0xA1 && c <= 0xDF) @@ -4193,6 +4244,8 @@ decode_coding_big5 (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; + int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); val = charset_list; @@ -4210,12 +4263,19 @@ decode_coding_big5 (coding) if (charbuf >= charbuf_end) break; - ONE_MORE_BYTE (c); + if (byte_after_cr >= 0) + c1 = byte_after_cr, byte_after_cr = -1; + else + ONE_MORE_BYTE (c); if (c < 0) goto invalid_code; if (c < 0x80) - charset = charset_roman; + { + if (eol_crlf && c1 == '\r') + ONE_MORE_BYTE (byte_after_cr); + charset = charset_roman; + } else { /* BIG5 -> Big5 */ @@ -4632,10 +4692,19 @@ static void decode_coding_raw_text (coding) struct coding_system *coding; { + int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + coding->chars_at_source = 1; - coding->consumed_char = 0; - coding->consumed = 0; - record_conversion_result (coding, CODING_RESULT_SUCCESS); + coding->consumed_char = coding->src_chars; + coding->consumed = coding->src_bytes; + if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r') + { + coding->consumed_char--; + coding->consumed--; + record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC); + } + else + record_conversion_result (coding, CODING_RESULT_SUCCESS); } static int @@ -4829,6 +4898,8 @@ decode_coding_charset (coding) int char_offset = coding->produced_char; int last_offset = char_offset; int last_id = charset_ascii; + int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); + int byte_after_cr = -1; CODING_GET_INFO (coding, attrs, charset_list); valids = AREF (attrs, coding_attr_charset_valids); @@ -4848,7 +4919,17 @@ decode_coding_charset (coding) if (charbuf >= charbuf_end) break; - ONE_MORE_BYTE (c); + if (byte_after_cr >= 0) + { + c = byte_after_cr; + byte_after_cr = -1; + } + else + { + ONE_MORE_BYTE (c); + if (eol_crlf && c == '\r') + ONE_MORE_BYTE (byte_after_cr); + } if (c < 0) goto invalid_code; code = c; @@ -5880,13 +5961,13 @@ produce_chars (coding, translation_table, last_block) { unsigned char *dst = coding->destination + coding->produced; unsigned char *dst_end = coding->destination + coding->dst_bytes; - int produced; - int produced_chars = 0; + EMACS_INT produced; + EMACS_INT produced_chars = 0; int carryover = 0; if (! coding->chars_at_source) { - /* Characters are in coding->charbuf. */ + /* Source characters are in coding->charbuf. */ int *buf = coding->charbuf; int *buf_end = buf + coding->charbuf_used; @@ -5945,18 +6026,16 @@ produce_chars (coding, translation_table, last_block) } else { + /* Source characters are at coding->source. */ const unsigned char *src = coding->source; - const unsigned char *src_end = src + coding->src_bytes; - Lisp_Object eol_type; - - eol_type = CODING_ID_EOL_TYPE (coding->id); + const unsigned char *src_end = src + coding->consumed; if (coding->src_multibyte != coding->dst_multibyte) { if (coding->src_multibyte) { int multibytep = 1; - int consumed_chars; + EMACS_INT consumed_chars; while (1) { @@ -5964,37 +6043,21 @@ produce_chars (coding, translation_table, last_block) int c; ONE_MORE_BYTE (c); - if (c == '\r') + if (dst == dst_end) { - if (EQ (eol_type, Qdos)) + if (EQ (coding->src_object, coding->dst_object)) + dst_end = (unsigned char *) src; + if (dst == dst_end) { - if (src == src_end) - { - record_conversion_result - (coding, CODING_RESULT_INSUFFICIENT_SRC); - goto no_more_source; - } - if (*src == '\n') - c = *src++; + EMACS_INT offset = src - coding->source; + + dst = alloc_destination (coding, src_end - src + 1, + dst); + dst_end = coding->destination + coding->dst_bytes; + coding_set_source (coding); + src = coding->source + offset; + src_end = coding->source + coding->src_bytes; } - else if (EQ (eol_type, Qmac)) - c = '\n'; - } - if (dst == dst_end) - { - coding->consumed = src - coding->source; - - if (EQ (coding->src_object, coding->dst_object)) - dst_end = (unsigned char *) src; - if (dst == dst_end) - { - dst = alloc_destination (coding, src_end - src + 1, - dst); - dst_end = coding->destination + coding->dst_bytes; - coding_set_source (coding); - src = coding->source + coding->consumed; - src_end = coding->source + coding->src_bytes; - } } *dst++ = c; produced_chars++; @@ -6008,30 +6071,19 @@ produce_chars (coding, translation_table, last_block) int multibytep = 1; int c = *src++; - if (c == '\r') - { - if (EQ (eol_type, Qdos)) - { - if (src < src_end - && *src == '\n') - c = *src++; - } - else if (EQ (eol_type, Qmac)) - c = '\n'; - } if (dst >= dst_end - 1) { - coding->consumed = src - coding->source; - if (EQ (coding->src_object, coding->dst_object)) dst_end = (unsigned char *) src; if (dst >= dst_end - 1) { + EMACS_INT offset = src - coding->source; + dst = alloc_destination (coding, src_end - src + 2, dst); dst_end = coding->destination + coding->dst_bytes; coding_set_source (coding); - src = coding->source + coding->consumed; + src = coding->source + offset; src_end = coding->source + coding->src_bytes; } } @@ -6042,7 +6094,7 @@ produce_chars (coding, translation_table, last_block) { if (!EQ (coding->src_object, coding->dst_object)) { - int require = coding->src_bytes - coding->dst_bytes; + EMACS_INT require = coding->src_bytes - coding->dst_bytes; if (require > 0) { @@ -6054,28 +6106,10 @@ produce_chars (coding, translation_table, last_block) src_end = coding->source + coding->src_bytes; } } - produced_chars = coding->src_chars; + produced_chars = coding->consumed_char; while (src < src_end) - { - int c = *src++; - - if (c == '\r') - { - if (EQ (eol_type, Qdos)) - { - if (src < src_end - && *src == '\n') - c = *src++; - produced_chars--; - } - else if (EQ (eol_type, Qmac)) - c = '\n'; - } - *dst++ = c; - } + *dst += *src++; } - coding->consumed = coding->src_bytes; - coding->consumed_char = coding->src_chars; } produced = dst - (coding->destination + coding->produced); -- 2.39.5