From: Eli Zaretskii Date: Fri, 6 Apr 2012 13:10:30 +0000 (+0300) Subject: Warning comments about subtleties of fetching characters from buffers/strings. X-Git-Tag: emacs-24.0.96~111^2~14 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=2f8e16b2a3c5782a3c8266cc76fbba80d506b93d;p=emacs.git Warning comments about subtleties of fetching characters from buffers/strings. src/buffer.h (FETCH_CHAR, FETCH_MULTIBYTE_CHAR): src/character.h (STRING_CHAR, STRING_CHAR_AND_LENGTH): Add comments about subtle differences between FETCH_CHAR* and STRING_CHAR* macros related to unification of CJK characters. For the details, see the discussion following the message here: http://debbugs.gnu.org/cgi/bugreport.cgi?bug=11073#14. --- diff --git a/src/ChangeLog b/src/ChangeLog index ea770969818..af65d38a33b 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,12 @@ +2012-04-06 Eli Zaretskii + + * buffer.h (FETCH_CHAR, FETCH_MULTIBYTE_CHAR): + * character.h (STRING_CHAR, STRING_CHAR_AND_LENGTH): Add comments + about subtle differences between FETCH_CHAR* and STRING_CHAR* + macros related to unification of CJK characters. For the details, + see the discussion following the message here: + http://debbugs.gnu.org/cgi/bugreport.cgi?bug=11073#14. + 2012-04-04 Chong Yidong * keyboard.c (Vdelayed_warnings_list): Doc fix. diff --git a/src/buffer.h b/src/buffer.h index 3df4a95cf93..1635a847839 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -343,7 +343,8 @@ while (0) - (ptr - (current_buffer)->text->beg <= GPT_BYTE - BEG_BYTE ? 0 : GAP_SIZE) \ + BEG_BYTE) -/* Return character at byte position POS. */ +/* Return character at byte position POS. See the caveat WARNING for + FETCH_MULTIBYTE_CHAR below. */ #define FETCH_CHAR(pos) \ (!NILP (BVAR (current_buffer, enable_multibyte_characters)) \ @@ -359,7 +360,17 @@ extern unsigned char *_fetch_multibyte_char_p; /* Return character code of multi-byte form at byte position POS. 
If POS doesn't point the head of valid multi-byte form, only the byte at - POS is returned. No range checking. */ + POS is returned. No range checking. + + WARNING: The character returned by this macro could be "unified" + inside STRING_CHAR, if the original character in the buffer belongs + to one of the Private Use Areas (PUAs) of codepoints that Emacs + uses to support non-unified CJK characters. If that happens, + CHAR_BYTES will return a value that is different from the length of + the original multibyte sequence stored in the buffer. Therefore, + do _not_ use FETCH_MULTIBYTE_CHAR if you need to advance through + the buffer to the next character after fetching this one. Instead, + use either FETCH_CHAR_ADVANCE or STRING_CHAR_AND_LENGTH. */ #define FETCH_MULTIBYTE_CHAR(pos) \ (_fetch_multibyte_char_p = (((pos) >= GPT_BYTE ? GAP_SIZE : 0) \ diff --git a/src/character.h b/src/character.h index 5ae6cb8c49c..a829def428d 100644 --- a/src/character.h +++ b/src/character.h @@ -292,7 +292,9 @@ along with GNU Emacs. If not, see . */ } while (0) /* Return the character code of character whose multibyte form is at - P. */ + P. Note that this macro unifies CJK characters whose codepoints + are in the Private Use Areas (PUAs), so it might return a different + codepoint from the one actually stored at P. */ #define STRING_CHAR(p) \ (!((p)[0] & 0x80) \ @@ -309,7 +311,15 @@ along with GNU Emacs. If not, see . */ /* Like STRING_CHAR, but set ACTUAL_LEN to the length of multibyte - form. */ + form. + + Note: This macro returns the actual length of the character's + multibyte sequence as it is stored in a buffer or string. The + character it returns might have a different codepoint that has a + different multibyte sequence of a different length, due to possible + unification of CJK characters inside string_char. Therefore do NOT + assume that the length returned by this macro is identical to the + length of the multibyte sequence of the character it returns. 
*/ #define STRING_CHAR_AND_LENGTH(p, actual_len) \ (!((p)[0] & 0x80) \