CHECK_STRING (string);
bufp = compile_pattern (string, &search_regs,
(!NILP (current_buffer->case_fold_search)
- ? DOWNCASE_TABLE : Qnil),
+ ? current_buffer->case_canon_table : Qnil),
posix,
!NILP (current_buffer->enable_multibyte_characters));
bufp = compile_pattern (regexp, &search_regs,
(!NILP (current_buffer->case_fold_search)
- ? DOWNCASE_TABLE : Qnil),
+ ? current_buffer->case_canon_table : Qnil),
posix,
STRING_MULTIBYTE (string));
immediate_quit = 1;
regexp = string_make_unibyte (regexp);
re_match_object = Qt;
bufp = compile_pattern (regexp, 0,
- Vascii_downcase_table, 0,
+ Vascii_canon_table, 0,
0);
immediate_quit = 1;
val = re_search (bufp, string, len, 0, len, 0);
int val;
struct re_pattern_buffer *bufp;
- bufp = compile_pattern (regexp, 0, Vascii_downcase_table,
+ bufp = compile_pattern (regexp, 0, Vascii_canon_table,
0, STRING_MULTIBYTE (string));
immediate_quit = 1;
re_match_object = string;
unsigned char *patbuf;
int multibyte = !NILP (current_buffer->enable_multibyte_characters);
unsigned char *base_pat = SDATA (string);
- int charset_base = -1;
+ /* Set to nozero if we find a non-ASCII char that need
+ translation. */
+ int charset_base = 0;
int boyer_moore_ok = 1;
/* MULTIBYTE says whether the text to be searched is multibyte.
base_pat = raw_pattern;
if (multibyte)
{
+ /* Fill patbuf by translated characters in STRING while
+ checking if we can use boyer-moore search. If TRT is
+ non-nil, we can use boyer-moore search only if TRT can be
+ represented by the byte array of 256 elements. For that,
+ all non-ASCII case-equivalents of all case-senstive
+ characters in STRING must belong to the same charset and
+ row. */
+
while (--len >= 0)
{
- unsigned char str[MAX_MULTIBYTE_LENGTH];
+ unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
int c, translated, inverse;
int in_charlen, charlen;
if (RE && *base_pat == '\\')
{
len--;
+ raw_pattern_size--;
len_byte--;
base_pat++;
}
c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
- /* Translate the character, if requested. */
- TRANSLATE (translated, trt, c);
- /* If translation changed the byte-length, go back
- to the original character. */
- charlen = CHAR_STRING (translated, str);
- if (in_charlen != charlen)
+ if (NILP (trt))
{
- translated = c;
- charlen = CHAR_STRING (c, str);
+ str = base_pat;
+ charlen = in_charlen;
}
-
- /* If we are searching for something strange,
- an invalid multibyte code, don't use boyer-moore. */
- if (! ASCII_BYTE_P (translated)
- && (charlen == 1 /* 8bit code */
- || charlen != in_charlen /* invalid multibyte code */
- ))
- boyer_moore_ok = 0;
-
- TRANSLATE (inverse, inverse_trt, c);
-
- /* Did this char actually get translated?
- Would any other char get translated into it? */
- if (translated != c || inverse != c)
+ else
{
- /* Keep track of which character set row
- contains the characters that need translation. */
- int charset_base_code = c & ~CHAR_FIELD3_MASK;
- int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK;
-
- if (charset_base_code != inverse_charset_base)
- boyer_moore_ok = 0;
- else if (charset_base == -1)
- charset_base = charset_base_code;
- else if (charset_base != charset_base_code)
- /* If two different rows appear, needing translation,
- then we cannot use boyer_moore search. */
- boyer_moore_ok = 0;
+ /* Translate the character. */
+ TRANSLATE (translated, trt, c);
+ charlen = CHAR_STRING (translated, str_base);
+ str = str_base;
+
+ /* Check if C has any other case-equivalents. */
+ TRANSLATE (inverse, inverse_trt, c);
+ /* If so, check if we can use boyer-moore. */
+ if (c != inverse && boyer_moore_ok)
+ {
+ /* Check if all equivalents belong to the same
+ charset & row. Note that the check of C
+ itself is done by the last iteration. Note
+ also that we don't have to check ASCII
+ characters because boyer-moore search can
+ always handle their translation. */
+ while (1)
+ {
+ if (! ASCII_BYTE_P (inverse))
+ {
+ if (SINGLE_BYTE_CHAR_P (inverse))
+ {
+ /* Boyer-moore search can't handle a
+ translation of an eight-bit
+ character. */
+ boyer_moore_ok = 0;
+ break;
+ }
+ else if (charset_base == 0)
+ charset_base = inverse & ~CHAR_FIELD3_MASK;
+ else if ((inverse & ~CHAR_FIELD3_MASK)
+ != charset_base)
+ {
+ boyer_moore_ok = 0;
+ break;
+ }
+ }
+ if (c == inverse)
+ break;
+ TRANSLATE (inverse, inverse_trt, inverse);
+ }
+ }
}
/* Store this character into the translated pattern. */
if (RE && *base_pat == '\\')
{
len--;
+ raw_pattern_size--;
base_pat++;
}
c = *base_pat++;
return n;
}
\f
-/* Do Boyer-Moore search N times for the string PAT,
+/* Do Boyer-Moore search N times for the string BASE_PAT,
whose length is LEN/LEN_BYTE,
from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
DIRECTION says which direction we search in.
TRT and INVERSE_TRT are translation tables.
+ Characters in PAT are already translated by TRT.
- This kind of search works if all the characters in PAT that have
- nontrivial translation are the same aside from the last byte. This
- makes it possible to translate just the last byte of a character,
- and do so after just a simple test of the context.
+ This kind of search works if all the characters in BASE_PAT that
+ have nontrivial translation are the same aside from the last byte.
+ This makes it possible to translate just the last byte of a
+ character, and do so after just a simple test of the context.
+ CHARSET_BASE is nonzero iff there is such a non-ASCII character.
If that criterion is not satisfied, do not call this function. */
int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
unsigned char simple_translate[0400];
- int translate_prev_byte = 0;
- int translate_anteprev_byte = 0;
+ /* These are set to the preceding bytes of a byte to be translated
+ if charset_base is nonzero. As the maximum byte length of a
+ multibyte character is 4, we have to check at most three previous
+ bytes. */
+ int translate_prev_byte1 = 0;
+ int translate_prev_byte2 = 0;
+ int translate_prev_byte3 = 0;
#ifdef C_ALLOCA
int BM_tab_space[0400];
for (i = 0; i < 0400; i++)
simple_translate[i] = i;
+ if (charset_base)
+ {
+ /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a
+ byte following them are the target of translation. */
+ int sample_char = charset_base | 0x20;
+ unsigned char str[MAX_MULTIBYTE_LENGTH];
+ int len = CHAR_STRING (sample_char, str);
+
+ translate_prev_byte1 = str[len - 2];
+ if (len > 2)
+ {
+ translate_prev_byte2 = str[len - 3];
+ if (len > 3)
+ translate_prev_byte3 = str[len - 4];
+ }
+ }
+
i = 0;
while (i != infinity)
{
i = infinity;
if (! NILP (trt))
{
- int ch;
- int untranslated;
- int this_translated = 1;
-
- if (multibyte
- /* Is *PTR the last byte of a character? */
- && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1])))
+ /* If the byte currently looking at is a head of a character
+ to check case-equivalents, set CH to that character. An
+ ASCII character and a non-ASCII character matching with
+ CHARSET_BASE are to be checked. */
+ int ch = -1;
+
+ if (ASCII_BYTE_P (*ptr) || ! multibyte)
+ ch = *ptr;
+ else if (charset_base && CHAR_HEAD_P (*ptr))
{
- unsigned char *charstart = ptr;
- while (! CHAR_HEAD_P (*charstart))
- charstart--;
- untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
- if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
- {
- TRANSLATE (ch, trt, untranslated);
- if (! CHAR_HEAD_P (*ptr))
- {
- translate_prev_byte = ptr[-1];
- if (! CHAR_HEAD_P (translate_prev_byte))
- translate_anteprev_byte = ptr[-2];
- }
- }
- else
- {
- this_translated = 0;
- ch = *ptr;
- }
+ ch = STRING_CHAR (ptr, pat_end - ptr);
+ if (charset_base != (ch & ~CHAR_FIELD3_MASK))
+ ch = -1;
}
- else if (!multibyte)
- TRANSLATE (ch, trt, *ptr);
- else
- {
- ch = *ptr;
- this_translated = 0;
- }
-
- if (ch > 0400)
- j = ((unsigned char) ch) | 0200;
- else
- j = (unsigned char) ch;
+ j = *ptr;
if (i == infinity)
stride_for_teases = BM_tab[j];
BM_tab[j] = dirlen - i;
/* A translation table is accompanied by its inverse -- see */
/* comment following downcase_table for details */
- if (this_translated)
+ if (ch >= 0)
{
int starting_ch = ch;
- int starting_j = j;
+ int starting_j;
+
+ if (ch > 0400)
+ starting_j = ((unsigned char) ch) | 0200;
+ else
+ starting_j = (unsigned char) ch;
while (1)
{
TRANSLATE (ch, inverse_trt, ch);
|| ((cursor == tail_end_ptr
|| CHAR_HEAD_P (cursor[1]))
&& (CHAR_HEAD_P (cursor[0])
- || (translate_prev_byte == cursor[-1]
- && (CHAR_HEAD_P (translate_prev_byte)
- || translate_anteprev_byte == cursor[-2])))))
+ /* Check if this is the last byte of
+ a translable character. */
+ || (translate_prev_byte1 == cursor[-1]
+ && (CHAR_HEAD_P (translate_prev_byte1)
+ || (translate_prev_byte2 == cursor[-2]
+ && (CHAR_HEAD_P (translate_prev_byte2)
+ || (translate_prev_byte3 == cursor[-3]))))))))
ch = simple_translate[*cursor];
else
ch = *cursor;
|| ((ptr == tail_end_ptr
|| CHAR_HEAD_P (ptr[1]))
&& (CHAR_HEAD_P (ptr[0])
- || (translate_prev_byte == ptr[-1]
- && (CHAR_HEAD_P (translate_prev_byte)
- || translate_anteprev_byte == ptr[-2])))))
+ /* Check if this is the last byte of a
+ translable character. */
+ || (translate_prev_byte1 == ptr[-1]
+ && (CHAR_HEAD_P (translate_prev_byte1)
+ || (translate_prev_byte2 == ptr[-2]
+ && (CHAR_HEAD_P (translate_prev_byte2)
+ || translate_prev_byte3 == ptr[-3])))))))
ch = simple_translate[*ptr];
else
ch = *ptr;