f, arg);
}
-/* analyze_first_old.
- If fastmap is non-NULL, go through the pattern and fill fastmap
- with all the possible leading chars. If fastmap is NULL, don't
- bother filling it up (obviously) and only return whether the
- pattern could potentially match the empty string.
-
- Return false if p..pend matches at least one char.
- Return true if p..pend might match the empty string
- or if fastmap was not updated accurately. */
-
-static bool
-analyze_first_old (re_char *p, re_char *pend, char *fastmap, bool multibyte)
-{
- int j, k;
- int nbits;
- bool not;
-
- /* If all elements for base leading-codes in fastmap is set, this
- flag is set true. */
- bool match_any_multibyte_characters = false;
-
- eassert (p);
-
- /* The loop below works as follows:
- - It has a working-list kept in the PATTERN_STACK and which basically
- starts by only containing a pointer to the first operation.
- - If the opcode we're looking at is a match against some set of
- chars, then we add those chars to the fastmap and go on to the
- next work element from the worklist (done via 'break').
- - If the opcode is a control operator on the other hand, we either
- ignore it (if it's meaningless at this point, such as 'start_memory')
- or execute it (if it's a jump). If the jump has several destinations
- (i.e. 'on_failure_jump'), then we push the other destination onto the
- worklist.
- We guarantee termination by ignoring backward jumps (more or less),
- so that P is monotonically increasing. More to the point, we
- never set P (or push) anything '<= p1'. */
-
- while (p < pend)
- {
- /* P1 is used as a marker of how far back a 'on_failure_jump'
- can go without being ignored. It is normally equal to P
- (which prevents any backward 'on_failure_jump') except right
- after a plain 'jump', to allow patterns such as:
- 0: jump 10
- 3..9: <body>
- 10: on_failure_jump 3
- as used for the *? operator. */
- re_char *p1 = p;
-
- switch (*p++)
- {
- case succeed:
- return true;
-
- case duplicate:
- /* If the first character has to match a backreference, that means
- that the group was empty (since it already matched). Since this
- is the only case that interests us here, we can assume that the
- backreference must match the empty string. */
- /* Note: Only true if we started from bufp->buffer (e.g. when
- fastmap is non-NULL), but when fastmap is NULL we're only
- trying to figure out if the body of a loop can possibly match
- the empty string so it's safe (tho sometimes pessimistic)
- to presume that `duplicate` can. */
- p++;
- continue;
-
-
- /* Following are the cases which match a character. These end
- with 'break'. */
-
- case exactn:
- if (fastmap)
- {
- /* If multibyte is nonzero, the first byte of each
- character is an ASCII or a leading code. Otherwise,
- each byte is a character. Thus, this works in both
- cases. */
- fastmap[p[1]] = 1;
- if (multibyte)
- {
- /* Cover the case of matching a raw char in a
- multibyte regexp against unibyte. */
- if (CHAR_BYTE8_HEAD_P (p[1]))
- fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
- }
- else
- {
- /* For the case of matching this unibyte regex
- against multibyte, we must set a leading code of
- the corresponding multibyte character. */
- int c = RE_CHAR_TO_MULTIBYTE (p[1]);
-
- fastmap[CHAR_LEADING_CODE (c)] = 1;
- }
- }
- break;
-
-
- case anychar:
- /* We could put all the chars except for \n (and maybe \0)
- but we don't bother since it is generally not worth it. */
- if (!fastmap) break;
- return true;
-
-
- case charset_not:
- if (!fastmap) break;
- {
- /* Chars beyond end of bitmap are possible matches. */
- for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
- j < (1 << BYTEWIDTH); j++)
- fastmap[j] = 1;
- }
- FALLTHROUGH;
- case charset:
- if (!fastmap) break;
- not = (re_opcode_t) *(p - 1) == charset_not;
- nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
- p++;
- for (j = 0; j < nbits; j++)
- if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
- fastmap[j] = 1;
-
- /* To match raw bytes (in the 80..ff range) against multibyte
- strings, add their leading bytes to the fastmap. */
- for (j = 0x80; j < nbits; j++)
- if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
- fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
-
- if (/* Any leading code can possibly start a character
- which doesn't match the specified set of characters. */
- not
- ||
- /* If we can match a character class, we can match any
- multibyte characters. */
- (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
- && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
-
- {
- if (match_any_multibyte_characters == false)
- {
- for (j = MIN_MULTIBYTE_LEADING_CODE;
- j <= MAX_MULTIBYTE_LEADING_CODE; j++)
- fastmap[j] = 1;
- match_any_multibyte_characters = true;
- }
- }
-
- else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
- && match_any_multibyte_characters == false)
- {
- /* Set fastmap[I] to 1 where I is a leading code of each
- multibyte character in the range table. */
- int c, count;
- unsigned char lc1, lc2;
-
- /* Make P points the range table. '+ 2' is to skip flag
- bits for a character class. */
- p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
-
- /* Extract the number of ranges in range table into COUNT. */
- EXTRACT_NUMBER_AND_INCR (count, p);
- for (; count > 0; count--, p += 3)
- {
- /* Extract the start and end of each range. */
- EXTRACT_CHARACTER (c, p);
- lc1 = CHAR_LEADING_CODE (c);
- p += 3;
- EXTRACT_CHARACTER (c, p);
- lc2 = CHAR_LEADING_CODE (c);
- for (j = lc1; j <= lc2; j++)
- fastmap[j] = 1;
- }
- }
- break;
-
- case syntaxspec:
- case notsyntaxspec:
- if (!fastmap) break;
- /* This match depends on text properties. These end with
- aborting optimizations. */
- return true;
-
- case categoryspec:
- case notcategoryspec:
- if (!fastmap) break;
- not = (re_opcode_t)p[-1] == notcategoryspec;
- k = *p++;
- for (j = (1 << BYTEWIDTH); j >= 0; j--)
- if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
- fastmap[j] = 1;
-
- /* Any leading code can possibly start a character which
- has or doesn't has the specified category. */
- if (match_any_multibyte_characters == false)
- {
- for (j = MIN_MULTIBYTE_LEADING_CODE;
- j <= MAX_MULTIBYTE_LEADING_CODE; j++)
- fastmap[j] = 1;
- match_any_multibyte_characters = true;
- }
- break;
-
- /* All cases after this match the empty string. These end with
- 'continue'. */
-
- case at_dot:
- case no_op:
- case begline:
- case endline:
- case begbuf:
- case endbuf:
- case wordbound:
- case notwordbound:
- case wordbeg:
- case wordend:
- case symbeg:
- case symend:
- continue;
-
-
- case jump:
- EXTRACT_NUMBER_AND_INCR (j, p);
- if (j < 0)
- /* Backward jumps can only go back to code that we've already
- visited. 're_compile' should make sure this is true. */
- break;
- p += j;
- switch (*p)
- {
- case on_failure_jump:
- case on_failure_keep_string_jump:
- case on_failure_jump_loop:
- case on_failure_jump_nastyloop:
- case on_failure_jump_smart:
- p++;
- break;
- default:
- continue;
- };
- /* Keep P1 to allow the 'on_failure_jump' we are jumping to
- to jump back to "just after here". */
- FALLTHROUGH;
- case on_failure_jump:
- case on_failure_keep_string_jump:
- case on_failure_jump_nastyloop:
- case on_failure_jump_loop:
- case on_failure_jump_smart:
- EXTRACT_NUMBER_AND_INCR (j, p);
- if (p + j <= p1)
- ; /* Backward jump to be ignored. */
- else
- { /* We have to look down both arms.
- We first go down the "straight" path so as to minimize
- stack usage when going through alternatives. */
- bool r = analyze_first_old (p, pend, fastmap, multibyte);
- if (r) return r;
- p += j;
- }
- continue;
-
-
- case jump_n:
- /* This code simply does not properly handle forward jump_n. */
- DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0));
- p += 4;
- /* jump_n can either jump or fall through. The (backward) jump
- case has already been handled, so we only need to look at the
- fallthrough case. */
- continue;
-
- case succeed_n:
- /* If N == 0, it should be an on_failure_jump_loop instead.
- `j` can be negative because `EXTRACT_NUMBER` extracts a
- signed number whereas `succeed_n` treats it as unsigned. */
- DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
- p += 4;
- /* We only care about one iteration of the loop, so we don't
- need to consider the case where this behaves like an
- on_failure_jump. */
- /* FIXME: Sadly, the above is not true when the loop's body
- can match the empty string :-( */
- /* continue; */
- return true;
-
- case set_number_at:
- p += 4;
- continue;
-
-
- case start_memory:
- case stop_memory:
- p += 1;
- continue;
-
-
- default:
- abort (); /* We have listed all the cases. */
- } /* switch *p++ */
-
- /* Getting here means we have found the possible starting
- characters for one path of the pattern -- and that the empty
- string does not match. We need not follow this path further. */
- return false;
- } /* while p */
-
- /* We reached the end without matching anything. */
- return true;
-
-} /* analyze_first_old */
-
struct anafirst_data {
bool multibyte;
char *fastmap;
fastmap ? analyze_first_fastmap
: analyze_first_null,
&data);
-#if ENABLE_CHECKING
- bool old = !!analyze_first_old (p, pend, fastmap, data.multibyte);
- if (old && safe)
- {
- fprintf (stderr, "ANALYZE_FIRST: New optimization (fastmap=%d)!\n",
- fastmap ? 1 : 0);
- print_partial_compiled_pattern (stderr, p, pend);
- }
- else if (!old && !safe)
- {
- fprintf (stderr, "ANALYZE_FIRST: Lost an optimization (fastmap=%d)!\n",
- fastmap ? 1 : 0);
- print_partial_compiled_pattern (stderr, p, pend);
- }
-#endif
return !safe;
}
}
-/* Jump over non-matching operations. */
-static re_char *
-skip_noops (re_char *p, re_char *pend)
-{
- while (p < pend)
- {
- switch (*p)
- {
- case start_memory:
- case stop_memory:
- p += 2; break;
- case no_op:
- p += 1; break;
- case jump:
- p = extract_address (p + 1);
- break;
- default:
- return p;
- }
- }
- eassert (p == pend);
- return p;
-}
-
/* Test if C matches charset op. *PP points to the charset or charset_not
opcode. When the function finishes, *PP will be advanced past that opcode.
C is character to test (possibly after translations) and CORIG is original
return false;
}
-/* True if "p1 matches something" implies "p2 fails". */
-/* We're trying to follow all paths reachable from `p2`, but since some
- loops can match the empty string, this can loop back to `p2`.
-
- To avoid inf-looping, we take advantage of the fact that
- the bytecode we generate is made of syntactically nested loops, more
- specifically, every loop has a single entry point and single exit point.
-
- The function takes 2 more arguments (`loop_entry` and `loop_exit`).
- `loop_entry` points to the sole entry point of the current loop and
- `loop_exit` points to its sole exit point (when non-NULL).
-
- Jumps outside of `loop_entry..exit` should not occur.
- The function can assume that `loop_exit` is "mutually exclusive".
- The same holds for `loop_entry` except when `p2 == loop_entry`.
-
- To guarantee termination, recursive calls should make sure that either
- `loop_entry` is larger, or it's unchanged but `p2` is larger.
-
- FIXME: This is failsafe (can't return true when it shouldn't)
- but it could be too conservative if we start generating bytecode
- with a different shape, so maybe we should bite the bullet and
- replace done_beg/end with an actual list of positions we've
- already processed. */
-static bool
-mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
- re_char *p2, re_char *loop_entry, re_char *loop_exit)
-{
- re_opcode_t op2;
- unsigned char *pend = bufp->buffer + bufp->used;
- re_char *p2_orig = p2;
-
- eassert (p1 >= bufp->buffer && p1 < pend
- && p2 >= bufp->buffer && p2 <= pend);
-
- if (p2 == loop_exit)
- return true; /* Presumably already checked elsewhere. */
- eassert (loop_entry && p2 >= loop_entry);
- if (p2 < loop_entry || (loop_exit && p2 > loop_exit))
- { /* The assumptions about the shape of the code aren't true :-( */
-#ifdef ENABLE_CHECKING
- error ("Broken assumption in regex.c:mutually_exclusive_aux");
-#endif
- return false;
- }
-
- /* Skip over open/close-group commands.
- If what follows this loop is a ...+ construct,
- look at what begins its body, since we will have to
- match at least one of that. */
- p2 = skip_noops (p2, pend);
- /* The same skip can be done for p1, except that this function
- is only used in the case where p1 is a simple match operator. */
- /* p1 = skip_noops (p1, pend); */
-
- eassert (p1 >= bufp->buffer && p1 < pend
- && p2 >= bufp->buffer && p2 <= pend);
-
- op2 = p2 == pend ? succeed : *p2;
-
- switch (op2)
- {
- case succeed:
- case endbuf:
- /* If we're at the end of the pattern, we can change. */
- if (skip_one_char (p1))
- {
- DEBUG_PRINT (" End of pattern: fast loop.\n");
- return true;
- }
- break;
-
- case endline:
- case exactn:
- return mutually_exclusive_exactn (bufp, p1, p2);
-
- case charset:
- {
- if ((re_opcode_t) *p1 == exactn)
- return mutually_exclusive_exactn (bufp, p2, p1);
- else
- return mutually_exclusive_charset (bufp, p1, p2);
- }
- break;
-
- case charset_not:
- switch (*p1)
- {
- case exactn:
- return mutually_exclusive_exactn (bufp, p2, p1);
- case charset:
- return mutually_exclusive_charset (bufp, p2, p1);
- case charset_not:
- /* When we have two charset_not, it's very unlikely that
- they don't overlap. The union of the two sets of excluded
- chars should cover all possible chars, which, as a matter of
- fact, is virtually impossible in multibyte buffers. */
- break;
- }
- break;
-
- case wordend:
- return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
- case symend:
- return ((re_opcode_t) *p1 == syntaxspec
- && (p1[1] == Ssymbol || p1[1] == Sword));
- case notsyntaxspec:
- return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
-
- case wordbeg:
- return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
- case symbeg:
- return ((re_opcode_t) *p1 == notsyntaxspec
- && (p1[1] == Ssymbol || p1[1] == Sword));
- case syntaxspec:
- return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
-
- case wordbound:
- /* FIXME: This optimization seems correct after the first iteration
- of the loop, but not for the very first :-(
- IOW we'd need to pull out the first iteration and do:
-
- syntaxspec w
- on_failure_keep_string_jump end
- loop:
- syntaxspec w
- goto loop
- end:
- wordbound
-
- return (((re_opcode_t) *p1 == notsyntaxspec
- || (re_opcode_t) *p1 == syntaxspec)
- && p1[1] == Sword); */
- return false;
-
- case categoryspec:
- return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
- case notcategoryspec:
- return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
-
- case on_failure_jump_nastyloop:
- case on_failure_jump_smart:
- case on_failure_jump_loop:
- case on_failure_keep_string_jump:
- case on_failure_jump:
- {
- int mcnt;
- p2++;
- EXTRACT_NUMBER_AND_INCR (mcnt, p2);
- re_char *p2_other = p2 + mcnt, *tmp;
- /* For `+` loops, we often have an `on_failure_jump` that skips forward
- over a subsequent `jump` for lack of an `on_failure_dont_jump`
- kind of thing. Recognize this pattern since that subsequent
- `jump` is the one that jumps to the loop-entry. */
- if ((re_opcode_t) p2[0] == jump && mcnt == 3)
- p2 = extract_address (p2 + 1);
-
- /* We have to check that both destinations are safe.
- Arrange for `p2` to be the smaller of the two. */
- if (p2 > p2_other)
- (tmp = p2, p2 = p2_other, p2_other = tmp);
-
- if (p2_other <= p2_orig /* Both destinations go backward! */
- || !mutually_exclusive_aux (bufp, p1, p2_other,
- loop_entry, loop_exit))
- return false;
-
- /* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive)
- position, let's check `p2`. */
- if (p2 == loop_entry)
- /* If we jump backward to the entry point of the current loop
- it means it's a zero-length cycle through that loop, so
- this cycle itself does not break mutual-exclusion. */
- return true;
- else if (p2 > p2_orig)
- /* Boring forward jump. */
- return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit);
- else if (loop_entry < p2 && p2 < p2_orig)
- /* We jump backward to a new loop, nested within the current one.
- `p2` is the entry point and `p2_other` the exit of that inner. */
- return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other);
- else
- return false;
- }
-
- default:
- ;
- }
-
- /* Safe default. */
- return false;
-}
-
struct mutexcl_data {
struct re_pattern_buffer *bufp;
re_char *p1;
}
}
+/* True if "p1 matches something" implies "p2 fails". */
+
static bool
mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
re_char *p2)
{
struct mutexcl_data data = { bufp, p1 };
- bool new = forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
-#if ENABLE_CHECKING
- bool old = mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL);
- if (old && !new)
- {
- fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: Lost an optimization between %d and %d!\n",
- (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
- print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
- }
- else if (!old && new)
- {
- fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: New optimization between %d and %d!\n",
- (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
- print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
- }
-#endif
- return new;
+ return forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
}
\f
/* Matching routines. */