static bool at_begline_loc_p (re_char *pattern, re_char *p);
static bool at_endline_loc_p (re_char *p, re_char *pend);
static re_char *skip_one_char (re_char *p);
-static bool analyze_first (re_char *p, re_char *pend,
- char *fastmap, bool multibyte);
+static bool analyze_first (struct re_pattern_buffer *bufp,
+ re_char *p, re_char *pend, char *fastmap);
/* Fetch the next character in the uncompiled pattern, with no
translation. */
ptrdiff_t startoffset = 0;
re_opcode_t ofj =
/* Check if the loop can match the empty string. */
- (simple || !analyze_first (laststart, b, NULL, false))
+ (simple || !analyze_first (bufp, laststart, b, NULL))
? on_failure_jump : on_failure_jump_loop;
eassert (skip_one_char (laststart) <= b);
GET_BUFFER_SPACE (7); /* We might use less. */
if (many_times_ok)
{
- bool emptyp = analyze_first (laststart, b, NULL, false);
+ bool emptyp = analyze_first (bufp, laststart, b, NULL);
/* The non-greedy multiple match looks like
a repeat..until: we only need a conditional jump
return false;
}
\f
-/* analyze_first.
+/* Iterate through all the char-matching operations directly reachable from P.
+ This is the inner loop of `forall_firstchar`, which see.
+ LOOP_BEG..LOOP_END delimit the currentl "block" of code (we assume
+ the code is made of syntactically nested loops).
+ LOOP_END is blindly assumed to be "safe".
+ To guarantee termination, at each iteration, either LOOP_BEG should
+ get bigger, or it should stay the same and P should get bigger. */
+static bool
+forall_firstchar_1 (re_char *p, re_char *pend,
+ re_char *loop_beg, re_char *loop_end,
+ bool f (const re_char *p, void *arg), void *arg)
+{
+ eassert (p >= loop_beg);
+ eassert (p <= loop_end);
+
+ while (true)
+ {
+ re_char *newp1, *newp2, *tmp;
+ re_char *p_orig = p;
+
+ if (p == pend)
+ return false;
+ else if (p == loop_end)
+ return true;
+ else if (p > loop_end)
+ {
+#if ENABLE_CHECKING
+ fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption1!!\n");
+#endif
+ return false; /* FIXME: Broken assumption about the code shape. */
+ }
+ else
+ switch (*p)
+ {
+ /* Cases which stop the iteration. */
+ case succeed:
+ case exactn:
+ case charset:
+ case charset_not:
+ case anychar:
+ case syntaxspec:
+ case notsyntaxspec:
+ case categoryspec:
+ case notcategoryspec:
+ return f (p, arg);
+
+ /* Cases which may match the empty string. */
+ case at_dot:
+ case begbuf:
+ case no_op:
+ case wordbound:
+ case notwordbound:
+ case begline:
+ p++;
+ continue;
+
+ /* Cases which may match the empty string and may
+ tell us something about the next char. */
+ case endline:
+ case endbuf:
+ case wordbeg:
+ case wordend:
+ case symbeg:
+ case symend:
+ if (f (p, arg))
+ return true;
+ p++;
+ continue;
+
+ case jump:
+ case jump_n:
+ newp1 = extract_address (p + 1);
+ if (newp1 > p)
+ { /* Forward jump, boring. */
+ p = newp1;
+ continue;
+ }
+ switch (*newp1)
+ {
+ case on_failure_jump:
+ case on_failure_keep_string_jump:
+ case on_failure_jump_nastyloop:
+ case on_failure_jump_loop:
+ case on_failure_jump_smart:
+ case succeed_n:
+ newp2 = extract_address (newp1 + 1);
+ goto do_twoway_jump;
+ default:
+ newp2 = loop_end; /* "Safe" choice. */
+ goto do_jump;
+ }
+
+ case on_failure_jump:
+ case on_failure_keep_string_jump:
+ case on_failure_jump_nastyloop:
+ case on_failure_jump_loop:
+ case on_failure_jump_smart:
+ newp1 = extract_address (p + 1);
+ newp2 = p + 3;
+ /* For `+` loops, we often have an `on_failure_jump` that skips
+ forward over a subsequent `jump`. Recognize this pattern
+ since that subsequent `jump` is the one that jumps to the
+ loop-entry. */
+ newp2 = ((re_opcode_t) *newp2 == jump)
+ ? extract_address (newp2 + 1) : newp2;
+
+ do_twoway_jump:
+ /* We have to check that both destinations are safe.
+ Arrange for `newp1` to be the smaller of the two. */
+ if (newp1 > newp2)
+ (tmp = newp1, newp1 = newp2, newp2 = tmp);
+
+ if (newp2 <= p_orig) /* Both destinations go backward! */
+ {
+#if ENABLE_CHECKING
+ fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption2!!\n");
+#endif
+ return false;
+ }
+
+ if (!forall_firstchar_1 (newp2, pend, loop_beg, loop_end, f, arg))
+ return false;
+
+ do_jump:
+ eassert (newp2 <= loop_end);
+ if (newp1 <= p_orig)
+ {
+ if (newp1 < loop_beg)
+ {
+#if ENABLE_CHECKING
+ fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption3!!\n");
+#endif
+ return false;
+ }
+ else if (newp1 == loop_beg)
+ /* If we jump backward to the entry point of the current loop
+ it means it's a zero-length cycle through that loop;
+ this cycle itself does not break safety. */
+ return true;
+ else
+ /* We jump backward to a new loop, nested within the current
+ one. `newp1` is the entry point and `newp2` the exit of
+ that inner loop. */
+ /* `p` gets smaller, but termination is still ensured because
+ `loop_beg` gets bigger. */
+ (loop_beg = newp1, loop_end = newp2);
+ }
+ p = newp1;
+ continue;
+
+ case succeed_n:
+ newp1 = extract_address (p + 1);
+ newp2 = p + 5; /* Skip the two bytes containing the count. */
+ goto do_twoway_jump;
+
+ case set_number_at:
+ int offset = extract_number (p + 1);
+ DEBUG_STATEMENT (eassert (extract_number (p + 3)));
+ /* If we're setting the counter of an immediately following
+ `succeed_n`, then this next execution of `succeed_n` will do
+ nothing but decrement its counter and "fall through".
+ So we do the fall through here to avoid considering the
+ "on failure" part of the `succeed_n` which should only be
+ considered when coming from the `jump(_n)` at the end of
+ the loop. */
+ p += (offset == 5 && p[5] == succeed_n) ? 10 : 5;
+ continue;
+
+ case start_memory:
+ case stop_memory:
+ p += 2;
+ continue;
+
+ /* This could match the empty string, so we may need to continue,
+ but in most cases, this can match "anything", so we should
+ return `false` unless told otherwise. */
+ case duplicate:
+ if (!f (p, arg))
+ return false;
+ p += 2;
+ continue;
+
+ default:
+ abort (); /* We have listed all the cases. */
+ }
+ }
+}
+
+/* Iterate through all the char-matching operations directly reachable from P.
+ Return true if P is "safe", meaning that PEND cannot be reached directly
+ from P and all calls to F returned true.
+ Return false if PEND *may* be directly reachable from P or if one of
+ the calls to F returned false.
+ PEND can be NULL (and hence never reachable).
+
+ Call `F (POS, ARG)` for every POS directly reachable from P,
+ before reaching PEND, where POS is the position of a char-matching
+ operation (`exactn`, `charset`, ...).
+
+ For operations that match the empty string (`wordbeg`, ...), if F
+ returns true we stop going down that path immediately but if it returns
+ false we don't consider it as a failure and we simply look for the
+ next char-matching operations on that path.
+ For `duplicate`, it is the reverse: a false is an immediate failure
+ whereas a true just lets the analysis continue with the rest of the path.
+
+ This function can be used while building the bytecode (in which case
+ you should pass NULL for bufp), but if so, P and PEND need to delimit
+ a valid block such that there is not jump to a location outside
+ of [P...PEND]. */
+static bool
+forall_firstchar (struct re_pattern_buffer *bufp, re_char *p, re_char *pend,
+ bool f (re_char *p, void *arg), void *arg)
+{
+ eassert (!bufp || bufp->used);
+ eassert (pend || bufp->used);
+ return forall_firstchar_1 (p, pend,
+ bufp ? bufp->buffer - 1 : p,
+ bufp ? bufp->buffer + bufp->used + 1 : pend,
+ f, arg);
+}
+
+/* analyze_first_old.
If fastmap is non-NULL, go through the pattern and fill fastmap
with all the possible leading chars. If fastmap is NULL, don't
bother filling it up (obviously) and only return whether the
or if fastmap was not updated accurately. */
static bool
-analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
+analyze_first_old (re_char *p, re_char *pend, char *fastmap, bool multibyte)
{
int j, k;
int nbits;
{ /* We have to look down both arms.
We first go down the "straight" path so as to minimize
stack usage when going through alternatives. */
- bool r = analyze_first (p, pend, fastmap, multibyte);
+ bool r = analyze_first_old (p, pend, fastmap, multibyte);
if (r) return r;
p += j;
}
/* We reached the end without matching anything. */
return true;
-} /* analyze_first */
+} /* analyze_first_old */
+
+struct anafirst_data {
+ bool multibyte;
+ char *fastmap;
+ bool match_any_multibyte_characters;
+};
+
+static bool
+analyze_first_fastmap (const re_char *p, void *arg)
+{
+ struct anafirst_data *data = arg;
+
+ int j, k;
+ int nbits;
+ bool not;
+
+ switch (*p)
+ {
+ case succeed:
+ return false;
+
+ case duplicate:
+ /* If the first character has to match a backreference, that means
+ that the group was empty (since it already matched). Since this
+ is the only case that interests us here, we can assume that the
+ backreference must match the empty string and we need to
+ build the fastmap from the rest of the path. */
+ return true;
+
+ /* Following are the cases which match a character. These end
+ with 'break'. */
+
+ case exactn:
+ p++;
+ /* If multibyte is nonzero, the first byte of each
+ character is an ASCII or a leading code. Otherwise,
+ each byte is a character. Thus, this works in both
+ cases. */
+ data->fastmap[p[1]] = 1;
+ if (data->multibyte)
+ {
+ /* Cover the case of matching a raw char in a
+ multibyte regexp against unibyte. */
+ if (CHAR_BYTE8_HEAD_P (p[1]))
+ data->fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
+ }
+ else
+ {
+ /* For the case of matching this unibyte regex
+ against multibyte, we must set a leading code of
+ the corresponding multibyte character. */
+ int c = RE_CHAR_TO_MULTIBYTE (p[1]);
+
+ data->fastmap[CHAR_LEADING_CODE (c)] = 1;
+ }
+ return true;
+
+ case anychar:
+ /* We could put all the chars except for \n (and maybe \0)
+ but we don't bother since it is generally not worth it. */
+ return false;
+
+ case charset_not:
+ {
+ /* Chars beyond end of bitmap are possible matches. */
+ for (j = CHARSET_BITMAP_SIZE (p) * BYTEWIDTH;
+ j < (1 << BYTEWIDTH); j++)
+ data->fastmap[j] = 1;
+ }
+ FALLTHROUGH;
+ case charset:
+ not = (re_opcode_t) *(p) == charset_not;
+ nbits = CHARSET_BITMAP_SIZE (p) * BYTEWIDTH;
+ p += 2;
+ for (j = 0; j < nbits; j++)
+ if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
+ data->fastmap[j] = 1;
+
+ /* To match raw bytes (in the 80..ff range) against multibyte
+ strings, add their leading bytes to the fastmap. */
+ for (j = 0x80; j < nbits; j++)
+ if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
+ data->fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
+
+ if (/* Any leading code can possibly start a character
+ which doesn't match the specified set of characters. */
+ not
+ ||
+ /* If we can match a character class, we can match any
+ multibyte characters. */
+ (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
+ && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
+
+ {
+ if (!data->match_any_multibyte_characters)
+ {
+ for (j = MIN_MULTIBYTE_LEADING_CODE;
+ j <= MAX_MULTIBYTE_LEADING_CODE; j++)
+ data->fastmap[j] = 1;
+ data->match_any_multibyte_characters = true;
+ }
+ }
+
+ else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
+ && data->match_any_multibyte_characters == false)
+ {
+ /* Set fastmap[I] to 1 where I is a leading code of each
+ multibyte character in the range table. */
+ int c, count;
+ unsigned char lc1, lc2;
+
+ /* Make P points the range table. '+ 2' is to skip flag
+ bits for a character class. */
+ p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
+
+ /* Extract the number of ranges in range table into COUNT. */
+ EXTRACT_NUMBER_AND_INCR (count, p);
+ for (; count > 0; count--, p += 3)
+ {
+ /* Extract the start and end of each range. */
+ EXTRACT_CHARACTER (c, p);
+ lc1 = CHAR_LEADING_CODE (c);
+ p += 3;
+ EXTRACT_CHARACTER (c, p);
+ lc2 = CHAR_LEADING_CODE (c);
+ for (j = lc1; j <= lc2; j++)
+ data->fastmap[j] = 1;
+ }
+ }
+ return true;
+
+ case syntaxspec:
+ case notsyntaxspec:
+ /* This match depends on text properties. These end with
+ aborting optimizations. */
+ return false;
+
+ case categoryspec:
+ case notcategoryspec:
+ not = (re_opcode_t)p[0] == notcategoryspec;
+ p++;
+ k = *p++;
+ for (j = (1 << BYTEWIDTH); j >= 0; j--)
+ if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
+ data->fastmap[j] = 1;
+
+ /* Any leading code can possibly start a character which
+ has or doesn't has the_malloc_fn specified category. */
+ if (!data->match_any_multibyte_characters)
+ {
+ for (j = MIN_MULTIBYTE_LEADING_CODE;
+ j <= MAX_MULTIBYTE_LEADING_CODE; j++)
+ data->fastmap[j] = 1;
+ data->match_any_multibyte_characters = true;
+ }
+ return true;
+
+ case endline:
+ case endbuf:
+ case wordbeg:
+ case wordend:
+ case symbeg:
+ case symend:
+ /* This false doesn't mean failure but rather "not succeeded yet". */
+ return false;
+
+ default:
+#if ENABLE_CHECKING
+ abort (); /* We have listed all the cases. */
+#endif
+ return false;
+ }
+}
+
+static bool
+analyze_first_null (const re_char *p, void *arg)
+{
+ switch (*p)
+ {
+ case succeed:
+ /* This is safe: we can't reach `pend` at all from here. */
+ return true;
+
+ case duplicate:
+ /* Either `duplicate` ends up matching a non-empty string, in which
+ case we're good, or it matches the empty string, in which case we
+ need to continue checking the rest of this path, which is exactly
+ what returning `true` does, here. */
+ return true;
+
+ case exactn:
+ case anychar:
+ case charset_not:
+ case charset:
+ case syntaxspec:
+ case notsyntaxspec:
+ case categoryspec:
+ case notcategoryspec:
+ return true;
+
+ case endline:
+ case endbuf:
+ case wordbeg:
+ case wordend:
+ case symbeg:
+ case symend:
+ /* This false doesn't mean failure but rather "not succeeded yet". */
+ return false;
+
+ default:
+#if ENABLE_CHECKING
+ abort (); /* We have listed all the cases. */
+#endif
+ return false;
+ }
+}
+
+/* analyze_first.
+ If fastmap is non-NULL, go through the pattern and fill fastmap
+ with all the possible leading chars. If fastmap is NULL, don't
+ bother filling it up (obviously) and only return whether the
+ pattern could potentially match the empty string.
+
+ Return false if p matches at least one char before reaching pend.
+ Return true if p..pend might match the empty string
+ or if fastmap was not updated accurately. */
+
+static bool
+analyze_first (struct re_pattern_buffer *bufp,
+ re_char *p, re_char *pend, char *fastmap)
+{
+ eassert (pend);
+ struct anafirst_data data = { bufp ? bufp->multibyte : false,
+ fastmap, false };
+ bool safe = forall_firstchar (bufp->used ? bufp : NULL, p, pend,
+ fastmap ? analyze_first_fastmap
+ : analyze_first_null,
+ &data);
+#if ENABLE_CHECKING
+ bool old = !!analyze_first_old (p, pend, fastmap, data.multibyte);
+ if (old && safe)
+ {
+ fprintf (stderr, "ANALYZE_FIRST: New optimization (fastmap=%d)!\n",
+ fastmap ? 1 : 0);
+ print_partial_compiled_pattern (stderr, p, pend);
+ }
+ else if (!old && !safe)
+ {
+ fprintf (stderr, "ANALYZE_FIRST: Lost an optimization (fastmap=%d)!\n",
+ fastmap ? 1 : 0);
+ print_partial_compiled_pattern (stderr, p, pend);
+ }
+#endif
+ return !safe;
+}
+
\f
/* Compute a fastmap for the compiled pattern in BUFP.
A fastmap records which of the (1 << BYTEWIDTH) possible
/* FIXME: Is the following assignment correct even when ANALYSIS < 0? */
bufp->fastmap_accurate = 1; /* It will be when we're done. */
- bufp->can_be_null = analyze_first (bufp->buffer, bufp->buffer + bufp->used,
- fastmap, RE_MULTIBYTE_P (bufp));
+ bufp->can_be_null = analyze_first (bufp, bufp->buffer,
+ bufp->buffer + bufp->used, fastmap);
} /* re_compile_fastmap */
\f
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
return false;
}
+struct mutexcl_data {
+ struct re_pattern_buffer *bufp;
+ re_char *p1;
+};
+
+static bool
+mutually_exclusive_one (re_char *p2, void *arg)
+{
+ struct mutexcl_data *data = arg;
+ switch (*p2)
+ {
+ case endline:
+ case exactn:
+ return mutually_exclusive_exactn (data->bufp, data->p1, p2);
+ case charset:
+ {
+ if (*data->p1 == exactn)
+ return mutually_exclusive_exactn (data->bufp, p2, data->p1);
+ else
+ return mutually_exclusive_charset (data->bufp, data->p1, p2);
+ }
+
+ case charset_not:
+ switch (*data->p1)
+ {
+ case exactn:
+ return mutually_exclusive_exactn (data->bufp, p2, data->p1);
+ case charset:
+ return mutually_exclusive_charset (data->bufp, p2, data->p1);
+ case charset_not:
+ /* When we have two charset_not, it's very unlikely that
+ they don't overlap. The union of the two sets of excluded
+ chars should cover all possible chars, which, as a matter of
+ fact, is virtually impossible in multibyte buffers. */
+ return false;
+ }
+ return false;
+ case anychar:
+ return false; /* FIXME: exactn \n ? */
+ case syntaxspec:
+ return (*data->p1 == notsyntaxspec && data->p1[1] == p2[1]);
+ case notsyntaxspec:
+ return (*data->p1 == syntaxspec && data->p1[1] == p2[1]);
+ case categoryspec:
+ return (*data->p1 == notcategoryspec && data->p1[1] == p2[1]);
+ case notcategoryspec:
+ return (*data->p1 == categoryspec && data->p1[1] == p2[1]);
+
+ case endbuf:
+ case succeed:
+ return true;
+ case wordbeg:
+ return (*data->p1 == notsyntaxspec && data->p1[1] == Sword);
+ case wordend:
+ return (*data->p1 == syntaxspec && data->p1[1] == Sword);
+ case symbeg:
+ return (*data->p1 == notsyntaxspec
+ && (data->p1[1] == Ssymbol || data->p1[1] == Sword));
+ case symend:
+ return (*data->p1 == syntaxspec
+ && (data->p1[1] == Ssymbol || data->p1[1] == Sword));
+
+ case duplicate:
+ /* At this point, we know nothing about what this can match, sadly. */
+ return false;
+
+ default:
+#if ENABLE_CHECKING
+ abort (); /* We have listed all the cases. */
+#endif
+ return false;
+ }
+}
+
static bool
mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
re_char *p2)
{
- return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL);
+ struct mutexcl_data data = { bufp, p1 };
+ bool new = forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
+#if ENABLE_CHECKING
+ bool old = mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL);
+ if (old && !new)
+ {
+ fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: Lost an optimization between %d and %d!\n",
+ (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
+ print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
+ }
+ else if (!old && new)
+ {
+ fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: New optimization between %d and %d!\n",
+ (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
+ print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
+ }
+#endif
+ return new;
}
\f
/* Matching routines. */