regex.c: Remove the old analyzes functions

author Stefan Monnier <monnier@iro.umontreal.ca>

Fri, 29 Sep 2023 21:46:20 +0000 (17:46 -0400)

committer Stefan Monnier <monnier@iro.umontreal.ca>

Fri, 29 Sep 2023 21:46:20 +0000 (17:46 -0400)
author Stefan Monnier <monnier@iro.umontreal.ca>
Fri, 29 Sep 2023 21:46:20 +0000 (17:46 -0400)
committer Stefan Monnier <monnier@iro.umontreal.ca>
Fri, 29 Sep 2023 21:46:20 +0000 (17:46 -0400)
diff --git a/src/regex-emacs.c b/src/regex-emacs.c

index dbd5bcb89536c98a5d5771ddb4b28aa5d96ec3b2..79e9f2c105a93be13c922def05217ce89ba172e7 100644 (file)
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3044,319 +3044,6 @@ forall_firstchar (struct re_pattern_buffer *bufp, re_char *p, re_char *pend,
                               f, arg);
  }
  
-/* analyze_first_old.
-   If fastmap is non-NULL, go through the pattern and fill fastmap
-   with all the possible leading chars.  If fastmap is NULL, don't
-   bother filling it up (obviously) and only return whether the
-   pattern could potentially match the empty string.
-
-   Return false if p..pend matches at least one char.
-   Return true  if p..pend might match the empty string
-                or if fastmap was not updated accurately.  */
-
-static bool
-analyze_first_old (re_char *p, re_char *pend, char *fastmap, bool multibyte)
-{
-  int j, k;
-  int nbits;
-  bool not;
-
-  /* If all elements for base leading-codes in fastmap is set, this
-     flag is set true.  */
-  bool match_any_multibyte_characters = false;
-
-  eassert (p);
-
-  /* The loop below works as follows:
-     - It has a working-list kept in the PATTERN_STACK and which basically
-       starts by only containing a pointer to the first operation.
-     - If the opcode we're looking at is a match against some set of
-       chars, then we add those chars to the fastmap and go on to the
-       next work element from the worklist (done via 'break').
-     - If the opcode is a control operator on the other hand, we either
-       ignore it (if it's meaningless at this point, such as 'start_memory')
-       or execute it (if it's a jump).  If the jump has several destinations
-       (i.e. 'on_failure_jump'), then we push the other destination onto the
-       worklist.
-     We guarantee termination by ignoring backward jumps (more or less),
-     so that P is monotonically increasing.  More to the point, we
-     never set P (or push) anything '<= p1'.  */
-
-  while (p < pend)
-    {
-      /* P1 is used as a marker of how far back a 'on_failure_jump'
-        can go without being ignored.  It is normally equal to P
-        (which prevents any backward 'on_failure_jump') except right
-        after a plain 'jump', to allow patterns such as:
-           0: jump 10
-           3..9: <body>
-           10: on_failure_jump 3
-        as used for the *? operator.  */
-      re_char *p1 = p;
-
-      switch (*p++)
-       {
-       case succeed:
-         return true;
-
-       case duplicate:
-         /* If the first character has to match a backreference, that means
-            that the group was empty (since it already matched).  Since this
-            is the only case that interests us here, we can assume that the
-            backreference must match the empty string.  */
-         /* Note: Only true if we started from bufp->buffer (e.g. when
-            fastmap is non-NULL), but when fastmap is NULL we're only
-            trying to figure out if the body of a loop can possibly match
-            the empty string so it's safe (tho sometimes pessimistic)
-            to presume that `duplicate` can.  */
-         p++;
-         continue;
-
-
-      /* Following are the cases which match a character.  These end
-        with 'break'.  */
-
-       case exactn:
-         if (fastmap)
-           {
-             /* If multibyte is nonzero, the first byte of each
-                character is an ASCII or a leading code.  Otherwise,
-                each byte is a character.  Thus, this works in both
-                cases. */
-             fastmap[p[1]] = 1;
-             if (multibyte)
-               {
-                 /* Cover the case of matching a raw char in a
-                    multibyte regexp against unibyte.  */
-                 if (CHAR_BYTE8_HEAD_P (p[1]))
-                   fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
-               }
-             else
-               {
-                 /* For the case of matching this unibyte regex
-                    against multibyte, we must set a leading code of
-                    the corresponding multibyte character.  */
-                 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
-
-                 fastmap[CHAR_LEADING_CODE (c)] = 1;
-               }
-           }
-         break;
-
-
-       case anychar:
-         /* We could put all the chars except for \n (and maybe \0)
-            but we don't bother since it is generally not worth it.  */
-         if (!fastmap) break;
-         return true;
-
-
-       case charset_not:
-         if (!fastmap) break;
-         {
-           /* Chars beyond end of bitmap are possible matches.  */
-           for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
-                j < (1 << BYTEWIDTH); j++)
-             fastmap[j] = 1;
-         }
-         FALLTHROUGH;
-       case charset:
-         if (!fastmap) break;
-         not = (re_opcode_t) *(p - 1) == charset_not;
-         nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
-         p++;
-         for (j = 0; j < nbits; j++)
-           if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-             fastmap[j] = 1;
-
-         /* To match raw bytes (in the 80..ff range) against multibyte
-            strings, add their leading bytes to the fastmap.  */
-         for (j = 0x80; j < nbits; j++)
-           if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-             fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
-
-         if (/* Any leading code can possibly start a character
-                which doesn't match the specified set of characters.  */
-             not
-             ||
-             /* If we can match a character class, we can match any
-                multibyte characters.  */
-             (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
-              && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
-
-           {
-             if (match_any_multibyte_characters == false)
-               {
-                 for (j = MIN_MULTIBYTE_LEADING_CODE;
-                      j <= MAX_MULTIBYTE_LEADING_CODE; j++)
-                   fastmap[j] = 1;
-                 match_any_multibyte_characters = true;
-               }
-           }
-
-         else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
-                  && match_any_multibyte_characters == false)
-           {
-             /* Set fastmap[I] to 1 where I is a leading code of each
-                multibyte character in the range table. */
-             int c, count;
-             unsigned char lc1, lc2;
-
-             /* Make P points the range table.  '+ 2' is to skip flag
-                bits for a character class.  */
-             p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
-
-             /* Extract the number of ranges in range table into COUNT.  */
-             EXTRACT_NUMBER_AND_INCR (count, p);
-             for (; count > 0; count--, p += 3)
-               {
-                 /* Extract the start and end of each range.  */
-                 EXTRACT_CHARACTER (c, p);
-                 lc1 = CHAR_LEADING_CODE (c);
-                 p += 3;
-                 EXTRACT_CHARACTER (c, p);
-                 lc2 = CHAR_LEADING_CODE (c);
-                 for (j = lc1; j <= lc2; j++)
-                   fastmap[j] = 1;
-               }
-           }
-         break;
-
-       case syntaxspec:
-       case notsyntaxspec:
-         if (!fastmap) break;
-         /* This match depends on text properties.  These end with
-            aborting optimizations.  */
-         return true;
-
-       case categoryspec:
-       case notcategoryspec:
-         if (!fastmap) break;
-         not = (re_opcode_t)p[-1] == notcategoryspec;
-         k = *p++;
-         for (j = (1 << BYTEWIDTH); j >= 0; j--)
-           if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
-             fastmap[j] = 1;
-
-         /* Any leading code can possibly start a character which
-            has or doesn't has the specified category.  */
-         if (match_any_multibyte_characters == false)
-           {
-             for (j = MIN_MULTIBYTE_LEADING_CODE;
-                  j <= MAX_MULTIBYTE_LEADING_CODE; j++)
-               fastmap[j] = 1;
-             match_any_multibyte_characters = true;
-           }
-         break;
-
-      /* All cases after this match the empty string.  These end with
-        'continue'.  */
-
-       case at_dot:
-       case no_op:
-       case begline:
-       case endline:
-       case begbuf:
-       case endbuf:
-       case wordbound:
-       case notwordbound:
-       case wordbeg:
-       case wordend:
-       case symbeg:
-       case symend:
-         continue;
-
-
-       case jump:
-         EXTRACT_NUMBER_AND_INCR (j, p);
-         if (j < 0)
-           /* Backward jumps can only go back to code that we've already
-              visited.  're_compile' should make sure this is true.  */
-           break;
-         p += j;
-         switch (*p)
-           {
-           case on_failure_jump:
-           case on_failure_keep_string_jump:
-           case on_failure_jump_loop:
-           case on_failure_jump_nastyloop:
-           case on_failure_jump_smart:
-             p++;
-             break;
-           default:
-             continue;
-           };
-         /* Keep P1 to allow the 'on_failure_jump' we are jumping to
-            to jump back to "just after here".  */
-         FALLTHROUGH;
-       case on_failure_jump:
-       case on_failure_keep_string_jump:
-       case on_failure_jump_nastyloop:
-       case on_failure_jump_loop:
-       case on_failure_jump_smart:
-         EXTRACT_NUMBER_AND_INCR (j, p);
-         if (p + j <= p1)
-           ; /* Backward jump to be ignored.  */
-         else
-           { /* We have to look down both arms.
-                We first go down the "straight" path so as to minimize
-                stack usage when going through alternatives.  */
-             bool r = analyze_first_old (p, pend, fastmap, multibyte);
-             if (r) return r;
-             p += j;
-           }
-         continue;
-
-
-       case jump_n:
-         /* This code simply does not properly handle forward jump_n.  */
-         DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0));
-         p += 4;
-         /* jump_n can either jump or fall through.  The (backward) jump
-            case has already been handled, so we only need to look at the
-            fallthrough case.  */
-         continue;
-
-       case succeed_n:
-         /* If N == 0, it should be an on_failure_jump_loop instead.
-            `j` can be negative because `EXTRACT_NUMBER` extracts a
-            signed number whereas `succeed_n` treats it as unsigned.  */
-         DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
-         p += 4;
-         /* We only care about one iteration of the loop, so we don't
-            need to consider the case where this behaves like an
-            on_failure_jump.  */
-         /* FIXME: Sadly, the above is not true when the loop's body
-            can match the empty string :-(  */
-         /* continue; */
-         return true;
-
-       case set_number_at:
-         p += 4;
-         continue;
-
-
-       case start_memory:
-       case stop_memory:
-         p += 1;
-         continue;
-
-
-       default:
-         abort (); /* We have listed all the cases.  */
-       } /* switch *p++ */
-
-      /* Getting here means we have found the possible starting
-        characters for one path of the pattern -- and that the empty
-        string does not match.  We need not follow this path further.  */
-      return false;
-    } /* while p */
-
-  /* We reached the end without matching anything.  */
-  return true;
-
-} /* analyze_first_old */
-
  struct anafirst_data {
    bool multibyte;
    char *fastmap;
@@ -3594,21 +3281,6 @@ analyze_first (struct re_pattern_buffer *bufp,
                                  fastmap ? analyze_first_fastmap
                                  : analyze_first_null,
                                  &data);
-#if ENABLE_CHECKING
-  bool old = !!analyze_first_old (p, pend, fastmap, data.multibyte);
-  if (old && safe)
-    {
-      fprintf (stderr, "ANALYZE_FIRST: New optimization (fastmap=%d)!\n",
-               fastmap ? 1 : 0);
-      print_partial_compiled_pattern (stderr, p, pend);
-    }
-  else if (!old && !safe)
-    {
-      fprintf (stderr, "ANALYZE_FIRST: Lost an optimization (fastmap=%d)!\n",
-               fastmap ? 1 : 0);
-      print_partial_compiled_pattern (stderr, p, pend);
-    }
-#endif
    return !safe;
  }
  
@@ -4056,30 +3728,6 @@ skip_one_char (re_char *p)
  }
  
  
-/* Jump over non-matching operations.  */
-static re_char *
-skip_noops (re_char *p, re_char *pend)
-{
-  while (p < pend)
-    {
-      switch (*p)
-       {
-       case start_memory:
-       case stop_memory:
-         p += 2; break;
-       case no_op:
-         p += 1; break;
-       case jump:
-         p = extract_address (p + 1);
-         break;
-       default:
-         return p;
-       }
-    }
-  eassert (p == pend);
-  return p;
-}
-
  /* Test if C matches charset op.  *PP points to the charset or charset_not
     opcode.  When the function finishes, *PP will be advanced past that opcode.
     C is character to test (possibly after translations) and CORIG is original
@@ -4247,199 +3895,6 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
    return false;
  }
  
-/* True if "p1 matches something" implies "p2 fails".  */
-/* We're trying to follow all paths reachable from `p2`, but since some
-   loops can match the empty string, this can loop back to `p2`.
-
-   To avoid inf-looping, we take advantage of the fact that
-   the bytecode we generate is made of syntactically nested loops, more
-   specifically, every loop has a single entry point and single exit point.
-
-   The function takes 2 more arguments (`loop_entry` and `loop_exit`).
-   `loop_entry` points to the sole entry point of the current loop and
-   `loop_exit` points to its sole exit point (when non-NULL).
-
-   Jumps outside of `loop_entry..exit` should not occur.
-   The function can assume that `loop_exit` is "mutually exclusive".
-   The same holds for `loop_entry` except when `p2 == loop_entry`.
-
-   To guarantee termination, recursive calls should make sure that either
-   `loop_entry` is larger, or it's unchanged but `p2` is larger.
-
-   FIXME: This is failsafe (can't return true when it shouldn't)
-   but it could be too conservative if we start generating bytecode
-   with a different shape, so maybe we should bite the bullet and
-   replace done_beg/end with an actual list of positions we've
-   already processed.  */
-static bool
-mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
-                       re_char *p2, re_char *loop_entry, re_char *loop_exit)
-{
-  re_opcode_t op2;
-  unsigned char *pend = bufp->buffer + bufp->used;
-  re_char *p2_orig = p2;
-
-  eassert (p1 >= bufp->buffer && p1 < pend
-          && p2 >= bufp->buffer && p2 <= pend);
-
-  if (p2 == loop_exit)
-    return true;          /* Presumably already checked elsewhere.  */
-  eassert (loop_entry && p2 >= loop_entry);
-  if (p2 < loop_entry || (loop_exit && p2 > loop_exit))
-    { /* The assumptions about the shape of the code aren't true :-(  */
-#ifdef ENABLE_CHECKING
-      error ("Broken assumption in regex.c:mutually_exclusive_aux");
-#endif
-      return false;
-    }
-
-  /* Skip over open/close-group commands.
-     If what follows this loop is a ...+ construct,
-     look at what begins its body, since we will have to
-     match at least one of that.  */
-  p2 = skip_noops (p2, pend);
-  /* The same skip can be done for p1, except that this function
-     is only used in the case where p1 is a simple match operator.  */
-  /* p1 = skip_noops (p1, pend); */
-
-  eassert (p1 >= bufp->buffer && p1 < pend
-          && p2 >= bufp->buffer && p2 <= pend);
-
-  op2 = p2 == pend ? succeed : *p2;
-
-  switch (op2)
-    {
-    case succeed:
-    case endbuf:
-      /* If we're at the end of the pattern, we can change.  */
-      if (skip_one_char (p1))
-       {
-         DEBUG_PRINT ("  End of pattern: fast loop.\n");
-         return true;
-       }
-      break;
-
-    case endline:
-    case exactn:
-      return mutually_exclusive_exactn (bufp, p1, p2);
-
-    case charset:
-      {
-       if ((re_opcode_t) *p1 == exactn)
-         return mutually_exclusive_exactn (bufp, p2, p1);
-       else
-         return mutually_exclusive_charset (bufp, p1, p2);
-      }
-      break;
-
-    case charset_not:
-      switch (*p1)
-       {
-       case exactn:
-         return mutually_exclusive_exactn (bufp, p2, p1);
-       case charset:
-         return mutually_exclusive_charset (bufp, p2, p1);
-       case charset_not:
-         /* When we have two charset_not, it's very unlikely that
-            they don't overlap.  The union of the two sets of excluded
-            chars should cover all possible chars, which, as a matter of
-            fact, is virtually impossible in multibyte buffers.  */
-         break;
-       }
-      break;
-
-    case wordend:
-      return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
-    case symend:
-      return ((re_opcode_t) *p1 == syntaxspec
-              && (p1[1] == Ssymbol || p1[1] == Sword));
-    case notsyntaxspec:
-      return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
-
-    case wordbeg:
-      return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
-    case symbeg:
-      return ((re_opcode_t) *p1 == notsyntaxspec
-              && (p1[1] == Ssymbol || p1[1] == Sword));
-    case syntaxspec:
-      return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
-
-    case wordbound:
-      /* FIXME: This optimization seems correct after the first iteration
-         of the loop, but not for the very first :-(
-         IOW we'd need to pull out the first iteration and do:
-
-            syntaxspec w
-            on_failure_keep_string_jump end
-          loop:
-            syntaxspec w
-            goto loop
-          end:
-            wordbound
-
-      return (((re_opcode_t) *p1 == notsyntaxspec
-               || (re_opcode_t) *p1 == syntaxspec)
-              && p1[1] == Sword);  */
-      return false;
-
-    case categoryspec:
-      return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
-    case notcategoryspec:
-      return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
-
-    case on_failure_jump_nastyloop:
-    case on_failure_jump_smart:
-    case on_failure_jump_loop:
-    case on_failure_keep_string_jump:
-    case on_failure_jump:
-      {
-        int mcnt;
-       p2++;
-       EXTRACT_NUMBER_AND_INCR (mcnt, p2);
-       re_char *p2_other = p2 + mcnt, *tmp;
-       /* For `+` loops, we often have an `on_failure_jump` that skips forward
-          over a subsequent `jump` for lack of an `on_failure_dont_jump`
-          kind of thing.  Recognize this pattern since that subsequent
-          `jump` is the one that jumps to the loop-entry.  */
-       if ((re_opcode_t) p2[0] == jump && mcnt == 3)
-         p2 = extract_address (p2 + 1);
-
-       /* We have to check that both destinations are safe.
-          Arrange for `p2` to be the smaller of the two.  */
-       if (p2 > p2_other)
-         (tmp = p2, p2 = p2_other, p2_other = tmp);
-
-       if (p2_other <= p2_orig /* Both destinations go backward!  */
-           || !mutually_exclusive_aux (bufp, p1, p2_other,
-                                       loop_entry, loop_exit))
-         return false;
-
-       /* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive)
-          position, let's check `p2`.  */
-       if (p2 == loop_entry)
-         /* If we jump backward to the entry point of the current loop
-            it means it's a zero-length cycle through that loop, so
-            this cycle itself does not break mutual-exclusion.  */
-         return true;
-       else if (p2 > p2_orig)
-         /* Boring forward jump.  */
-         return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit);
-       else if (loop_entry < p2 && p2 < p2_orig)
-         /* We jump backward to a new loop, nested within the current one.
-            `p2` is the entry point and `p2_other` the exit of that inner.  */
-         return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other);
-       else
-         return false;
-      }
-
-    default:
-      ;
-    }
-
-  /* Safe default.  */
-  return false;
-}
-
  struct mutexcl_data {
    struct re_pattern_buffer *bufp;
    re_char *p1;
@@ -4514,28 +3969,14 @@ mutually_exclusive_one (re_char *p2, void *arg)
      }
  }
  
+/* True if "p1 matches something" implies "p2 fails".  */
+
  static bool
  mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
                       re_char *p2)
  {
    struct mutexcl_data data = { bufp, p1 };
-  bool new = forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
-#if ENABLE_CHECKING
-  bool old = mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL);
-  if (old && !new)
-    {
-      fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: Lost an optimization between %d and %d!\n",
-               (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
-      print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
-    }
-  else if (!old && new)
-    {
-      fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: New optimization between %d and %d!\n",
-               (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
-      print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
-    }
-#endif
-  return new;
+  return forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
  }
  \f
  /* Matching routines.  */
author	Stefan Monnier <monnier@iro.umontreal.ca>
	Fri, 29 Sep 2023 21:46:20 +0000 (17:46 -0400)
committer	Stefan Monnier <monnier@iro.umontreal.ca>
	Fri, 29 Sep 2023 21:46:20 +0000 (17:46 -0400)