From cc0d7d7a3867e4554f89262e4641c9845ee0d647 Mon Sep 17 00:00:00 2001 From: Stefan Monnier Date: Fri, 29 Sep 2023 14:55:24 -0400 Subject: [PATCH] search.c (re--describe-compiled): New function (bug#66261) This provides a fairly primitive but handy way to see what a regexp compiles to without having to enable REGEX_EMACS_DEBUG and wade through tons of stderr output. * doc/lispref/searching.texi (Regexp Problems): Mention `re--describe-compiled`. * src/regex-emacs.c (debug_putchar, print_fastmap) (print_partial_compiled_pattern, print_compiled_pattern): Add `dest` argument, and compile also when `ENABLE_CHECKING` is set. (DEBUG_PRINT_COMPILED_PATTERN, print_double_string, regex_compile): Adjust to additional argument. * src/regex-emacs.h (print_compiled_pattern): Declare. * src/search.c (Fre__describe_compiled): New function. (syms_of_search): Defsubr it. --- doc/lispref/searching.texi | 11 +++ etc/NEWS | 4 + src/regex-emacs.c | 176 +++++++++++++++++++------------------ src/regex-emacs.h | 4 + src/search.c | 41 +++++++++ 5 files changed, 151 insertions(+), 85 deletions(-) diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 66b33316faa..54b4e6f3a57 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -1972,6 +1972,17 @@ advice, don't be afraid of performing the matching in multiple function calls, each using a simpler regexp where backtracking can more easily be contained. +@defun re--describe-compiled regexp &optional raw +To help diagnose problems in your regexps or in the regexp engine +itself, this function returns a string describing the compiled +form of @var{regexp}. To make sense of it, it can be necessary +to read at least the description of the @code{re_opcode_t} type in the +@code{src/regex-emacs.c} file in Emacs' source code. + +It is currently able to give a meaningful description only if Emacs +was compiled with @code{--enable_checking}. +@end defun + @node Regexp Search @section Regular Expression Searching @cindex regular expression searching diff --git a/etc/NEWS b/etc/NEWS index 97ebc9a5de4..b3c7d3a8693 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -976,6 +976,10 @@ Use 'define-minor-mode' and 'define-globalized-minor-mode' instead. * Lisp Changes in Emacs 30.1 +** New function 're--describe-compiled' to see the innards of a regexp. +If you compiled with '--enable-checking', you can use this to help debug +either your regexp performance problems or the regexp engine. + +++ ** XLFDs are no longer restricted to 255 characters. 'font-xlfd-name' now returns an XLFD even if it is greater than 255 diff --git a/src/regex-emacs.c b/src/regex-emacs.c index ad140908609..f4ea867f3c0 100644 --- a/src/regex-emacs.c +++ b/src/regex-emacs.c @@ -268,7 +268,9 @@ typedef enum on_failure_jump, /* Like on_failure_jump, but pushes a placeholder instead of the - current string position when executed. */ + current string position when executed. Upon failure, + the current string position is thus not restored. + Used only for single-char loops that don't require backtracking. */ on_failure_keep_string_jump, /* Just like 'on_failure_jump', except that it checks that we @@ -434,38 +436,27 @@ extract_number_and_incr (re_char **source) /* If REGEX_EMACS_DEBUG is defined, print many voluminous messages (if the variable regex_emacs_debug is positive). */ -#ifdef REGEX_EMACS_DEBUG +#if defined REGEX_EMACS_DEBUG || ENABLE_CHECKING /* Use standard I/O for debugging. */ # include "sysstdio.h" -static int regex_emacs_debug = -100000; - -# define DEBUG_STATEMENT(e) e -# define DEBUG_PRINT(...) \ - if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__) -# define DEBUG_COMPILES_ARGUMENTS -# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ - if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e) -# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ - if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2) - static void -debug_putchar (int c) +debug_putchar (FILE *dest, int c) { if (c >= 32 && c <= 126) - putc (c, stderr); + putc (c, dest); else { unsigned int uc = c; - fprintf (stderr, "{%02x}", uc); + fprintf (dest, "{%02x}", uc); } } /* Print the fastmap in human-readable form. */ static void -print_fastmap (char *fastmap) +print_fastmap (FILE *dest, char *fastmap) { bool was_a_range = false; int i = 0; @@ -475,7 +466,7 @@ print_fastmap (char *fastmap) if (fastmap[i++]) { was_a_range = false; - debug_putchar (i - 1); + debug_putchar (dest, i - 1); while (i < (1 << BYTEWIDTH) && fastmap[i]) { was_a_range = true; @@ -483,12 +474,12 @@ print_fastmap (char *fastmap) } if (was_a_range) { - debug_putchar ('-'); - debug_putchar (i - 1); + debug_putchar (dest, '-'); + debug_putchar (dest, i - 1); } } } - putc ('\n', stderr); + putc ('\n', dest); } @@ -496,7 +487,7 @@ print_fastmap (char *fastmap) the START pointer into it and ending just before the pointer END. */ static void -print_partial_compiled_pattern (re_char *start, re_char *end) +print_partial_compiled_pattern (FILE *dest, re_char *start, re_char *end) { int mcnt, mcnt2; re_char *p = start; @@ -504,50 +495,50 @@ print_partial_compiled_pattern (re_char *start, re_char *end) if (start == NULL) { - fputs ("(null)\n", stderr); + fputs ("(null)\n", dest); return; } /* Loop over pattern commands. */ while (p < pend) { - fprintf (stderr, "%td:\t", p - start); + fprintf (dest, "%td:\t", p - start); switch ((re_opcode_t) *p++) { case no_op: - fputs ("/no_op", stderr); + fputs ("/no_op", dest); break; case succeed: - fputs ("/succeed", stderr); + fputs ("/succeed", dest); break; case exactn: mcnt = *p++; - fprintf (stderr, "/exactn/%d", mcnt); + fprintf (dest, "/exactn/%d", mcnt); do { - debug_putchar ('/'); - debug_putchar (*p++); + debug_putchar (dest, '/'); + debug_putchar (dest, *p++); } while (--mcnt); break; case start_memory: - fprintf (stderr, "/start_memory/%d", *p++); + fprintf (dest, "/start_memory/%d", *p++); break; case stop_memory: - fprintf (stderr, "/stop_memory/%d", *p++); + fprintf (dest, "/stop_memory/%d", *p++); break; case duplicate: - fprintf (stderr, "/duplicate/%d", *p++); + fprintf (dest, "/duplicate/%d", *p++); break; case anychar: - fputs ("/anychar", stderr); + fputs ("/anychar", dest); break; case charset: @@ -558,11 +549,11 @@ print_partial_compiled_pattern (re_char *start, re_char *end) int length = CHARSET_BITMAP_SIZE (p - 1); bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1); - fprintf (stderr, "/charset [%s", + fprintf (dest, "/charset [%s", (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); if (p + (*p & 0x7f) >= pend) - fputs (" !extends past end of pattern! ", stderr); + fputs (" !extends past end of pattern! ", dest); for (c = 0; c < 256; c++) if (c / 8 < length @@ -571,33 +562,33 @@ print_partial_compiled_pattern (re_char *start, re_char *end) /* Are we starting a range? */ if (last + 1 == c && ! in_range) { - debug_putchar ('-'); + debug_putchar (dest, '-'); in_range = true; } /* Have we broken a range? */ else if (last + 1 != c && in_range) { - debug_putchar (last); + debug_putchar (dest, last); in_range = false; } if (! in_range) - debug_putchar (c); + debug_putchar (dest, c); last = c; } if (in_range) - debug_putchar (last); + debug_putchar (dest, last); - debug_putchar (']'); + debug_putchar (dest, ']'); p += 1 + length; if (has_range_table) { int count; - fputs ("has-range-table", stderr); + fputs ("has-range-table", dest); /* ??? Should print the range table; for now, just skip it. */ p += 2; /* skip range table bits */ @@ -608,160 +599,175 @@ print_partial_compiled_pattern (re_char *start, re_char *end) break; case begline: - fputs ("/begline", stderr); + fputs ("/begline", dest); break; case endline: - fputs ("/endline", stderr); + fputs ("/endline", dest); break; case on_failure_jump: EXTRACT_NUMBER_AND_INCR (mcnt, p); - fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start); + fprintf (dest, "/on_failure_jump to %td", p + mcnt - start); break; case on_failure_keep_string_jump: EXTRACT_NUMBER_AND_INCR (mcnt, p); - fprintf (stderr, "/on_failure_keep_string_jump to %td", + fprintf (dest, "/on_failure_keep_string_jump to %td", p + mcnt - start); break; case on_failure_jump_nastyloop: EXTRACT_NUMBER_AND_INCR (mcnt, p); - fprintf (stderr, "/on_failure_jump_nastyloop to %td", + fprintf (dest, "/on_failure_jump_nastyloop to %td", p + mcnt - start); break; case on_failure_jump_loop: EXTRACT_NUMBER_AND_INCR (mcnt, p); - fprintf (stderr, "/on_failure_jump_loop to %td", + fprintf (dest, "/on_failure_jump_loop to %td", p + mcnt - start); break; case on_failure_jump_smart: EXTRACT_NUMBER_AND_INCR (mcnt, p); - fprintf (stderr, "/on_failure_jump_smart to %td", + fprintf (dest, "/on_failure_jump_smart to %td", p + mcnt - start); break; case jump: EXTRACT_NUMBER_AND_INCR (mcnt, p); - fprintf (stderr, "/jump to %td", p + mcnt - start); + fprintf (dest, "/jump to %td", p + mcnt - start); break; case succeed_n: EXTRACT_NUMBER_AND_INCR (mcnt, p); EXTRACT_NUMBER_AND_INCR (mcnt2, p); - fprintf (stderr, "/succeed_n to %td, %d times", + fprintf (dest, "/succeed_n to %td, %d times", p - 2 + mcnt - start, mcnt2); break; case jump_n: EXTRACT_NUMBER_AND_INCR (mcnt, p); EXTRACT_NUMBER_AND_INCR (mcnt2, p); - fprintf (stderr, "/jump_n to %td, %d times", + fprintf (dest, "/jump_n to %td, %d times", p - 2 + mcnt - start, mcnt2); break; case set_number_at: EXTRACT_NUMBER_AND_INCR (mcnt, p); EXTRACT_NUMBER_AND_INCR (mcnt2, p); - fprintf (stderr, "/set_number_at location %td to %d", + fprintf (dest, "/set_number_at location %td to %d", p - 2 + mcnt - start, mcnt2); break; case wordbound: - fputs ("/wordbound", stderr); + fputs ("/wordbound", dest); break; case notwordbound: - fputs ("/notwordbound", stderr); + fputs ("/notwordbound", dest); break; case wordbeg: - fputs ("/wordbeg", stderr); + fputs ("/wordbeg", dest); break; case wordend: - fputs ("/wordend", stderr); + fputs ("/wordend", dest); break; case symbeg: - fputs ("/symbeg", stderr); + fputs ("/symbeg", dest); break; case symend: - fputs ("/symend", stderr); + fputs ("/symend", dest); break; case syntaxspec: - fputs ("/syntaxspec", stderr); + fputs ("/syntaxspec", dest); mcnt = *p++; - fprintf (stderr, "/%d", mcnt); + fprintf (dest, "/%d", mcnt); break; case notsyntaxspec: - fputs ("/notsyntaxspec", stderr); + fputs ("/notsyntaxspec", dest); mcnt = *p++; - fprintf (stderr, "/%d", mcnt); + fprintf (dest, "/%d", mcnt); break; case at_dot: - fputs ("/at_dot", stderr); + fputs ("/at_dot", dest); break; case categoryspec: - fputs ("/categoryspec", stderr); + fputs ("/categoryspec", dest); mcnt = *p++; - fprintf (stderr, "/%d", mcnt); + fprintf (dest, "/%d", mcnt); break; case notcategoryspec: - fputs ("/notcategoryspec", stderr); + fputs ("/notcategoryspec", dest); mcnt = *p++; - fprintf (stderr, "/%d", mcnt); + fprintf (dest, "/%d", mcnt); break; case begbuf: - fputs ("/begbuf", stderr); + fputs ("/begbuf", dest); break; case endbuf: - fputs ("/endbuf", stderr); + fputs ("/endbuf", dest); break; default: - fprintf (stderr, "?%d", *(p-1)); + fprintf (dest, "?%d", *(p-1)); } - putc ('\n', stderr); + putc ('\n', dest); } - fprintf (stderr, "%td:\tend of pattern.\n", p - start); + fprintf (dest, "%td:\tend of pattern.\n", p - start); } - -static void -print_compiled_pattern (struct re_pattern_buffer *bufp) +void +print_compiled_pattern (FILE *dest, struct re_pattern_buffer *bufp) { + if (!dest) + dest = stderr; re_char *buffer = bufp->buffer; - print_partial_compiled_pattern (buffer, buffer + bufp->used); - fprintf (stderr, "%td bytes used/%td bytes allocated.\n", + print_partial_compiled_pattern (dest, buffer, buffer + bufp->used); + fprintf (dest, "%td bytes used/%td bytes allocated.\n", bufp->used, bufp->allocated); if (bufp->fastmap_accurate && bufp->fastmap) { - fputs ("fastmap: ", stderr); - print_fastmap (bufp->fastmap); + fputs ("fastmap: ", dest); + print_fastmap (dest, bufp->fastmap); } - fprintf (stderr, "re_nsub: %td\t", bufp->re_nsub); - fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated); - fprintf (stderr, "can_be_null: %d\n", bufp->can_be_null); + fprintf (dest, "re_nsub: %td\t", bufp->re_nsub); + fprintf (dest, "regs_alloc: %d\t", bufp->regs_allocated); + fprintf (dest, "can_be_null: %d\n", bufp->can_be_null); /* Perhaps we should print the translate table? */ } +#endif + +#ifdef REGEX_EMACS_DEBUG + +static int regex_emacs_debug = -100000; + +# define DEBUG_STATEMENT(e) e +# define DEBUG_PRINT(...) \ + if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__) +# define DEBUG_COMPILES_ARGUMENTS +# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (regex_emacs_debug > 0) print_partial_compiled_pattern (stderr, s, e) +# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2) static void print_double_string (re_char *where, re_char *string1, ptrdiff_t size1, @@ -775,12 +781,12 @@ print_double_string (re_char *where, re_char *string1, ptrdiff_t size1, if (FIRST_STRING_P (where)) { for (i = 0; i < string1 + size1 - where; i++) - debug_putchar (where[i]); + debug_putchar (stderr, where[i]); where = string2; } for (i = 0; i < string2 + size2 - where; i++) - debug_putchar (where[i]); + debug_putchar (stderr, where[i]); } } @@ -1760,7 +1766,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, if (regex_emacs_debug > 0) { for (ptrdiff_t debug_count = 0; debug_count < size; debug_count++) - debug_putchar (pattern[debug_count]); + debug_putchar (stderr, pattern[debug_count]); putc ('\n', stderr); } #endif diff --git a/src/regex-emacs.h b/src/regex-emacs.h index bc357633135..d9adcc69443 100644 --- a/src/regex-emacs.h +++ b/src/regex-emacs.h @@ -195,4 +195,8 @@ extern bool re_iswctype (int ch, re_wctype_t cc); extern re_wctype_t re_wctype_parse (const unsigned char **strp, ptrdiff_t limit); +#if ENABLE_CHECKING +extern void print_compiled_pattern (FILE *dest, struct re_pattern_buffer *bufp); +#endif + #endif /* EMACS_REGEX_H */ diff --git a/src/search.c b/src/search.c index 742a78cb0cd..014fd97d423 100644 --- a/src/search.c +++ b/src/search.c @@ -3376,6 +3376,46 @@ the buffer. If the buffer doesn't have a cache, the value is nil. */) set_buffer_internal_1 (old); return val; } + +DEFUN ("re--describe-compiled", Fre__describe_compiled, Sre__describe_compiled, + 1, 2, 0, + doc: /* Return a string describing the compiled form of REGEXP. +If RAW is non-nil, just return the actual bytecode. */) + (Lisp_Object regexp, Lisp_Object raw) +{ + struct regexp_cache *cache_entry + = compile_pattern (regexp, NULL, + (!NILP (BVAR (current_buffer, case_fold_search)) + ? BVAR (current_buffer, case_canon_table) : Qnil), + false, + !NILP (BVAR (current_buffer, + enable_multibyte_characters))); + if (!NILP (raw)) + return make_unibyte_string (cache_entry->buf.buffer, + cache_entry->buf.used); + else + { /* FIXME: Why ENABLE_CHECKING? */ +#if !ENABLE_CHECKING + error ("Not available: rebuild with --enable-checking"); +#elsif HAVE_OPEN_MEMSTREAM + char *buffer = NULL; + size_t size = 0; + FILE* f = open_memstream (&buffer, &size); + if (!f) + report_file_error ("open_memstream failed", regexp); + print_compiled_pattern (f, &cache_entry->buf); + fclose (f); + if (!buffer) + return Qnil; + Lisp_Object description = make_unibyte_string (buffer, size); + free (buffer); + return description; +#else + print_compiled_pattern (stderr, &cache_entry->buf); + return build_string ("Description was sent to standard error"); +#endif + } +} static void syms_of_search_for_pdumper (void); @@ -3455,6 +3495,7 @@ is to bind it with `let' around a small expression. */); defsubr (&Smatch_data__translate); defsubr (&Sregexp_quote); defsubr (&Snewline_cache_check); + defsubr (&Sre__describe_compiled); pdumper_do_now_and_after_load (syms_of_search_for_pdumper); } -- 2.39.5