From: Mattias EngdegÄrd Date: Sun, 24 Feb 2019 21:12:52 +0000 (+0100) Subject: rx: fix `or' ordering by adding argument to regexp-opt X-Git-Tag: emacs-27.0.90~3506 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=da758046da74e33273265cd2e72a8aa1a0c9c7e3;p=emacs.git rx: fix `or' ordering by adding argument to regexp-opt The rx `or' form may reorder its arguments in an unpredictable way, contrary to user expectation, since it sometimes uses `regexp-opt'. Add a NOREORDER option to `regexp-opt' for preventing it from producing a reordered regexp (Bug#34641). * doc/lispref/searching.texi (Regular Expression Functions): * etc/NEWS (Lisp Changes in Emacs 27.1): Describe the new regexp-opt NOREORDER argument. * lisp/emacs-lisp/regexp-opt.el (regexp-opt): Add NOREORDER. Make no attempt at regexp improvement if the set of strings contains a prefix of another string. (regexp-opt--contains-prefix): New. * lisp/emacs-lisp/rx.el (rx-or): Call regexp-opt with NOREORDER. * test/lisp/emacs-lisp/rx-tests.el: Test rx `or' form match order. --- diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index cfbd2449b13..fb7f48474d5 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -950,7 +950,7 @@ whitespace: @end defun @cindex optimize regexp -@defun regexp-opt strings &optional paren +@defun regexp-opt strings &optional paren noreorder This function returns an efficient regular expression that will match any of the strings in the list @var{strings}. This is useful when you need to make matching or searching as fast as possible---for example, @@ -985,8 +985,15 @@ if it is necessary to ensure that a postfix operator appended to it will apply to the whole expression. @end table -The resulting regexp of @code{regexp-opt} is equivalent to but usually -more efficient than that of a simplified version: +The optional argument @var{noreorder}, if @code{nil} or omitted, +allows the returned regexp to match the strings in any order. If +non-@code{nil}, the match is guaranteed to be performed in the order +given, as if the strings were made into a regexp by joining them with +the @samp{\|} operator. + +Up to reordering, the resulting regexp of @code{regexp-opt} is +equivalent to but usually more efficient than that of a simplified +version: @example (defun simplified-regexp-opt (strings &optional paren) diff --git a/etc/NEWS b/etc/NEWS index 29ed7ab4819..7c95988ff52 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -1642,6 +1642,13 @@ MS-Windows. ** New module environment function 'process_input' to process user input while module code is running. ++++ +** The function 'regexp-opt' accepts an additional optional argument. +By default, the regexp returned by 'regexp-opt' may match the strings +in any order. If the new third argument is non-nil, the match is +guaranteed to be performed in the order given, as if the strings were +made into a regexp by joining them with '\|'. + * Changes in Emacs 27.1 on Non-Free Operating Systems diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el index 63786c1508c..d0c5f2d3fc4 100644 --- a/lisp/emacs-lisp/regexp-opt.el +++ b/lisp/emacs-lisp/regexp-opt.el @@ -84,7 +84,7 @@ ;;; Code: ;;;###autoload -(defun regexp-opt (strings &optional paren) +(defun regexp-opt (strings &optional paren noreorder) "Return a regexp to match a string in the list STRINGS. Each string should be unique in STRINGS and should not contain any regexps, quoted or not. Optional PAREN specifies how the @@ -111,8 +111,14 @@ nil necessary to ensure that a postfix operator appended to it will apply to the whole expression. -The resulting regexp is equivalent to but usually more efficient -than that of a simplified version: +The optional argument NOREORDER, if nil or omitted, allows the +returned regexp to match the strings in any order. If non-nil, +the match is guaranteed to be performed in the order given, as if +the strings were made into a regexp by joining them with the +`\\|' operator. + +Up to reordering, the resulting regexp is equivalent to but +usually more efficient than that of a simplified version: (defun simplified-regexp-opt (strings &optional paren) (let ((parens @@ -133,7 +139,15 @@ than that of a simplified version: (open (cond ((stringp paren) paren) (paren "\\("))) (sorted-strings (delete-dups (sort (copy-sequence strings) 'string-lessp))) - (re (regexp-opt-group sorted-strings (or open t) (not open)))) + (re + ;; If NOREORDER is non-nil and the list contains a prefix + ;; of another string, we give up all attempts at optimisation. + ;; There is plenty of room for improvement (Bug#34641). + (if (and noreorder (regexp-opt--contains-prefix sorted-strings)) + (concat (or open "\\(?:") + (mapconcat #'regexp-quote strings "\\|") + "\\)") + (regexp-opt-group sorted-strings (or open t) (not open))))) (cond ((eq paren 'words) (concat "\\<" re "\\>")) ((eq paren 'symbols) @@ -313,6 +327,22 @@ CHARS should be a list of characters." (concat "[" dash caret "]")) (concat "[" bracket charset caret dash "]")))) + +(defun regexp-opt--contains-prefix (strings) + "Whether STRINGS contains a proper prefix of one of its other elements. +STRINGS must be a list of sorted strings without duplicates." + (let ((s strings)) + ;; In a lexicographically sorted list, a string always immediately + ;; succeeds one of its prefixes. + (while (and (cdr s) + (not (string-equal + (car s) + (substring (cadr s) 0 (min (length (car s)) + (length (cadr s))))))) + (setq s (cdr s))) + (cdr s))) + + (provide 'regexp-opt) ;;; regexp-opt.el ends here diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 715cd608c46..ca756efb493 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -393,7 +393,7 @@ FORM is of the form `(and FORM1 ...)'." (rx-group-if (if (memq nil (mapcar 'stringp (cdr form))) (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|") - (regexp-opt (cdr form))) + (regexp-opt (cdr form) nil t)) (and (memq rx-parent '(: * t)) rx-parent))) diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index e14feda347f..fa3d9b0d5ea 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -92,5 +92,18 @@ (*? "e") (+? "f") (\?? "g") (?? "h")))) "a*b+c?d?e*?f+?g??h??"))) +(ert-deftest rx-or () + ;; Test or-pattern reordering (Bug#34641). + (let ((s "abc")) + (should (equal (and (string-match (rx (or "abc" "ab" "a")) s) + (match-string 0 s)) + "abc")) + (should (equal (and (string-match (rx (or "ab" "abc" "a")) s) + (match-string 0 s)) + "ab")) + (should (equal (and (string-match (rx (or "a" "ab" "abc")) s) + (match-string 0 s)) + "a")))) + (provide 'rx-tests) ;; rx-tests.el ends here.