From: Mattias EngdegÄrd Date: Tue, 11 Feb 2020 19:04:42 +0000 (+0100) Subject: rx: Improve 'or' compositionality (bug#37659) X-Git-Tag: emacs-27.0.91~146 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=49d3cd90bd80a225d5ec26027318ffb4606ff513;p=emacs.git rx: Improve 'or' compositionality (bug#37659) Perform 'regexp-opt' on nested 'or' forms, and after expansion of user-defined and 'eval' forms. Characters are now turned into strings for wider 'regexp-opt' scope. This preserves the longest-match semantics for string in 'or' forms over composition. * doc/lispref/searching.texi (Rx Constructs): Document. * lisp/emacs-lisp/rx.el (rx--normalise-or-arg) (rx--all-string-or-args): New. (rx--translate-or): Normalise arguments first, and check for strings in subforms. (rx--expand-eval): Extracted from rx--translate-eval. (rx--translate-eval): Call rx--expand-eval. * test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests. * etc/NEWS: Announce. --- diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index a4d5a27203f..1a090ebe101 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -1086,8 +1086,9 @@ Corresponding string regexp: @samp{@var{A}@var{B}@dots{}} @itemx @code{(| @var{rx}@dots{})} @cindex @code{|} in rx Match exactly one of the @var{rx}s. -If all arguments are string literals, the longest possible match -will always be used. Otherwise, either the longest match or the +If all arguments are strings, characters, or @code{or} forms +so constrained, the longest possible match will always be used. +Otherwise, either the longest match or the first (in left-to-right order) will be used. Without arguments, the expression will not match anything at all.@* Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}. diff --git a/etc/NEWS b/etc/NEWS index e9dfd266b46..6e2b1fe00e2 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -2325,6 +2325,12 @@ expressions from simpler parts. +++ *** 'not' argument can now be a character or single-char string. ++++ +*** Nested 'or' forms of strings guarantee a longest match. +For example, (or (or "IN" "OUT") (or "INPUT" "OUTPUT")) now matches +the whole string "INPUT" if present, not just "IN". Previously, this +was only guaranteed inside a single 'or' form of string literals. + ** Frames +++ diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 1ee5e8294a6..a0b2444346a 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary function F." (setq l (cdr l))) x) +(defun rx--normalise-or-arg (form) + "Normalise the `or' argument FORM. +Characters become strings, user-definitions and `eval' forms are expanded, +and `or' forms are normalised recursively." + (cond ((characterp form) + (char-to-string form)) + ((and (consp form) (memq (car form) '(or |))) + (cons (car form) (mapcar #'rx--normalise-or-arg (cdr form)))) + ((and (consp form) (eq (car form) 'eval)) + (rx--normalise-or-arg (rx--expand-eval (cdr form)))) + (t + (let ((expanded (rx--expand-def form))) + (if expanded + (rx--normalise-or-arg expanded) + form))))) + +(defun rx--all-string-or-args (body) + "If BODY only consists of strings or such `or' forms, return all the strings. +Otherwise throw `rx--nonstring'." + (mapcan (lambda (form) + (cond ((stringp form) (list form)) + ((and (consp form) (memq (car form) '(or |))) + (rx--all-string-or-args (cdr form))) + (t (throw 'rx--nonstring nil)))) + body)) + (defun rx--translate-or (body) "Translate an or-pattern of zero or more rx items. Return (REGEXP . PRECEDENCE)." ;; FIXME: Possible improvements: ;; - ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"), - ;; so that they can be candidates for regexp-opt. - ;; - ;; - Translate compile-time strings (`eval' forms), again for regexp-opt. - ;; ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D) - ;; in order to improve effectiveness of regexp-opt. - ;; This would also help composability. - ;; - ;; - Use associativity to run regexp-opt on contiguous subsets of arguments - ;; if not all of them are strings. Example: + ;; Then call regexp-opt on runs of string arguments. Example: ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) ;; @@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)." ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) ;; -> (any "@" "%" digit "A-Z" space word) ;; -> "[A-Z@%[:digit:][:space:][:word:]]" - ;; - ;; Problem: If a subpattern is carefully written to be - ;; optimizable by regexp-opt, how do we prevent the transforms - ;; above from destroying that property? - ;; Example: (or "a" (or "abc" "abd" "abe")) (cond ((null body) ; No items: a never-matching regexp. (rx--empty)) ((null (cdr body)) ; Single item. (rx--translate (car body))) - ((rx--every #'stringp body) ; All strings. - (cons (list (regexp-opt body nil)) - t)) - ((rx--every #'rx--charset-p body) ; All charsets. - (rx--translate-union nil body)) (t - (cons (append (car (rx--translate (car body))) - (mapcan (lambda (item) - (cons "\\|" (car (rx--translate item)))) - (cdr body))) - nil)))) + (let* ((args (mapcar #'rx--normalise-or-arg body)) + (all-strings (catch 'rx--nonstring (rx--all-string-or-args args)))) + (cond + (all-strings ; Only strings. + (cons (list (regexp-opt all-strings nil)) + t)) + ((rx--every #'rx--charset-p args) ; All charsets. + (rx--translate-union nil args)) + (t + (cons (append (car (rx--translate (car args))) + (mapcan (lambda (item) + (cons "\\|" (car (rx--translate item)))) + (cdr args))) + nil))))))) (defun rx--charset-p (form) "Whether FORM looks like a charset, only consisting of character intervals @@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)." (cons (list (list 'regexp-quote arg)) 'seq)) (t (error "rx `literal' form with non-string argument"))))) -(defun rx--translate-eval (body) - "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." +(defun rx--expand-eval (body) + "Expand `eval' arguments. Return a new rx form." (unless (and body (null (cdr body))) (error "rx `eval' form takes exactly one argument")) - (rx--translate (eval (car body)))) + (eval (car body))) + +(defun rx--translate-eval (body) + "Translate the `eval' form. Return (REGEXP . PRECEDENCE)." + (rx--translate (rx--expand-eval body))) (defvar rx--regexp-atomic-regexp nil) diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 2e34d65a9aa..4888e1d9d1e 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -42,13 +42,24 @@ (ert-deftest rx-or () (should (equal (rx (or "ab" (| "c" nonl) "de")) "ab\\|c\\|.\\|de")) - (should (equal (rx (or "ab" "abc" "a")) + (should (equal (rx (or "ab" "abc" ?a)) "\\(?:a\\(?:bc?\\)?\\)")) + (should (equal (rx (or "ab" (| (or "abcd" "abcde")) (or "a" "abc"))) + "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)")) + (should (equal (rx (or "a" (eval (string ?a ?b)))) + "\\(?:ab?\\)")) (should (equal (rx (| nonl "a") (| "b" blank)) "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)")) (should (equal (rx (|)) "\\`a\\`"))) +(ert-deftest rx-def-in-or () + (rx-let ((a b) + (b (or "abc" c)) + (c ?a)) + (should (equal (rx (or a (| "ab" "abcde") "abcd")) + "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)")))) + (ert-deftest rx-char-any () "Test character alternatives with `]' and `-' (Bug#25123)." (should (equal