From 3fd74915121a3eac265170e20bd19b3cde6a2589 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Sun, 30 Jun 2019 12:53:52 +0200 Subject: [PATCH] Optimise more inputs to `regexp-opt' (bug#36444) Use a more precise test to determine whether the input to `regexp-opt' is safe to optimise when KEEP-ORDER is non-nil, permitting more inputs to be optimised than before. For example, ("good" "goal" "go") is now accepted. * lisp/emacs-lisp/regexp-opt.el (regexp-opt): More precise test for whether the list is safe w.r.t. KEEP-ORDER. (regexp-opt--contains-prefix): Remove. * test/lisp/emacs-lisp/regexp-opt-tests.el: Use lexical-binding. (regexp-opt-test--permutation, regexp-opt-test--factorial) (regexp-opt-test--permutations, regexp-opt-test--match-all) (regexp-opt-test--check-perm, regexp-opt-test--explain-perm) (regexp-opt-keep-order): Test KEEP-ORDER. --- lisp/emacs-lisp/regexp-opt.el | 46 +++++++++--------- test/lisp/emacs-lisp/regexp-opt-tests.el | 62 +++++++++++++++++++++++- 2 files changed, 83 insertions(+), 25 deletions(-) diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el index b6104f22e7d..ab52003cdf7 100644 --- a/lisp/emacs-lisp/regexp-opt.el +++ b/lisp/emacs-lisp/regexp-opt.el @@ -140,21 +140,34 @@ usually more efficient than that of a simplified version: (completion-ignore-case nil) (completion-regexp-list nil) (open (cond ((stringp paren) paren) (paren "\\("))) - (sorted-strings (delete-dups - (sort (copy-sequence strings) 'string-lessp))) (re (cond ;; No strings: return an unmatchable regexp. ((null strings) (concat (or open "\\(?:") regexp-unmatchable "\\)")) - ;; If we cannot reorder, give up all attempts at - ;; optimisation. There is room for improvement (Bug#34641). - ((and keep-order (regexp-opt--contains-prefix sorted-strings)) - (concat (or open "\\(?:") - (mapconcat #'regexp-quote strings "\\|") - "\\)")) + + ;; The algorithm will generate a pattern that matches + ;; longer strings in the list before shorter. If the + ;; list order matters, then no string must come after a + ;; proper prefix of that string. To check this, verify + ;; that a straight or-pattern matches each string + ;; entirely. + ((and keep-order + (let* ((case-fold-search nil) + (alts (mapconcat #'regexp-quote strings "\\|"))) + (and (let ((s strings)) + (while (and s + (string-match alts (car s)) + (= (match-end 0) (length (car s)))) + (setq s (cdr s))) + ;; If we exited early, we found evidence that + ;; regexp-opt-group cannot be used. + s) + (concat (or open "\\(?:") alts "\\)"))))) (t - (regexp-opt-group sorted-strings (or open t) (not open)))))) + (regexp-opt-group + (delete-dups (sort (copy-sequence strings) 'string-lessp)) + (or open t) (not open)))))) (cond ((eq paren 'words) (concat "\\<" re "\\>")) ((eq paren 'symbols) @@ -339,21 +352,6 @@ never matches anything." (concat "[" all "]"))))))) -(defun regexp-opt--contains-prefix (strings) - "Whether STRINGS contains a proper prefix of one of its other elements. -STRINGS must be a list of sorted strings without duplicates." - (let ((s strings)) - ;; In a lexicographically sorted list, a string always immediately - ;; succeeds one of its prefixes. - (while (and (cdr s) - (not (string-equal - (car s) - (substring (cadr s) 0 (min (length (car s)) - (length (cadr s))))))) - (setq s (cdr s))) - (cdr s))) - - (provide 'regexp-opt) ;;; regexp-opt.el ends here diff --git a/test/lisp/emacs-lisp/regexp-opt-tests.el b/test/lisp/emacs-lisp/regexp-opt-tests.el index 927de8c6a5f..3658964faac 100644 --- a/test/lisp/emacs-lisp/regexp-opt-tests.el +++ b/test/lisp/emacs-lisp/regexp-opt-tests.el @@ -1,4 +1,4 @@ -;;; regexp-opt-tests.el --- Tests for regexp-opt.el +;;; regexp-opt-tests.el --- Tests for regexp-opt.el -*- lexical-binding: t -*- ;; Copyright (C) 2013-2019 Free Software Foundation, Inc. @@ -25,6 +25,66 @@ (require 'regexp-opt) +(defun regexp-opt-test--permutation (n list) + "The Nth permutation of LIST, 0 ≤ N < (length LIST)!." + (let ((len (length list)) + (perm-list nil)) + (dotimes (i len) + (let* ((d (- len i)) + (k (mod n d))) + (push (nth k list) perm-list) + (setq list (append (butlast list (- (length list) k)) + (nthcdr (1+ k) list))) + (setq n (/ n d)))) + (nreverse perm-list))) + +(defun regexp-opt-test--factorial (n) + "N!" + (apply #'* (number-sequence 1 n))) + +(defun regexp-opt-test--permutations (list) + "All permutations of LIST." + (mapcar (lambda (i) (regexp-opt-test--permutation i list)) + (number-sequence 0 (1- (regexp-opt-test--factorial (length list)))))) + +(defun regexp-opt-test--match-all (words re) + (mapcar (lambda (w) (and (string-match re w) + (match-string 0 w))) + words)) + +(defun regexp-opt-test--check-perm (perm) + (let* ((ref-re (mapconcat #'regexp-quote perm "\\|")) + (opt-re (regexp-opt perm nil t)) + (ref (regexp-opt-test--match-all perm ref-re)) + (opt (regexp-opt-test--match-all perm opt-re))) + (equal opt ref))) + +(defun regexp-opt-test--explain-perm (perm) + (let* ((ref-re (mapconcat #'regexp-quote perm "\\|")) + (opt-re (regexp-opt perm nil t)) + (ref (regexp-opt-test--match-all perm ref-re)) + (opt (regexp-opt-test--match-all perm opt-re))) + (concat "\n" + (format "Naïve regexp: %s\n" ref-re) + (format "Optimised regexp: %s\n" opt-re) + (format "Got: %s\n" opt) + (format "Expected: %s\n" ref)))) + +(put 'regexp-opt-test--check-perm 'ert-explainer 'regexp-opt-test--explain-perm) + +(ert-deftest regexp-opt-keep-order () + "Check that KEEP-ORDER works." + (dolist (perm (regexp-opt-test--permutations '("abc" "bca" "cab"))) + (should (regexp-opt-test--check-perm perm))) + (dolist (perm (regexp-opt-test--permutations '("abc" "ab" "bca" "bc"))) + (should (regexp-opt-test--check-perm perm))) + (dolist (perm (regexp-opt-test--permutations '("abxy" "cdxy"))) + (should (regexp-opt-test--check-perm perm))) + (dolist (perm (regexp-opt-test--permutations '("afgx" "bfgx" "afgy" "bfgy"))) + (should (regexp-opt-test--check-perm perm))) + (dolist (perm (regexp-opt-test--permutations '("a" "ab" "ac" "abc"))) + (should (regexp-opt-test--check-perm perm)))) + (ert-deftest regexp-opt-charset () (should (equal (regexp-opt-charset '(?a ?b ?a)) "[ab]")) (should (equal (regexp-opt-charset '(?D ?d ?B ?a ?b ?C ?7 ?a ?c ?A)) -- 2.39.2