;;; regexp-opt.el --- generate efficient regexps to match strings.
-;; Copyright (C) 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
+;; Copyright (C) 1994, 95, 96, 97, 98, 1999 Free Software Foundation, Inc.
;; Author: Simon Marshall <simon@gnu.org>
;; Keywords: strings, regexps
+;; Version: 1.07
;; This file is part of GNU Emacs.
;;; Commentary:
-;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)".
+;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i[sz]e\\)".
;;
;; This package generates a regexp from a given list of strings (which matches
;; one of those strings) so that the regexp generated by:
;;
;; Searching using the above example `regexp-opt' regexp takes approximately
;; two-thirds of the time taken using the equivalent `mapconcat' regexp.
+;;
+;; Note that this package will also find common suffix strings if this does not
+;; increase the number of grouping constructs. For example:
+;;
+;; (regexp-opt '("these" "those"))
+;; => "th[eo]se"
+;;
+;; but:
+;;
+;; (regexp-opt '("barfly" "housefly"))
+;; => "barfly\\|housefly" rather than "\\(bar\\|house\\)fly"
;; Since this package was written to produce efficient regexps, not regexps
;; efficiently, it is probably not a good idea to in-line too many calls in
;; your code for such changes to have effect in your code.
;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with
-;; thanks for ideas also to Michael Ernst, Bob Glickstein and Dan Nicolaescu.
-;; Please don't tell me that it doesn't produce optimal regexps; I know that
-;; already. For example, the above explanation for the meaning of "opt" would
-;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex
-;; forward looking. But (ideas or) code to improve things (are) is welcome.
+;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu and
+;; Stefan Monnier.
+;; No doubt `regexp-opt' doesn't always produce optimal regexps, so code, ideas
+;; or any other information to improve things are welcome.
\f
-;;; Code:
+;;; Code.
;;;###autoload
(defun regexp-opt (strings &optional paren)
;; If LAX non-nil, don't output parentheses if it doesn't require them.
;; Merges keywords to avoid backtracking in Emacs' regexp matcher.
;;
- ;; The basic idea is to find the shortest common prefix, remove it and
- ;; recurse. If there is no prefix, we divide the list into two so that (at
- ;; least) one half will have at least a one-character common prefix.
+ ;; The basic idea is to find the shortest common prefix or suffix, remove it
+ ;; and recurse. If there is no prefix, we divide the list into two so that
+ ;; (at least) one half will have at least a one-character common prefix.
;;
;; Also we delay the addition of grouping parenthesis as long as possible
;; until we're sure we need them, and try to remove one-character sequences
(let* ((open-group (if paren "\\(" ""))
(close-group (if paren "\\)" ""))
(open-charset (if lax "" open-group))
- (close-charset (if lax "" close-group)))
+ (close-charset (if lax "" close-group))
+ (open-presuf open-charset)
+ (close-presuf close-charset))
(cond
- ;; Protect against user-stupidity... could call error here
- ((null strings)
- nil)
+ ;;
+ ;; If there are no strings, just return the empty string.
+ ((= (length strings) 0)
+ "")
+ ;;
;; If there is only one string, just return it.
((= (length strings) 1)
(if (= (length (car strings)) 1)
close-charset))
;;
;; If all are one-character strings, just return a character set.
- ((= (length strings) (apply '+ (mapcar 'length strings)))
+ ((= (apply 'max (mapcar 'length strings)) 1)
(concat open-charset
(regexp-opt-charset strings)
close-charset))
;; We have a list of different length strings.
(t
(let ((prefix (try-completion "" (mapcar 'list strings)))
+ (suffix (regexp-opt-try-suffix strings))
(letters (let ((completion-regexp-list '("^.$")))
(all-completions "" (mapcar 'list strings)))))
(cond
;;
;; If there is a common prefix, remove it and recurse on the suffixes.
((> (length prefix) 0)
- (let* ((length (length prefix))
- (suffixes (mapcar (lambda (s) (substring s length)) strings)))
- (concat open-group
+ (let* ((end (length prefix))
+ (suffixes (mapcar (lambda (s) (substring s end)) strings)))
+ (concat open-presuf
(regexp-quote prefix) (regexp-opt-group suffixes t t)
- close-group)))
+ close-presuf)))
+ ;;
+ ;; If there is a common suffix, remove it and recurse on the prefixes.
+ ((> (length suffix) (if lax
+ 0
+ (- (apply 'max (mapcar 'length strings)) 2)))
+ (let* ((end (- (length suffix)))
+ (prefixes (sort (mapcar (lambda (s) (substring s 0 end))
+ strings)
+ 'string-lessp)))
+ (concat open-presuf
+ (regexp-opt-group prefixes t t) (regexp-quote suffix)
+ close-presuf)))
;;
;; If there are several one-character strings, remove them and recurse
;; on the rest (first so the final regexp finds the longest match).
(concat "[" dash caret "]")
(concat "[" bracket charset caret dash "]"))))
+(defun regexp-opt-try-suffix (strings)
+ ;;
+ ;; Return common suffix of each string in STRINGS. See `try-completion'.
+ ;;
+ (let* ((chars (mapcar (lambda (s) (mapcar 'identity s)) strings))
+ (srahc (mapcar 'reverse chars))
+ (sgnirts (mapcar (lambda (c) (mapconcat 'char-to-string c "")) srahc))
+ (xiffus (try-completion "" (mapcar 'list sgnirts))))
+ (mapconcat 'char-to-string (reverse (mapcar 'identity xiffus)) "")))
+
(provide 'regexp-opt)
;;; regexp-opt.el ends here