From f16766a0eb2a78b58a4856d31306fc37f913d70e Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Thu, 12 Dec 2019 23:04:00 +0100 Subject: [PATCH] Use `or' instead of `union' for charset union in rx Design change suggested by Stefan Monnier. * doc/lispref/searching.texi (Rx Constructs): * etc/NEWS: Document. * lisp/emacs-lisp/rx.el (rx--translate-or): Detect charset arguments. (rx--charset-p): New. (rx--translate-not, rx--charset-intervals, rx--translate-union): Change from `union' to `or'. (rx--translate-form, rx--builtin-forms, rx): Remove `union'. * test/lisp/emacs-lisp/rx-tests.el (rx-union, rx-def-in-union) (rx-intersection): Rename tests and change `union' to `or' and `|'. --- doc/lispref/searching.texi | 22 ++++++++-------- etc/NEWS | 6 ++--- lisp/emacs-lisp/rx.el | 41 ++++++++++++++++++----------- test/lisp/emacs-lisp/rx-tests.el | 44 ++++++++++++++++++-------------- 4 files changed, 65 insertions(+), 48 deletions(-) diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index bf18f80f63f..0c6c7cc68b5 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -1214,20 +1214,19 @@ Corresponding string regexp: @samp{[@dots{}]} @item @code{(not @var{charspec})} @cindex @code{not} in rx Match a character not included in @var{charspec}. @var{charspec} can -be an @code{any}, @code{not}, @code{union}, @code{intersection}, -@code{syntax} or @code{category} form, or a character class.@* +be an @code{any}, @code{not}, @code{or}, @code{intersection}, +@code{syntax} or @code{category} form, or a character class. +If @var{charspec} is an @code{or} form, its arguments have the same +restrictions as those of @code{intersection}; see below.@* Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}}, @samp{\C@var{code}} -@item @code{(union @var{charset}@dots{})} -@itemx @code{(intersection @var{charset}@dots{})} -@cindex @code{union} in rx +@item @code{(intersection @var{charset}@dots{})} @cindex @code{intersection} in rx -Match a character that matches the union or intersection, -respectively, of the @var{charset}s. Each @var{charset} can be an -@code{any} form without character classes, or a @code{union}, -@code{intersection} or @code{not} form whose arguments are also -@var{charset}s. +Match a character included in all of the @var{charset}s. +Each @var{charset} can be an @code{any} form without character +classes, or an @code{intersection}, @code{or} or @code{not} form whose +arguments are also @var{charset}s. @item @code{not-newline}, @code{nonl} @cindex @code{not-newline} in rx @@ -1591,7 +1590,8 @@ when they are used, not when they are defined. User-defined forms are allowed wherever arbitrary @code{rx} expressions are expected; for example, in the body of a @code{zero-or-one} form, but not inside @code{any} or @code{category} -forms. They are also allowed inside @code{not} forms. +forms. They are also allowed inside @code{not} and +@code{intersection} forms. @end itemize @defmac rx-define name [arglist] rx-form diff --git a/etc/NEWS b/etc/NEWS index 4df123d787b..1e0422c761f 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -2120,9 +2120,9 @@ These macros add new forms to the rx notation. Both match any single character; 'anychar' is more descriptive. +++ -*** New 'union' and 'intersection' forms for character sets. -These permit composing character-matching expressions from simpler -parts. +*** New 'intersection' form for character sets. +With 'or' and 'not', it can be used to compose character-matching +expressions from simpler parts. ** Frames diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index d4b21c3c9ad..a5cab1db888 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -273,10 +273,8 @@ Return (REGEXP . PRECEDENCE)." ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) ;; - ;; - Fuse patterns into a single character alternative if they fit. - ;; regexp-opt will do that if all are strings, but we want to do that for: - ;; * symbols that expand to classes: space, alpha, ... - ;; * character alternatives: (any ...) + ;; - Optimise single-character alternatives better: + ;; * classes: space, alpha, ... ;; * (syntax S), for some S (whitespace, word) ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) ;; -> (any "@" "%" digit "A-Z" space word) @@ -294,6 +292,8 @@ Return (REGEXP . PRECEDENCE)." ((rx--every #'stringp body) ; All strings. (cons (list (regexp-opt body nil t)) t)) + ((rx--every #'rx--charset-p body) ; All charsets. + (rx--translate-union nil body)) (t (cons (append (car (rx--translate (car body))) (mapcan (lambda (item) @@ -301,6 +301,19 @@ Return (REGEXP . PRECEDENCE)." (cdr body))) nil)))) +(defun rx--charset-p (form) + "Whether FORM looks like a charset, only consisting of character intervals +and set operations." + (or (and (consp form) + (or (and (memq (car form) '(any 'in 'char)) + (rx--every (lambda (x) (not (symbolp x))) (cdr form))) + (and (memq (car form) '(not or | intersection)) + (rx--every #'rx--charset-p (cdr form))))) + (and (or (symbolp form) (consp form)) + (let ((expanded (rx--expand-def form))) + (and expanded + (rx--charset-p expanded)))))) + (defun rx--string-to-intervals (str) "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single character X becomes (?X . ?X). Return the intervals in a list." @@ -477,7 +490,7 @@ If NEGATED, negate the sense." (not negated) (rx--complement-intervals intervals) nil))) ;; FIXME: Consider turning `not' into a variadic operator, following SRE: -;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and +;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and ;; (not) = anychar. ;; Maybe allow singleton characters as arguments. @@ -498,7 +511,7 @@ If NEGATED, negate the sense (thus making it positive)." (rx--translate-category (not negated) (cdr arg))) ('not (rx--translate-not (not negated) (cdr arg))) - ('union + ((or 'or '|) (rx--translate-union (not negated) (cdr arg))) ('intersection (rx--translate-intersection (not negated) (cdr arg)))))) @@ -558,7 +571,7 @@ If NEGATED, negate the sense (thus making it positive)." (defun rx--charset-intervals (charset) "Return a sorted list of non-adjacent disjoint intervals from CHARSET. CHARSET is any expression allowed in a character set expression: -either `any' (no classes permitted), or `not', `union' or `intersection' +either `any' (no classes permitted), or `not', `or' or `intersection' forms whose arguments are charsets." (pcase charset (`(,(or 'any 'in 'char) . ,body) @@ -569,8 +582,8 @@ forms whose arguments are charsets." (cadr parsed))) (car parsed))) (`(not ,x) (rx--complement-intervals (rx--charset-intervals x))) - (`(union . ,xs) (rx--charset-union xs)) - (`(intersection . ,xs) (rx--charset-intersection xs)) + (`(,(or 'or '|) . ,body) (rx--charset-union body)) + (`(intersection . ,body) (rx--charset-intersection body)) (_ (let ((expanded (rx--expand-def charset))) (if expanded (rx--charset-intervals expanded) @@ -589,7 +602,7 @@ forms whose arguments are charsets." (mapcar #'rx--charset-intervals charsets))) (defun rx--translate-union (negated body) - "Translate a (union ...) construct. Return (REGEXP . PRECEDENCE). + "Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE). If NEGATED, negate the sense." (rx--intervals-to-alt negated (rx--charset-union body))) @@ -976,7 +989,6 @@ can expand to any number of values." ((or 'any 'in 'char) (rx--translate-any nil body)) ('not-char (rx--translate-any t body)) ('not (rx--translate-not nil body)) - ('union (rx--translate-union nil body)) ('intersection (rx--translate-intersection nil body)) ('repeat (rx--translate-repeat body)) @@ -1036,7 +1048,7 @@ can expand to any number of values." (t (error "Unknown rx form `%s'" op))))))) (defconst rx--builtin-forms - '(seq sequence : and or | any in char not-char not union intersection + '(seq sequence : and or | any in char not-char not intersection repeat = >= ** zero-or-more 0+ * one-or-more 1+ + @@ -1149,11 +1161,10 @@ CHAR Match a literal character. character, a string, a range as string \"A-Z\" or cons (?A . ?Z), or a character class (see below). Alias: in, char. (not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC - can be (any ...), (union ...), (intersection ...), + can be (any ...), (or ...), (intersection ...), (syntax ...), (category ...), or a character class. -(union CHARSET...) Union of CHARSETs. (intersection CHARSET...) Intersection of CHARSETs. - CHARSET is (any...), (not...), (union...) or (intersection...). + CHARSET is (any...), (not...), (or...) or (intersection...). not-newline Match any character except a newline. Alias: nonl. anychar Match any character. Alias: anything. unmatchable Never match anything at all. diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 0cd2c9590b7..344f46764c8 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -274,33 +274,36 @@ (should (equal (rx (not (not ascii)) (not (not (not (any "a-z"))))) "[[:ascii:]][^a-z]"))) -(ert-deftest rx-union () - (should (equal (rx (union)) +(ert-deftest rx-charset-or () + (should (equal (rx (or)) "\\`a\\`")) - (should (equal (rx (union (any "ba"))) + (should (equal (rx (or (any "ba"))) "[ab]")) - (should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z"))) + (should (equal (rx (| (any "a-f") (any "c-k" ?y) (any ?r "x-z"))) "[a-krx-z]")) - (should (equal (rx (union (not (any "a-m")) (not (any "f-p")))) + (should (equal (rx (or (not (any "a-m")) (not (any "f-p")))) "[^f-m]")) - (should (equal (rx (union (any "e-m") (not (any "a-z")))) + (should (equal (rx (| (any "e-m") (not (any "a-z")))) "[^a-dn-z]")) - (should (equal (rx (union (not (any "g-r")) (not (any "t")))) + (should (equal (rx (or (not (any "g-r")) (not (any "t")))) "[^z-a]")) - (should (equal (rx (not (union (not (any "g-r")) (not (any "t"))))) + (should (equal (rx (not (or (not (any "g-r")) (not (any "t"))))) "\\`a\\`")) - (should (equal (rx (union (union (any "a-f") (any "u-z")) - (any "g-r"))) + (should (equal (rx (or (| (any "a-f") (any "u-z")) + (any "g-r"))) "[a-ru-z]")) - (should (equal (rx (union (intersection (any "c-z") (any "a-g")) - (not (any "a-k")))) + (should (equal (rx (or (intersection (any "c-z") (any "a-g")) + (not (any "a-k")))) "[^abh-k]"))) -(ert-deftest rx-def-in-union () +(ert-deftest rx-def-in-charset-or () (rx-let ((a (any "badc")) - (b (union a (any "def")))) - (should (equal(rx (union b (any "q"))) - "[a-fq]")))) + (b (| a (any "def")))) + (should (equal (rx (or b (any "q"))) + "[a-fq]"))) + (rx-let ((diff-| (a b) (not (or (not a) b)))) + (should (equal (rx (diff-| (any "a-z") (any "gr"))) + "[a-fh-qs-z]")))) (ert-deftest rx-intersection () (should (equal (rx (intersection)) @@ -321,15 +324,18 @@ (should (equal (rx (intersection (any "d-u") (intersection (any "e-z") (any "a-m")))) "[e-m]")) - (should (equal (rx (intersection (union (any "a-f") (any "f-t")) + (should (equal (rx (intersection (or (any "a-f") (any "f-t")) (any "e-w"))) "[e-t]"))) (ert-deftest rx-def-in-intersection () (rx-let ((a (any "a-g")) (b (intersection a (any "d-j")))) - (should (equal(rx (intersection b (any "e-k"))) - "[e-g]")))) + (should (equal (rx (intersection b (any "e-k"))) + "[e-g]"))) + (rx-let ((diff-& (a b) (intersection a (not b)))) + (should (equal (rx (diff-& (any "a-z") (any "m-p"))) + "[a-lq-z]")))) (ert-deftest rx-group () (should (equal (rx (group nonl) (submatch "x") -- 2.39.2