From ea93326cc046cb1beb7535cdf6d69b216b767685 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Fri, 6 Dec 2019 22:23:57 +0100 Subject: [PATCH] Add `union' and `intersection' to rx (bug#37849) These character set operations, together with `not' for set complement, improve the compositionality of rx, and reduce duplication in complicated cases. Named character classes are not permitted in set operations. * lisp/emacs-lisp/rx.el (rx--translate-any): Split into multiple functions. (rx--foldl, rx--parse-any, rx--generate-alt, rx--intervals-to-alt) (rx--complement-intervals, rx--intersect-intervals) (rx--union-intervals, rx--charset-intervals, rx--charset-union) (rx--charset-all, rx--charset-intersection, rx--translate-union) (rx--translate-intersection): New. (rx--translate-not, rx--translate-form, rx--builtin-forms, rx): Add `union' and `intersection'. * test/lisp/emacs-lisp/rx-tests.el (rx-union ,rx-def-in-union) (rx-intersection, rx-def-in-intersection): New tests. * doc/lispref/searching.texi (Rx Constructs): * etc/NEWS: Document `union' and `intersection'. --- doc/lispref/searching.texi | 14 +- etc/NEWS | 7 +- lisp/emacs-lisp/rx.el | 309 +++++++++++++++++++++---------- test/lisp/emacs-lisp/rx-tests.el | 57 ++++++ 4 files changed, 289 insertions(+), 98 deletions(-) diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 0cb30010c5e..5bf3c5b067f 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -1214,11 +1214,21 @@ Corresponding string regexp: @samp{[@dots{}]} @item @code{(not @var{charspec})} @cindex @code{not} in rx Match a character not included in @var{charspec}. @var{charspec} can -be an @code{any}, @code{not}, @code{syntax} or @code{category} form, or a -character class.@* +be an @code{any}, @code{not}, @code{union}, @code{intersection}, +@code{syntax} or @code{category} form, or a character class.@* Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}}, @samp{\C@var{code}} +@item @code{(union @var{charset}@dots{})} +@itemx @code{(intersection @var{charset}@dots{})} +@cindex @code{union} in rx +@cindex @code{intersection} in rx +Match a character that matches the union or intersection, +respectively, of the @var{charset}s. Each @var{charset} can be an +@code{any} form without character classes, or a @code{union}, +@code{intersection} or @code{not} form whose arguments are also +@var{charset}s. + @item @code{not-newline}, @code{nonl} @cindex @code{not-newline} in rx @cindex @code{nonl} in rx diff --git a/etc/NEWS b/etc/NEWS index 923890decf7..69b51b7f44e 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -2110,9 +2110,14 @@ at run time, instead of a constant string. These macros add new forms to the rx notation. +++ -*** 'anychar' is now an alias for 'anything' +*** 'anychar' is now an alias for 'anything'. Both match any single character; 'anychar' is more descriptive. ++++ +*** New 'union' and 'intersection' forms for character sets. +These permit composing character-matching expressions from simpler +parts. + ** Frames +++ diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index a92c613b9aa..d4b21c3c9ad 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -246,6 +246,14 @@ Return (REGEXP . PRECEDENCE)." (setq list (cdr list))) (null list)) +(defun rx--foldl (f x l) + "(F (F (F X L0) L1) L2) ... +Left-fold the list L, starting with X, by the binary function F." + (while l + (setq x (funcall f x (car l))) + (setq l (cdr l))) + x) + (defun rx--translate-or (body) "Translate an or-pattern of zero or more rx items. Return (REGEXP . PRECEDENCE)." @@ -343,22 +351,11 @@ INTERVALS is a list of (START . END) with START ≤ END, sorted by START." (setq tail d))) intervals)) -;; FIXME: Consider expanding definitions inside (any ...) and (not ...), -;; and perhaps allow (any ...) inside (any ...). -;; It would be benefit composability (build a character alternative by pieces) -;; and be handy for obtaining the complement of a defined set of -;; characters. (See, for example, python.el:421, `not-simple-operator'.) -;; (Expansion in other non-rx positions is probably not a good idea: -;; syntax, category, backref, and the integer parameters of group-n, -;; =, >=, **, repeat) -;; Similar effect could be attained by ensuring that -;; (or (any X) (any Y)) -> (any X Y), and find a way to compose negative -;; sets. `and' is taken, but we could add -;; (intersection (not (any X)) (not (any Y))) -> (not (any X Y)). - -(defun rx--translate-any (negated body) - "Translate an (any ...) construct. Return (REGEXP . PRECEDENCE). -If NEGATED, negate the sense." +(defun rx--parse-any (body) + "Parse arguments of an (any ...) construct. +Return (INTERVALS . CLASSES), where INTERVALS is a sorted list of +disjoint intervals (each a cons of chars), and CLASSES +a list of named character classes in the order they occur in BODY." (let ((classes nil) (strings nil) (conses nil)) @@ -380,81 +377,109 @@ If NEGATED, negate the sense." (or (memq class classes) (progn (push class classes) t)))))) (t (error "Invalid rx `any' argument: %s" arg)))) - (let ((items - ;; Translate strings and conses into nonoverlapping intervals, - ;; and add classes as symbols at the end. - (append - (rx--condense-intervals - (sort (append conses - (mapcan #'rx--string-to-intervals strings)) - #'car-less-than-car)) - (reverse classes)))) - - ;; Move lone ] and range ]-x to the start. - (let ((rbrac-l (assq ?\] items))) - (when rbrac-l - (setq items (cons rbrac-l (delq rbrac-l items))))) - - ;; Split x-] and move the lone ] to the start. - (let ((rbrac-r (rassq ?\] items))) - (when (and rbrac-r (not (eq (car rbrac-r) ?\]))) - (setcdr rbrac-r ?\\) - (setq items (cons '(?\] . ?\]) items)))) - - ;; Split ,-- (which would end up as ,- otherwise). - (let ((dash-r (rassq ?- items))) - (when (eq (car dash-r) ?,) - (setcdr dash-r ?,) - (setq items (nconc items '((?- . ?-)))))) - - ;; Remove - (lone or at start of interval) - (let ((dash-l (assq ?- items))) - (when dash-l - (if (eq (cdr dash-l) ?-) - (setq items (delq dash-l items)) ; Remove lone - - (setcar dash-l ?.)) ; Reduce --x to .-x - (setq items (nconc items '((?- . ?-)))))) - - ;; Deal with leading ^ and range ^-x. - (when (and (consp (car items)) - (eq (caar items) ?^) - (cdr items)) - ;; Move ^ and ^-x to second place. - (setq items (cons (cadr items) - (cons (car items) (cddr items))))) + (cons (rx--condense-intervals + (sort (append conses + (mapcan #'rx--string-to-intervals strings)) + #'car-less-than-car)) + (reverse classes)))) + +(defun rx--generate-alt (negated intervals classes) + "Generate a character alternative. Return (REGEXP . PRECEDENCE). +If NEGATED is non-nil, negate the result; INTERVALS is a sorted +list of disjoint intervals and CLASSES a list of named character +classes." + (let ((items (append intervals classes))) + ;; Move lone ] and range ]-x to the start. + (let ((rbrac-l (assq ?\] items))) + (when rbrac-l + (setq items (cons rbrac-l (delq rbrac-l items))))) + + ;; Split x-] and move the lone ] to the start. + (let ((rbrac-r (rassq ?\] items))) + (when (and rbrac-r (not (eq (car rbrac-r) ?\]))) + (setcdr rbrac-r ?\\) + (setq items (cons '(?\] . ?\]) items)))) + + ;; Split ,-- (which would end up as ,- otherwise). + (let ((dash-r (rassq ?- items))) + (when (eq (car dash-r) ?,) + (setcdr dash-r ?,) + (setq items (nconc items '((?- . ?-)))))) + + ;; Remove - (lone or at start of interval) + (let ((dash-l (assq ?- items))) + (when dash-l + (if (eq (cdr dash-l) ?-) + (setq items (delq dash-l items)) ; Remove lone - + (setcar dash-l ?.)) ; Reduce --x to .-x + (setq items (nconc items '((?- . ?-)))))) + + ;; Deal with leading ^ and range ^-x. + (when (and (consp (car items)) + (eq (caar items) ?^) + (cdr items)) + ;; Move ^ and ^-x to second place. + (setq items (cons (cadr items) + (cons (car items) (cddr items))))) - (cond - ;; Empty set: if negated, any char, otherwise match-nothing. - ((null items) - (if negated - (rx--translate-symbol 'anything) - (rx--empty))) - ;; Single non-negated character. - ((and (null (cdr items)) - (consp (car items)) - (eq (caar items) (cdar items)) - (not negated)) - (cons (list (regexp-quote (char-to-string (caar items)))) - t)) - ;; At least one character or class, possibly negated. - (t - (cons - (list - (concat - "[" - (and negated "^") - (mapconcat (lambda (item) - (cond ((symbolp item) - (format "[:%s:]" item)) - ((eq (car item) (cdr item)) - (char-to-string (car item))) - ((eq (1+ (car item)) (cdr item)) - (string (car item) (cdr item))) - (t - (string (car item) ?- (cdr item))))) - items nil) - "]")) - t)))))) + (cond + ;; Empty set: if negated, any char, otherwise match-nothing. + ((null items) + (if negated + (rx--translate-symbol 'anything) + (rx--empty))) + ;; Single non-negated character. + ((and (null (cdr items)) + (consp (car items)) + (eq (caar items) (cdar items)) + (not negated)) + (cons (list (regexp-quote (char-to-string (caar items)))) + t)) + ;; At least one character or class, possibly negated. + (t + (cons + (list + (concat + "[" + (and negated "^") + (mapconcat (lambda (item) + (cond ((symbolp item) + (format "[:%s:]" item)) + ((eq (car item) (cdr item)) + (char-to-string (car item))) + ((eq (1+ (car item)) (cdr item)) + (string (car item) (cdr item))) + (t + (string (car item) ?- (cdr item))))) + items nil) + "]")) + t))))) + +(defun rx--translate-any (negated body) + "Translate an (any ...) construct. Return (REGEXP . PRECEDENCE). +If NEGATED, negate the sense." + (let ((parsed (rx--parse-any body))) + (rx--generate-alt negated (car parsed) (cdr parsed)))) + +(defun rx--intervals-to-alt (negated intervals) + "Generate a character alternative from an interval set. +Return (REGEXP . PRECEDENCE). +INTERVALS is a sorted list of disjoint intervals. +If NEGATED, negate the sense." + ;; Detect whether the interval set is better described in + ;; complemented form. This is not just a matter of aesthetics: any + ;; range from ASCII to raw bytes will automatically exclude the + ;; entire non-ASCII Unicode range by the regexp engine. + (if (rx--every (lambda (iv) (not (<= (car iv) #x3ffeff (cdr iv)))) + intervals) + (rx--generate-alt negated intervals nil) + (rx--generate-alt + (not negated) (rx--complement-intervals intervals) nil))) + +;; FIXME: Consider turning `not' into a variadic operator, following SRE: +;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and +;; (not) = anychar. +;; Maybe allow singleton characters as arguments. (defun rx--translate-not (negated body) "Translate a (not ...) construct. Return (REGEXP . PRECEDENCE). @@ -472,10 +497,14 @@ If NEGATED, negate the sense (thus making it positive)." ('category (rx--translate-category (not negated) (cdr arg))) ('not - (rx--translate-not (not negated) (cdr arg)))))) + (rx--translate-not (not negated) (cdr arg))) + ('union + (rx--translate-union (not negated) (cdr arg))) + ('intersection + (rx--translate-intersection (not negated) (cdr arg)))))) ((let ((class (cdr (assq arg rx--char-classes)))) (and class - (rx--translate-any (not negated) (list class))))) + (rx--generate-alt (not negated) nil (list class))))) ((eq arg 'word-boundary) (rx--translate-symbol (if negated 'word-boundary 'not-word-boundary))) @@ -484,6 +513,91 @@ If NEGATED, negate the sense (thus making it positive)." (rx--translate-not negated (list expanded))))) (t (error "Illegal argument to rx `not': %S" arg))))) +(defun rx--complement-intervals (intervals) + "Complement of the interval list INTERVALS." + (let ((compl nil) + (c 0)) + (dolist (iv intervals) + (when (< c (car iv)) + (push (cons c (1- (car iv))) compl)) + (setq c (1+ (cdr iv)))) + (when (< c (max-char)) + (push (cons c (max-char)) compl)) + (nreverse compl))) + +(defun rx--intersect-intervals (ivs-a ivs-b) + "Intersection of the interval lists IVS-A and IVS-B." + (let ((isect nil)) + (while (and ivs-a ivs-b) + (let ((a (car ivs-a)) + (b (car ivs-b))) + (cond + ((< (cdr a) (car b)) (setq ivs-a (cdr ivs-a))) + ((> (car a) (cdr b)) (setq ivs-b (cdr ivs-b))) + (t + (push (cons (max (car a) (car b)) + (min (cdr a) (cdr b))) + isect) + (setq ivs-a (cdr ivs-a)) + (setq ivs-b (cdr ivs-b)) + (cond ((< (cdr a) (cdr b)) + (push (cons (1+ (cdr a)) (cdr b)) + ivs-b)) + ((> (cdr a) (cdr b)) + (push (cons (1+ (cdr b)) (cdr a)) + ivs-a))))))) + (nreverse isect))) + +(defun rx--union-intervals (ivs-a ivs-b) + "Union of the interval lists IVS-A and IVS-B." + (rx--complement-intervals + (rx--intersect-intervals + (rx--complement-intervals ivs-a) + (rx--complement-intervals ivs-b)))) + +(defun rx--charset-intervals (charset) + "Return a sorted list of non-adjacent disjoint intervals from CHARSET. +CHARSET is any expression allowed in a character set expression: +either `any' (no classes permitted), or `not', `union' or `intersection' +forms whose arguments are charsets." + (pcase charset + (`(,(or 'any 'in 'char) . ,body) + (let ((parsed (rx--parse-any body))) + (when (cdr parsed) + (error + "Character class not permitted in set operations: %S" + (cadr parsed))) + (car parsed))) + (`(not ,x) (rx--complement-intervals (rx--charset-intervals x))) + (`(union . ,xs) (rx--charset-union xs)) + (`(intersection . ,xs) (rx--charset-intersection xs)) + (_ (let ((expanded (rx--expand-def charset))) + (if expanded + (rx--charset-intervals expanded) + (error "Bad character set: %S" charset)))))) + +(defun rx--charset-union (charsets) + "Union of CHARSETS, as a set of intervals." + (rx--foldl #'rx--union-intervals nil + (mapcar #'rx--charset-intervals charsets))) + +(defconst rx--charset-all (list (cons 0 (max-char)))) + +(defun rx--charset-intersection (charsets) + "Intersection of CHARSETS, as a set of intervals." + (rx--foldl #'rx--intersect-intervals rx--charset-all + (mapcar #'rx--charset-intervals charsets))) + +(defun rx--translate-union (negated body) + "Translate a (union ...) construct. Return (REGEXP . PRECEDENCE). +If NEGATED, negate the sense." + (rx--intervals-to-alt negated (rx--charset-union body))) + +(defun rx--translate-intersection (negated body) + "Translate an (intersection ...) construct. Return (REGEXP . PRECEDENCE). +If NEGATED, negate the sense." + (rx--intervals-to-alt negated (rx--charset-intersection body))) + (defun rx--atomic-regexp (item) "ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t." (if (eq (cdr item) t) @@ -862,6 +976,8 @@ can expand to any number of values." ((or 'any 'in 'char) (rx--translate-any nil body)) ('not-char (rx--translate-any t body)) ('not (rx--translate-not nil body)) + ('union (rx--translate-union nil body)) + ('intersection (rx--translate-intersection nil body)) ('repeat (rx--translate-repeat body)) ('= (rx--translate-= body)) @@ -920,7 +1036,7 @@ can expand to any number of values." (t (error "Unknown rx form `%s'" op))))))) (defconst rx--builtin-forms - '(seq sequence : and or | any in char not-char not + '(seq sequence : and or | any in char not-char not union intersection repeat = >= ** zero-or-more 0+ * one-or-more 1+ + @@ -1033,8 +1149,11 @@ CHAR Match a literal character. character, a string, a range as string \"A-Z\" or cons (?A . ?Z), or a character class (see below). Alias: in, char. (not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC - can be (any ...), (syntax ...), (category ...), - or a character class. + can be (any ...), (union ...), (intersection ...), + (syntax ...), (category ...), or a character class. +(union CHARSET...) Union of CHARSETs. +(intersection CHARSET...) Intersection of CHARSETs. + CHARSET is (any...), (not...), (union...) or (intersection...). not-newline Match any character except a newline. Alias: nonl. anychar Match any character. Alias: anything. unmatchable Never match anything at all. diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 317dae2990b..0cd2c9590b7 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -274,6 +274,63 @@ (should (equal (rx (not (not ascii)) (not (not (not (any "a-z"))))) "[[:ascii:]][^a-z]"))) +(ert-deftest rx-union () + (should (equal (rx (union)) + "\\`a\\`")) + (should (equal (rx (union (any "ba"))) + "[ab]")) + (should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z"))) + "[a-krx-z]")) + (should (equal (rx (union (not (any "a-m")) (not (any "f-p")))) + "[^f-m]")) + (should (equal (rx (union (any "e-m") (not (any "a-z")))) + "[^a-dn-z]")) + (should (equal (rx (union (not (any "g-r")) (not (any "t")))) + "[^z-a]")) + (should (equal (rx (not (union (not (any "g-r")) (not (any "t"))))) + "\\`a\\`")) + (should (equal (rx (union (union (any "a-f") (any "u-z")) + (any "g-r"))) + "[a-ru-z]")) + (should (equal (rx (union (intersection (any "c-z") (any "a-g")) + (not (any "a-k")))) + "[^abh-k]"))) + +(ert-deftest rx-def-in-union () + (rx-let ((a (any "badc")) + (b (union a (any "def")))) + (should (equal(rx (union b (any "q"))) + "[a-fq]")))) + +(ert-deftest rx-intersection () + (should (equal (rx (intersection)) + "[^z-a]")) + (should (equal (rx (intersection (any "ba"))) + "[ab]")) + (should (equal (rx (intersection (any "a-j" "u-z") (any "c-k" ?y) + (any "a-i" "x-z"))) + "[c-iy]")) + (should (equal (rx (intersection (not (any "a-m")) (not (any "f-p")))) + "[^a-p]")) + (should (equal (rx (intersection (any "a-z") (not (any "g-q")))) + "[a-fr-z]")) + (should (equal (rx (intersection (any "a-d") (any "e"))) + "\\`a\\`")) + (should (equal (rx (not (intersection (any "a-d") (any "e")))) + "[^z-a]")) + (should (equal (rx (intersection (any "d-u") + (intersection (any "e-z") (any "a-m")))) + "[e-m]")) + (should (equal (rx (intersection (union (any "a-f") (any "f-t")) + (any "e-w"))) + "[e-t]"))) + +(ert-deftest rx-def-in-intersection () + (rx-let ((a (any "a-g")) + (b (intersection a (any "d-j")))) + (should (equal(rx (intersection b (any "e-k"))) + "[e-g]")))) + (ert-deftest rx-group () (should (equal (rx (group nonl) (submatch "x") (group-n 3 "y") (submatch-n 13 "z") (backref 1)) -- 2.39.2