From d167888c5b7740af3300ee363c5121519dada0a2 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Sun, 30 Jul 2023 21:53:06 +0200 Subject: [PATCH] rx performance improvements * lisp/emacs-lisp/rx.el (rx--generate-alt): Treat the intervals and classes lists separately without joining, to reduce allocation. Handle special cases first. (rx--union-intervals): Implement directly instead of using intersection and complement. * test/lisp/emacs-lisp/rx-tests.el (rx-any): Adapt test, as some character alternatives are now slightly different. (rx--complement-intervals, rx--union-intervals) (rx--intersect-intervals): New unit tests. --- lisp/emacs-lisp/rx.el | 176 ++++++++++++++++++------------- test/lisp/emacs-lisp/rx-tests.el | 53 +++++++++- 2 files changed, 154 insertions(+), 75 deletions(-) diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 19c82d9b23d..5fad84964cc 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -419,86 +419,96 @@ a list of named character classes in the order they occur in BODY." If NEGATED is non-nil, negate the result; INTERVALS is a sorted list of disjoint intervals and CLASSES a list of named character classes." - (let ((items (append intervals classes))) - ;; Move lone ] and range ]-x to the start. - (let ((rbrac-l (assq ?\] items))) - (when rbrac-l - (setq items (cons rbrac-l (delq rbrac-l items))))) - - ;; Split x-] and move the lone ] to the start. - (let ((rbrac-r (rassq ?\] items))) - (when (and rbrac-r (not (eq (car rbrac-r) ?\]))) - (setcdr rbrac-r ?\\) - (setq items (cons '(?\] . ?\]) items)))) - - ;; Split ,-- (which would end up as ,- otherwise). - (let ((dash-r (rassq ?- items))) - (when (eq (car dash-r) ?,) - (setcdr dash-r ?,) - (setq items (nconc items '((?- . ?-)))))) - - ;; Remove - (lone or at start of interval) - (let ((dash-l (assq ?- items))) - (when dash-l - (if (eq (cdr dash-l) ?-) - (setq items (delq dash-l items)) ; Remove lone - - (setcar dash-l ?.)) ; Reduce --x to .-x - (setq items (nconc items '((?- . ?-)))))) - - ;; Deal with leading ^ and range ^-x in non-negated set. - (when (and (eq (car-safe (car items)) ?^) - (not negated)) - (if (eq (cdar items) ?^) - ;; single leading ^ - (when (cdr items) - ;; Move the ^ to second place. - (setq items (cons (cadr items) - (cons (car items) (cddr items))))) - ;; Split ^-x to _-x^ - (setq items (cons (cons ?_ (cdar items)) - (cons '(?^ . ?^) - (cdr items)))))) - - (cond - ;; Empty set: if negated, any char, otherwise match-nothing. - ((null items) + ;; No, this is not pretty code. You try doing it in a way that is both + ;; elegant and efficient. Or just one of the two. I dare you. + (cond + ;; Single character. + ((and intervals (eq (caar intervals) (cdar intervals)) + (null (cdr intervals)) + (null classes)) + (let ((ch (caar intervals))) (if negated - (rx--translate-symbol 'anything) - (rx--empty))) - ;; Single non-negated character. - ((and (null (cdr items)) - (consp (car items)) - (eq (caar items) (cdar items)) - (not negated)) - (cons (list (regexp-quote (char-to-string (caar items)))) - t)) - ;; Negated newline. - ((and (equal items '((?\n . ?\n))) - negated) - (rx--translate-symbol 'nonl)) - ;; At least one character or class, possibly negated. - (t + (if (eq ch ?\n) + ;; Single negated newline. + (rx--translate-symbol 'nonl) + ;; Single negated character (other than newline). + (cons (list (string ?\[ ?^ ch ?\])) t)) + ;; Single literal character. + (cons (list (regexp-quote (char-to-string ch))) t)))) + + ;; Empty set (or any char). + ((and (null intervals) (null classes)) + (if negated + (rx--translate-symbol 'anything) + (rx--empty))) + + ;; More than one character, or at least one class. + (t + (let ((dash nil) (caret nil)) + ;; Move ] and range ]-x to the start. + (let ((rbrac-l (assq ?\] intervals))) + (when rbrac-l + (setq intervals (cons rbrac-l (remq rbrac-l intervals))))) + + ;; Split x-] and move the lone ] to the start. + (let ((rbrac-r (rassq ?\] intervals))) + (when (and rbrac-r (not (eq (car rbrac-r) ?\]))) + (setcdr rbrac-r ?\\) + (setq intervals (cons '(?\] . ?\]) intervals)))) + + ;; Split ,-- (which would end up as ,- otherwise). + (let ((dash-r (rassq ?- intervals))) + (when (eq (car dash-r) ?,) + (setcdr dash-r ?,) + (setq dash "-"))) + + ;; Remove - (lone or at start of interval) + (let ((dash-l (assq ?- intervals))) + (when dash-l + (if (eq (cdr dash-l) ?-) + (setq intervals (remq dash-l intervals)) ; Remove lone - + (setcar dash-l ?.)) ; Reduce --x to .-x + (setq dash "-"))) + + ;; Deal with leading ^ and range ^-x in non-negated set. + (when (and (eq (caar intervals) ?^) + (not negated)) + (if (eq (cdar intervals) ?^) + ;; single leading ^ + (if (or (cdr intervals) classes) + ;; something else to put before the ^ + (progn + (setq intervals (cdr intervals)) ; remove lone ^ + (setq caret "^")) ; put ^ (almost) last + ;; nothing else but a lone - + (setq intervals (cons '(?- . ?-) intervals)) ; move - first + (setq dash nil)) + ;; split ^-x to _-x^ + (setq intervals `((?_ . ,(cdar intervals)) (?^ . ?^) + . ,(cdr intervals))))) + (cons (list (concat "[" (and negated "^") - (mapconcat (lambda (item) - (cond ((symbolp item) - (format "[:%s:]" item)) - ((eq (car item) (cdr item)) - (char-to-string (car item))) - ((eq (1+ (car item)) (cdr item)) - (string (car item) (cdr item))) + (mapconcat (lambda (iv) + (cond ((eq (car iv) (cdr iv)) + (char-to-string (car iv))) + ((eq (1+ (car iv)) (cdr iv)) + (string (car iv) (cdr iv))) ;; Ranges that go between normal chars and raw bytes ;; must be split to avoid being mutilated ;; by Emacs's regexp parser. - ((<= (car item) #x3fff7f (cdr item)) - (string (car item) ?- #x3fff7f - #x3fff80 ?- (cdr item))) + ((<= (car iv) #x3fff7f (cdr iv)) + (string (car iv) ?- #x3fff7f + #x3fff80 ?- (cdr iv))) (t - (string (car item) ?- (cdr item))))) - items nil) + (string (car iv) ?- (cdr iv))))) + intervals) + (mapconcat (lambda (cls) (format "[:%s:]" cls)) classes) + caret ; ^ or nothing + dash ; - or nothing "]")) t))))) @@ -602,10 +612,28 @@ If NEGATED, negate the sense (thus making it positive)." (defun rx--union-intervals (ivs-a ivs-b) "Union of the interval lists IVS-A and IVS-B." - (rx--complement-intervals - (rx--intersect-intervals - (rx--complement-intervals ivs-a) - (rx--complement-intervals ivs-b)))) + (let ((union nil)) + (while (and ivs-a ivs-b) + (let ((a (car ivs-a)) + (b (car ivs-b))) + (cond + ((< (1+ (cdr a)) (car b)) ; a before b, not adacent + (push a union) + (setq ivs-a (cdr ivs-a))) + ((< (1+ (cdr b)) (car a)) ; b before a, not adacent + (push b union) + (setq ivs-b (cdr ivs-b))) + (t ; a and b adjacent or overlap + (setq ivs-a (cdr ivs-a)) + (setq ivs-b (cdr ivs-b)) + (if (< (cdr a) (cdr b)) + (push (cons (min (car a) (car b)) + (cdr b)) + ivs-b) + (push (cons (min (car a) (car b)) + (cdr a)) + ivs-a)))))) + (nconc (nreverse union) (or ivs-a ivs-b)))) (defun rx--charset-intervals (charset) "Return a sorted list of non-adjacent disjoint intervals from CHARSET. diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 4928d5adf9d..7d7e0068eed 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -148,7 +148,7 @@ (should (equal (rx (any "-]^" ascii) (not (any "-]^" ascii))) "[]^[:ascii:]-][^]^[:ascii:]-]")) (should (equal (rx (any "^" lower upper) (not (any "^" lower upper))) - "[[:lower:]^[:upper:]][^^[:lower:][:upper:]]")) + "[[:lower:][:upper:]^][^^[:lower:][:upper:]]")) (should (equal (rx (any "-" lower upper) (not (any "-" lower upper))) "[[:lower:][:upper:]-][^[:lower:][:upper:]-]")) (should (equal (rx (any "]" lower upper) (not (any "]" lower upper))) @@ -610,6 +610,57 @@ (rx-submatch-n '(group-n 3 (+ nonl) eol))) "\\(?3:.+$\\)"))) +;;; unit tests for internal functions + +(ert-deftest rx--complement-intervals () + (should (equal (rx--complement-intervals '()) + '((0 . #x3fffff)))) + (should (equal (rx--complement-intervals '((10 . 20) (30 . 40))) + '((0 . 9) (21 . 29) (41 . #x3fffff)))) + (should (equal (rx--complement-intervals '((0 . #x3fffff))) + '())) + (should (equal (rx--complement-intervals + '((0 . 10) (20 . 20) (30 . #x3fffff))) + '((11 . 19) (21 . 29))))) + +(ert-deftest rx--union-intervals () + (should (equal (rx--union-intervals '() '()) '())) + (should (equal (rx--union-intervals '() '((10 . 20) (30 . 40))) + '((10 . 20) (30 . 40)))) + (should (equal (rx--union-intervals '((10 . 20) (30 . 40)) '()) + '((10 . 20) (30 . 40)))) + (should (equal (rx--union-intervals '((5 . 15) (18 . 24) (32 . 40)) + '((10 . 20) (30 . 40) (50 . 60))) + '((5 . 24) (30 . 40) (50 . 60)))) + (should (equal (rx--union-intervals '((10 . 20) (30 . 40) (50 . 60)) + '((0 . 9) (21 . 29) (41 . 50))) + '((0 . 60)))) + (should (equal (rx--union-intervals '((10 . 20) (30 . 40)) + '((12 . 18) (28 . 42))) + '((10 . 20) (28 . 42)))) + (should (equal (rx--union-intervals '((10 . 20) (30 . 40)) + '((0 . #x3fffff))) + '((0 . #x3fffff))))) + +(ert-deftest rx--intersect-intervals () + (should (equal (rx--intersect-intervals '() '()) '())) + (should (equal (rx--intersect-intervals '() '((10 . 20) (30 . 40))) + '())) + (should (equal (rx--intersect-intervals '((10 . 20) (30 . 40)) '()) + '())) + (should (equal (rx--intersect-intervals '((5 . 15) (18 . 24) (32 . 40)) + '((10 . 20) (30 . 40) (50 . 60))) + '((10 . 15) (18 . 20) (32 . 40)))) + (should (equal (rx--intersect-intervals '((10 . 20) (30 . 40) (50 . 60)) + '((0 . 9) (21 . 29) (41 . 50))) + '((50 . 50)))) + (should (equal (rx--intersect-intervals '((10 . 20) (30 . 40)) + '((12 . 18) (28 . 42))) + '((12 . 18) (30 . 40)))) + (should (equal (rx--intersect-intervals '((10 . 20) (30 . 40)) + '((0 . #x3fffff))) + '((10 . 20) (30 . 40))))) + (provide 'rx-tests) ;;; rx-tests.el ends here -- 2.39.2