From 157e735ce89ede9cc939f4ed0f72c5af7ae60735 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Mon, 17 Jul 2023 13:05:21 +0200 Subject: [PATCH] Don't distort character ranges in rx translation MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The Emacs regexp engine interprets character ranges from ASCII to raw bytes, such as [a-\xfe], as not including non-ASCII Unicode at all; ranges from non-ACII Unicode to raw bytes, such as [ü-\x91], are ignored entirely. To make rx produce a translation that works as intended, split ranges that that go from ordinary characters to raw bytes. Such ranges may appear from set manipulation and regexp optimisation. * lisp/emacs-lisp/rx.el (rx--generate-alt): Split intervals that straddle the char-raw boundary when rendering a string regexp from an interval set. * test/lisp/emacs-lisp/rx-tests.el (rx-char-any-raw-byte): Add test cases. --- lisp/emacs-lisp/rx.el | 6 ++++++ test/lisp/emacs-lisp/rx-tests.el | 12 +++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index e82490ffee5..f1eb3e308a2 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -484,6 +484,12 @@ classes." (char-to-string (car item))) ((eq (1+ (car item)) (cdr item)) (string (car item) (cdr item))) + ;; Ranges that go between normal chars and raw bytes + ;; must be split to avoid being mutilated + ;; by Emacs's regexp parser. + ((<= (car item) #x3fff7f (cdr item)) + (string (car item) ?- #x3fff7f + #x3fff80 ?- (cdr item))) (t (string (car item) ?- (cdr item))))) items nil) diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 028250b7352..995d297ff08 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -98,7 +98,17 @@ "[\177Å\211\326-\377]")) ;; Split range; \177-\377ÿ should not be optimized to \177-\377. (should (equal (rx (any "\177-\377" ?ÿ)) - "[\177ÿ\200-\377]"))) + "[\177ÿ\200-\377]")) + ;; Range between normal chars and raw bytes: must be split to be parsed + ;; correctly by the Emacs regexp engine. + (should (equal + (rx (any (0 . #x3fffff)) (any (?G . #x3fff9a)) (any (?Ü . #x3ffff2))) + "[\0-\x3fff7f\x80-\xff][G-\x3fff7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]")) + ;; As above but with ranges in string form. For historical reasons, + ;; we special-case ASCII-to-raw ranges to exclude non-ASCII unicode. + (should (equal + (rx (any "\x00-\xff") (any "G-\x9a") (any "Ü-\xf2")) + "[\0-\x7f\x80-\xff][G-\x7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]"))) (ert-deftest rx-any () (should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS")) -- 2.39.5