From 478bbf7c80e71ff84f0e4e1363bf86e93d9c51c3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Fri, 15 Feb 2019 19:27:48 +0100 Subject: [PATCH] Prevent over-eager rx character range condensation MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit `rx' incorrectly considers character ranges between ASCII and raw bytes to cover all codes in-between, which includes all non-ASCII Unicode chars. This causes (any "\000-\377" ?Å) to be simplified to (any "\000-\377"), which is not at all the same thing: [\000-\377] really means [\000-\177\200-\377] (Bug#34492). * lisp/emacs-lisp/rx.el (rx-any-condense-range): Split ranges going from ASCII to raw bytes. * test/lisp/emacs-lisp/rx-tests.el (rx-char-any-raw-byte): Add test case. * etc/NEWS: Mention the overall change (Bug#33205). --- etc/NEWS | 8 ++++++++ lisp/emacs-lisp/rx.el | 7 +++++++ test/lisp/emacs-lisp/rx-tests.el | 6 +++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/etc/NEWS b/etc/NEWS index 70a50c02c4e..0cafbaae96c 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -1101,6 +1101,14 @@ subexpression. When there is no menu for a mode, display the mode name after the indicator instead of just the indicator (which is sometimes cryptic). +** rx + +--- +*** rx now handles raw bytes in character alternatives correctly, +when given in a string. Previously, '(any "\x80-\xff")' would match +characters U+0080...U+00FF. Now the expression matches raw bytes in +the 128...255 range, as expected. + * New Modes and Packages in Emacs 27.1 diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index b2299030a1b..715cd608c46 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -429,6 +429,13 @@ Only both edges of each range is checked." ;; set L list of all ranges (mapc (lambda (e) (cond ((stringp e) (push e str)) ((numberp e) (push (cons e e) l)) + ;; Ranges between ASCII and raw bytes are split, + ;; to prevent accidental inclusion of Unicode + ;; characters later on. + ((and (<= (car e) #x7f) + (>= (cdr e) #x3fff80)) + (push (cons (car e) #x7f) l) + (push (cons #x3fff80 (cdr e)) l)) (t (push e l)))) args) ;; condense overlapped ranges in L diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index f15e1016f7c..e14feda347f 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -53,7 +53,11 @@ ;; Range of raw characters, multibyte. (should (equal (string-match-p (rx (any "Å\211\326-\377\177")) "XY\355\177\327") - 2))) + 2)) + ;; Split range; \177-\377ÿ should not be optimised to \177-\377. + (should (equal (string-match-p (rx (any "\177-\377" ?ÿ)) + "ÿA\310B") + 0))) (ert-deftest rx-pcase () (should (equal (pcase "a 1 2 3 1 1 b" -- 2.39.5