From ece5ace4a52eda26d9fe9563206781944aed16d0 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Mattias=20Engdeg=C3=A5rd?= Date: Mon, 24 Jul 2023 09:44:01 +0200 Subject: [PATCH] rx: better not-wordchar and (syntax word) translation * lisp/emacs-lisp/rx.el: Add tables of legacy syntax. (rx--translate-symbol): Translate the legacy construct `not-wordchar` as (not wordchar), which is more intuitively obvious. * lisp/emacs-lisp/rx.el (rx--translate-syntax): Generate the shorter \w and \W instead of \sw and \Sw. * test/lisp/emacs-lisp/rx-tests.el (rx-atoms, rx-syntax, rx-not): Adapt tests. --- lisp/emacs-lisp/rx.el | 42 ++++++++++++++++++++++++++++++-- test/lisp/emacs-lisp/rx-tests.el | 9 ++++--- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 5fad84964cc..d46d0ca5a98 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -35,8 +35,43 @@ ;; Olin Shivers's SRE, with concessions to Emacs regexp peculiarities, ;; and the older Emacs package Sregex. +;;; Legacy syntax still accepted by rx: +;; +;; These are constructs from earlier rx and sregex implementations +;; that were mistakes, accidents or just not very good ideas in hindsight. + +;; Obsolete: accepted but not documented +;; +;; Obsolete Preferred +;; -------------------------------------------------------- +;; (not word-boundary) not-word-boundary +;; (not-syntax X) (not (syntax X)) +;; not-wordchar (not wordchar) +;; (not-char ...) (not (any ...)) +;; any nonl, not-newline +;; (repeat N FORM) (= N FORM) +;; (syntax CHARACTER) (syntax NAME) +;; (syntax CHAR-SYM) [1] (syntax NAME) +;; (category chinse-two-byte) (category chinese-two-byte) +;; unibyte ascii +;; multibyte nonascii +;; -------------------------------------------------------- +;; [1] where CHAR-SYM is a symbol with single-character name + +;; Obsolescent: accepted and documented but discouraged +;; +;; Obsolescent Preferred +;; -------------------------------------------------------- +;; (and ...) (seq ...), (: ...), (sequence ...) +;; anything anychar +;; minimal-match, maximal-match lazy ops: ??, *?, +? + +;; FIXME: Prepare a phase-out by emitting compile-time warnings about +;; at least some of the legacy constructs above. + ;;; Code: + ;; The `rx--translate...' functions below return (REGEXP . PRECEDENCE), ;; where REGEXP is a list of string expressions that will be ;; concatenated into a regexp, and PRECEDENCE is one of @@ -167,7 +202,7 @@ Each entry is: ('not-word-boundary (cons (list "\\B") t)) ('symbol-start (cons (list "\\_<") t)) ('symbol-end (cons (list "\\_>") t)) - ('not-wordchar (cons (list "\\W") t)) + ('not-wordchar (rx--translate '(not wordchar))) (_ (cond ((let ((class (cdr (assq sym rx--char-classes)))) @@ -817,7 +852,10 @@ Return (REGEXP . PRECEDENCE)." (setq syntax char))))))) (unless syntax (error "Unknown rx syntax name `%s'" sym))) - (cons (list (string ?\\ (if negated ?S ?s) syntax)) + ;; Produce \w and \W instead of \sw and \Sw, for smaller size. + (cons (list (if (eq syntax ?w) + (string ?\\ (if negated ?W ?w)) + (string ?\\ (if negated ?S ?s) syntax))) t))) (defconst rx--categories diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el index 7d7e0068eed..ae83f28d9c4 100644 --- a/test/lisp/emacs-lisp/rx-tests.el +++ b/test/lisp/emacs-lisp/rx-tests.el @@ -284,7 +284,7 @@ "^\\`\\'\\`\\'\\`\\'\\`\\'$")) (should (equal (rx point word-start word-end bow eow symbol-start symbol-end word-boundary not-word-boundary not-wordchar) - "\\=\\<\\>\\<\\>\\_<\\_>\\b\\B\\W")) + "\\=\\<\\>\\<\\>\\_<\\_>\\b\\B[^[:word:]]")) (should (equal (rx digit numeric num control cntrl) "[[:digit:]][[:digit:]][[:digit:]][[:cntrl:]][[:cntrl:]]")) (should (equal (rx hex-digit hex xdigit blank) @@ -306,7 +306,7 @@ (should (equal (rx (syntax whitespace) (syntax punctuation) (syntax word) (syntax symbol) (syntax open-parenthesis) (syntax close-parenthesis)) - "\\s-\\s.\\sw\\s_\\s(\\s)")) + "\\s-\\s.\\w\\s_\\s(\\s)")) (should (equal (rx (syntax string-quote) (syntax paired-delimiter) (syntax escape) (syntax character-quote) (syntax comment-start) (syntax comment-end) @@ -354,8 +354,9 @@ "\\B")) (should (equal (rx (not ascii) (not lower-case) (not wordchar)) "[^[:ascii:]][^[:lower:]][^[:word:]]")) - (should (equal (rx (not (syntax punctuation)) (not (syntax escape))) - "\\S.\\S\\")) + (should (equal (rx (not (syntax punctuation)) (not (syntax escape)) + (not (syntax word))) + "\\S.\\S\\\\W")) (should (equal (rx (not (category tone-mark)) (not (category lao))) "\\C4\\Co")) (should (equal (rx (not (not ascii)) (not (not (not (any "a-z"))))) -- 2.39.2