-;;; rx.el --- sexp notation for regular expressions -*- lexical-binding: t -*-
+;;; rx.el --- S-exp notation for regexps --*- lexical-binding: t -*-
;; Copyright (C) 2001-2019 Free Software Foundation, Inc.
-;; Author: Gerd Moellmann <gerd@gnu.org>
-;; Maintainer: emacs-devel@gnu.org
-;; Keywords: strings, regexps, extensions
-
;; This file is part of GNU Emacs.
;; GNU Emacs is free software: you can redistribute it and/or modify
;;; Commentary:
-;; This is another implementation of sexp-form regular expressions.
-;; It was unfortunately written without being aware of the Sregex
-;; package coming with Emacs, but as things stand, Rx completely
-;; covers all regexp features, which Sregex doesn't, doesn't suffer
-;; from the bugs mentioned in the commentary section of Sregex, and
-;; uses a nicer syntax (IMHO, of course :-).
-
-;; This significantly extended version of the original, is almost
-;; compatible with Sregex. The only incompatibility I (fx) know of is
-;; that the `repeat' form can't have multiple regexp args.
-
-;; Now alternative forms are provided for a degree of compatibility
-;; with Olin Shivers' attempted definitive SRE notation. SRE forms
-;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
-;; ,<exp>, (word ...), word+, posix-string, and character class forms.
-;; Some forms are inconsistent with SRE, either for historical reasons
-;; or because of the implementation -- simple translation into Emacs
-;; regexp strings. These include: any, word. Also, case-sensitivity
-;; and greediness are controlled by variables external to the regexp,
-;; and you need to feed the forms to the `posix-' functions to get
-;; SRE's POSIX semantics. There are probably more difficulties.
-
-;; Rx translates a sexp notation for regular expressions into the
-;; usual string notation. The translation can be done at compile-time
-;; by using the `rx' macro. The `regexp' and `literal' forms accept
-;; non-constant expressions, in which case `rx' will translate to a
-;; `concat' expression. Translation can be done fully at run time by
-;; calling function `rx-to-string'. See the documentation of `rx' for
-;; a complete description of the sexp notation.
-;;
-;; Some examples of string regexps and their sexp counterparts:
-;;
-;; "^[a-z]*"
-;; (rx line-start (0+ (in "a-z")))
-;;
-;; "\n[^ \t]"
-;; (rx ?\n (not (in " \t")))
-;;
-;; "\\*\\*\\* EOOH \\*\\*\\*\n"
-;; (rx "*** EOOH ***\n")
-;;
-;; "\\<\\(catch\\|finally\\)\\>[^_]"
-;; (rx word-start (submatch (or "catch" "finally")) word-end
-;; (not (in ?_)))
-;;
-;; "[ \t\n]*:\\($\\|[^:]+\\)"
-;; (rx (* (in " \t\n")) ":"
-;; (submatch (or line-end (+ (not (in ?:))))))
-;;
-;; "^content-transfer-encoding:\\(?:\n?[\t ]\\)*quoted-printable\\(?:\n?[\t ]\\)*"
-;; (rx line-start
-;; "content-transfer-encoding:"
-;; (* (? ?\n) (in " \t"))
-;; "quoted-printable"
-;; (* (? ?\n) (in " \t")))
-;;
-;; (concat "^\\(?:" something-else "\\)")
-;; (rx line-start (regexp something-else))
-;;
-;; (regexp-opt '(STRING1 STRING2 ...))
-;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
-;; calls `regexp-opt' as needed.
-;;
-;; "^;;\\s-*\n\\|^\n"
-;; (rx (or (seq line-start ";;" (0+ space) ?\n)
-;; (seq line-start ?\n)))
+;; This facility allows writing regexps in a sexp-based language
+;; instead of strings. Regexps in the `rx' notation are easier to
+;; read, write and maintain; they can be indented and commented in a
+;; natural way, and are easily composed by program code.
+;; The translation to string regexp is done by a macro and does not
+;; incur any extra processing during run time. Example:
;;
-;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
-;; (rx "$Id: "
-;; (1+ (not (in " ")))
-;; " "
-;; (submatch (1+ (not (in " "))))
-;; " ")
+;; (rx bos (or (not (any "^"))
+;; (seq "^" (or " *" "["))))
;;
-;; "\\\\\\\\\\[\\w+"
-;; (rx "\\\\[" (1+ word))
-;;
-;; etc.
-
-;;; History:
+;; => "\\`\\(?:[^^]\\|\\^\\(?: \\*\\|\\[\\)\\)"
;;
+;; The notation is much influenced by and retains some compatibility with
+;; Olin Shivers's SRE, with concessions to Emacs regexp peculiarities,
+;; and the older Emacs package Sregex.
;;; Code:
-(require 'cl-lib)
-(require 'cl-extra)
-
-;; FIXME: support macros.
-
-(defvar rx-constituents ;Not `const' because some modes extend it.
- '((and . (rx-and 0 nil))
- (seq . and) ; SRE
- (: . and) ; SRE
- (sequence . and) ; sregex
- (or . (rx-or 0 nil))
- (| . or) ; SRE
- (not-newline . ".")
- (nonl . not-newline) ; SRE
- (anything . (rx-anything 0 nil))
- (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
- (any . ".") ; sregex
- (in . any)
- (char . any) ; sregex
- (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex
- (not . (rx-not 1 1 rx-check-not))
- (repeat . (rx-repeat 2 nil))
- (= . (rx-= 2 nil)) ; SRE
- (>= . (rx->= 2 nil)) ; SRE
- (** . (rx-** 2 nil)) ; SRE
- (submatch . (rx-submatch 1 nil)) ; SRE
- (group . submatch) ; sregex
- (submatch-n . (rx-submatch-n 2 nil))
- (group-n . submatch-n)
- (zero-or-more . (rx-kleene 1 nil))
- (one-or-more . (rx-kleene 1 nil))
- (zero-or-one . (rx-kleene 1 nil))
- (\? . zero-or-one) ; SRE
- (\?? . zero-or-one)
- (* . zero-or-more) ; SRE
- (*? . zero-or-more)
- (0+ . zero-or-more)
- (+ . one-or-more) ; SRE
- (+? . one-or-more)
- (1+ . one-or-more)
- (optional . zero-or-one)
- (opt . zero-or-one) ; sregex
- (minimal-match . (rx-greedy 1 1))
- (maximal-match . (rx-greedy 1 1))
- (backref . (rx-backref 1 1 rx-check-backref))
- (line-start . "^")
- (bol . line-start) ; SRE
- (line-end . "$")
- (eol . line-end) ; SRE
- (string-start . "\\`")
- (bos . string-start) ; SRE
- (bot . string-start) ; sregex
- (string-end . "\\'")
- (eos . string-end) ; SRE
- (eot . string-end) ; sregex
- (buffer-start . "\\`")
- (buffer-end . "\\'")
- (point . "\\=")
- (word-start . "\\<")
- (bow . word-start) ; SRE
- (word-end . "\\>")
- (eow . word-end) ; SRE
- (word-boundary . "\\b")
- (not-word-boundary . "\\B") ; sregex
- (symbol-start . "\\_<")
- (symbol-end . "\\_>")
- (syntax . (rx-syntax 1 1))
- (not-syntax . (rx-not-syntax 1 1)) ; sregex
- (category . (rx-category 1 1 rx-check-category))
- (eval . (rx-eval 1 1))
- (literal . (rx-literal 1 1 stringp))
- (regexp . (rx-regexp 1 1 stringp))
- (regex . regexp) ; sregex
- (digit . "[[:digit:]]")
- (numeric . digit) ; SRE
- (num . digit) ; SRE
- (control . "[[:cntrl:]]") ; SRE
- (cntrl . control) ; SRE
- (hex-digit . "[[:xdigit:]]") ; SRE
- (hex . hex-digit) ; SRE
- (xdigit . hex-digit) ; SRE
- (blank . "[[:blank:]]") ; SRE
- (graphic . "[[:graph:]]") ; SRE
- (graph . graphic) ; SRE
- (printing . "[[:print:]]") ; SRE
- (print . printing) ; SRE
- (alphanumeric . "[[:alnum:]]") ; SRE
- (alnum . alphanumeric) ; SRE
- (letter . "[[:alpha:]]")
- (alphabetic . letter) ; SRE
- (alpha . letter) ; SRE
- (ascii . "[[:ascii:]]") ; SRE
- (nonascii . "[[:nonascii:]]")
- (lower . "[[:lower:]]") ; SRE
- (lower-case . lower) ; SRE
- (punctuation . "[[:punct:]]") ; SRE
- (punct . punctuation) ; SRE
- (space . "[[:space:]]") ; SRE
- (whitespace . space) ; SRE
- (white . space) ; SRE
- (upper . "[[:upper:]]") ; SRE
- (upper-case . upper) ; SRE
- (word . "[[:word:]]") ; inconsistent with SRE
- (wordchar . word) ; sregex
- (not-wordchar . "\\W"))
- "Alist of sexp form regexp constituents.
-Each element of the alist has the form (SYMBOL . DEFN).
-SYMBOL is a valid constituent of sexp regular expressions.
-If DEFN is a string, SYMBOL is translated into DEFN.
-If DEFN is a symbol, use the definition of DEFN, recursively.
-Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
-FUNCTION is used to produce code for SYMBOL. MIN-ARGS and MAX-ARGS
-are the minimum and maximum number of arguments the function-form
-sexp constituent SYMBOL may have in sexp regular expressions.
-MAX-ARGS nil means no limit. PREDICATE, if specified, means that
-all arguments must satisfy PREDICATE.")
-
-
-(defconst rx-syntax
- '((whitespace . ?-)
- (punctuation . ?.)
- (word . ?w)
- (symbol . ?_)
- (open-parenthesis . ?\()
- (close-parenthesis . ?\))
- (expression-prefix . ?\')
- (string-quote . ?\")
- (paired-delimiter . ?$)
- (escape . ?\\)
- (character-quote . ?/)
- (comment-start . ?<)
- (comment-end . ?>)
- (string-delimiter . ?|)
- (comment-delimiter . ?!))
- "Alist mapping Rx syntax symbols to syntax characters.
-Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
-symbol in `(syntax SYMBOL)', and CHAR is the syntax character
-corresponding to SYMBOL, as it would be used with \\s or \\S in
-regular expressions.")
-
-
-(defconst rx-categories
- '((space-for-indent . ?\s)
- (base . ?.)
- (consonant . ?0)
- (base-vowel . ?1)
- (upper-diacritical-mark . ?2)
- (lower-diacritical-mark . ?3)
- (tone-mark . ?4)
- (symbol . ?5)
- (digit . ?6)
- (vowel-modifying-diacritical-mark . ?7)
- (vowel-sign . ?8)
- (semivowel-lower . ?9)
- (not-at-end-of-line . ?<)
- (not-at-beginning-of-line . ?>)
- (alpha-numeric-two-byte . ?A)
- (chinese-two-byte . ?C)
- (chinse-two-byte . ?C) ;; A typo in Emacs 21.1-24.3.
- (greek-two-byte . ?G)
- (japanese-hiragana-two-byte . ?H)
- (indian-two-byte . ?I)
- (japanese-katakana-two-byte . ?K)
- (strong-left-to-right . ?L)
- (korean-hangul-two-byte . ?N)
- (strong-right-to-left . ?R)
- (cyrillic-two-byte . ?Y)
- (combining-diacritic . ?^)
- (ascii . ?a)
- (arabic . ?b)
- (chinese . ?c)
- (ethiopic . ?e)
- (greek . ?g)
- (korean . ?h)
- (indian . ?i)
- (japanese . ?j)
- (japanese-katakana . ?k)
- (latin . ?l)
- (lao . ?o)
- (tibetan . ?q)
- (japanese-roman . ?r)
- (thai . ?t)
- (vietnamese . ?v)
- (hebrew . ?w)
- (cyrillic . ?y)
- (can-break . ?|))
- "Alist mapping symbols to category characters.
-Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
-symbol in `(category SYMBOL)', and CHAR is the category character
-corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
-regular expression strings.")
-
-
-(defvar rx-greedy-flag t
- "Non-nil means produce greedy regular expressions for `zero-or-one',
-`zero-or-more', and `one-or-more'. Dynamically bound.")
-
-(defvar rx--compile-to-lisp nil
- "Nil means return a regexp as a string.
-Non-nil means we may return a lisp form which produces a
-string (used for `rx' macro).")
-
-(defun rx-info (op head)
- "Return parsing/code generation info for OP.
-If OP is the space character ASCII 32, return info for the symbol `?'.
-If OP is the character `?', return info for the symbol `??'.
-See also `rx-constituents'.
-If HEAD is non-nil, then OP is the head of a sexp, otherwise it's
-a standalone symbol."
- (cond ((eq op ? ) (setq op '\?))
- ((eq op ??) (setq op '\??)))
- (let (old-op)
- (while (and (not (null op)) (symbolp op))
- (setq old-op op)
- (setq op (cdr (assq op rx-constituents)))
- (when (if head (stringp op) (consp op))
- ;; We found something but of the wrong kind. Let's look for an
- ;; alternate definition for the other case.
- (let ((new-op
- (cdr (assq old-op (cdr (memq (assq old-op rx-constituents)
- rx-constituents))))))
- (if (and new-op (not (if head (stringp new-op) (consp new-op))))
- (setq op new-op))))))
- op)
-
-
-(defun rx-check (form)
- "Check FORM according to its car's parsing info."
- (unless (listp form)
- (error "rx `%s' needs argument(s)" form))
- (let* ((rx (rx-info (car form) 'head))
- (nargs (1- (length form)))
- (min-args (nth 1 rx))
- (max-args (nth 2 rx))
- (type-pred (nth 3 rx)))
- (when (and (not (null min-args))
- (< nargs min-args))
- (error "rx form `%s' requires at least %d args"
- (car form) min-args))
- (when (and (not (null max-args))
- (> nargs max-args))
- (error "rx form `%s' accepts at most %d args"
- (car form) max-args))
- (when type-pred
- (dolist (sub-form (cdr form))
- (unless (funcall type-pred sub-form)
- (error "rx form `%s' requires args satisfying `%s'"
- (car form) type-pred))))))
-
-
-(defun rx-group-if (regexp group)
- "Put shy groups around REGEXP if seemingly necessary when GROUP
-is non-nil."
+;; The `rx--translate...' functions below return (REGEXP . PRECEDENCE),
+;; where REGEXP is a list of string expressions that will be
+;; concatenated into a regexp, and PRECEDENCE is one of
+;;
+;; t -- can be used as argument to postfix operators (eg. "a")
+;; seq -- can be concatenated in sequence with other seq or higher (eg. "ab")
+;; lseq -- can be concatenated to the left of rseq or higher (eg. "^a")
+;; rseq -- can be concatenated to the right of lseq or higher (eg. "a$")
+;; nil -- can only be used in alternatives (eg. "a\\|b")
+;;
+;; They form a lattice:
+;;
+;; t highest precedence
+;; |
+;; seq
+;; / \
+;; lseq rseq
+;; \ /
+;; nil lowest precedence
+
+
+(defconst rx--char-classes
+ '((digit . digit)
+ (numeric . digit)
+ (num . digit)
+ (control . cntrl)
+ (cntrl . cntrl)
+ (hex-digit . xdigit)
+ (hex . xdigit)
+ (xdigit . xdigit)
+ (blank . blank)
+ (graphic . graph)
+ (graph . graph)
+ (printing . print)
+ (print . print)
+ (alphanumeric . alnum)
+ (alnum . alnum)
+ (letter . alpha)
+ (alphabetic . alpha)
+ (alpha . alpha)
+ (ascii . ascii)
+ (nonascii . nonascii)
+ (lower . lower)
+ (lower-case . lower)
+ (punctuation . punct)
+ (punct . punct)
+ (space . space)
+ (whitespace . space)
+ (white . space)
+ (upper . upper)
+ (upper-case . upper)
+ (word . word)
+ (wordchar . word)
+ (unibyte . unibyte)
+ (multibyte . multibyte))
+ "Alist mapping rx symbols to character classes.
+Most of the names are from SRE.")
+
+(defvar rx-constituents nil
+ "Alist of old-style rx extensions, for compatibility.
+
+Each element is (SYMBOL . DEF).
+
+If DEF is a symbol, then SYMBOL is an alias of DEF.
+
+If DEF is a string, then SYMBOL is a plain rx symbol defined as the
+ regexp string DEF.
+
+If DEF is a list on the form (FUN MIN-ARGS MAX-ARGS PRED), then
+ SYMBOL is an rx form with at least MIN-ARGS and at most
+ MAX-ARGS arguments. If MAX-ARGS is nil, then there is no upper limit.
+ FUN is a function taking the entire rx form as single argument
+ and returning the translated regexp string.
+ If PRED is non-nil, it is a predicate that all actual arguments must
+ satisfy.")
+
+;; TODO: Additions to consider:
+;; - A better name for `anything', like `any-char' or `anychar'.
+;; - A name for (or), maybe `unmatchable'.
+;; - A construct like `or' but without the match order guarantee,
+;; maybe `unordered-or'. Useful for composition or generation of
+;; alternatives; permits more effective use of regexp-opt.
+
+(defun rx--translate-symbol (sym)
+ "Translate an rx symbol. Return (REGEXP . PRECEDENCE)."
+ (pcase sym
+ ;; Use `list' instead of a quoted list to wrap the strings here,
+ ;; since the return value may be mutated.
+ ((or 'nonl 'not-newline 'any) (cons (list ".") t))
+ ('anything (rx--translate-form '(or nonl "\n")))
+ ((or 'bol 'line-start) (cons (list "^") 'lseq))
+ ((or 'eol 'line-end) (cons (list "$") 'rseq))
+ ((or 'bos 'string-start 'bot 'buffer-start) (cons (list "\\`") t))
+ ((or 'eos 'string-end 'eot 'buffer-end) (cons (list "\\'") t))
+ ('point (cons (list "\\=") t))
+ ((or 'bow 'word-start) (cons (list "\\<") t))
+ ((or 'eow 'word-end) (cons (list "\\>") t))
+ ('word-boundary (cons (list "\\b") t))
+ ('not-word-boundary (cons (list "\\B") t))
+ ('symbol-start (cons (list "\\_<") t))
+ ('symbol-end (cons (list "\\_>") t))
+ ('not-wordchar (cons (list "\\W") t))
+ (_
+ (cond
+ ((let ((class (cdr (assq sym rx--char-classes))))
+ (and class (cons (list (concat "[[:" (symbol-name class) ":]]")) t))))
+
+ ;; For compatibility with old rx.
+ ((let ((entry (assq sym rx-constituents)))
+ (and (progn
+ (while (and entry (not (stringp (cdr entry))))
+ (setq entry
+ (if (symbolp (cdr entry))
+ ;; Alias for another entry.
+ (assq (cdr entry) rx-constituents)
+ ;; Wrong type, try further down the list.
+ (assq (car entry)
+ (cdr (memq entry rx-constituents))))))
+ entry)
+ (cons (list (cdr entry)) nil))))
+ (t (error "Unknown rx symbol `%s'" sym))))))
+
+(defun rx--enclose (left-str rexp right-str)
+ "Bracket REXP by LEFT-STR and RIGHT-STR."
+ (append (list left-str) rexp (list right-str)))
+
+(defun rx--bracket (rexp)
+ (rx--enclose "\\(?:" rexp "\\)"))
+
+(defun rx--sequence (left right)
+ "Return the sequence (concatenation) of two translated items,
+each on the form (REGEXP . PRECEDENCE), returning (REGEXP . PRECEDENCE)."
+ ;; Concatenation rules:
+ ;; seq ++ seq -> seq
+ ;; lseq ++ seq -> lseq
+ ;; seq ++ rseq -> rseq
+ ;; lseq ++ rseq -> nil
+ (cond ((not (car left)) right)
+ ((not (car right)) left)
+ (t
+ (let ((l (if (memq (cdr left) '(nil rseq))
+ (cons (rx--bracket (car left)) t)
+ left))
+ (r (if (memq (cdr right) '(nil lseq))
+ (cons (rx--bracket (car right)) t)
+ right)))
+ (cons (append (car l) (car r))
+ (if (eq (cdr l) 'lseq)
+ (if (eq (cdr r) 'rseq)
+ nil ; lseq ++ rseq
+ 'lseq) ; lseq ++ seq
+ (if (eq (cdr r) 'rseq)
+ 'rseq ; seq ++ rseq
+ 'seq))))))) ; seq ++ seq
+
+(defun rx--translate-seq (body)
+ "Translate a sequence of one or more rx items. Return (REGEXP . PRECEDENCE)."
+ (if body
+ (let* ((items (mapcar #'rx--translate body))
+ (result (car items)))
+ (dolist (item (cdr items))
+ (setq result (rx--sequence result item)))
+ result)
+ (cons nil 'seq)))
+
+(defun rx--empty ()
+ "Regexp that never matches anything."
+ (cons (list regexp-unmatchable) 'seq))
+
+;; `cl-every' replacement to avoid bootstrapping problems.
+(defun rx--every (pred list)
+ "Whether PRED is true for every element of LIST."
+ (while (and list (funcall pred (car list)))
+ (setq list (cdr list)))
+ (null list))
+
+(defun rx--translate-or (body)
+ "Translate an or-pattern of one of more rx items.
+Return (REGEXP . PRECEDENCE)."
+ ;; FIXME: Possible improvements:
+ ;;
+ ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"),
+ ;; so that they can be candidates for regexp-opt.
+ ;;
+ ;; - Translate compile-time strings (`eval' forms), again for regexp-opt.
+ ;;
+ ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
+ ;; in order to improve effectiveness of regexp-opt.
+ ;; This would also help composability.
+ ;;
+ ;; - Use associativity to run regexp-opt on contiguous subsets of arguments
+ ;; if not all of them are strings. Example:
+ ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
+ ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
+ ;;
+ ;; - Fuse patterns into a single character alternative if they fit.
+ ;; regexp-opt will do that if all are strings, but we want to do that for:
+ ;; * symbols that expand to classes: space, alpha, ...
+ ;; * character alternatives: (any ...)
+ ;; * (syntax S), for some S (whitespace, word)
+ ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
+ ;; -> (any "@" "%" digit "A-Z" space word)
+ ;; -> "[A-Z@%[:digit:][:space:][:word:]]"
+ ;;
+ ;; Problem: If a subpattern is carefully written to to be
+ ;; optimisable by regexp-opt, how do we prevent the transforms
+ ;; above from destroying that property?
+ ;; Example: (or "a" (or "abc" "abd" "abe"))
(cond
- ;; for some repetition
- ((eq group '*) (if (rx-atomic-p regexp) (setq group nil)))
- ;; for concatenation
- ((eq group ':)
- (if (rx-atomic-p
- (if (and (stringp regexp)
- (string-match
- "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp))
- (substring regexp 0 (match-beginning 0))
- regexp))
- (setq group nil)))
- ;; for OR
- ((eq group '|) (setq group nil))
- ;; do anyway
- ((eq group t))
- ((rx-atomic-p regexp t) (setq group nil)))
- (cond ((and group (stringp regexp))
- (concat "\\(?:" regexp "\\)"))
- (group `("\\(?:" ,@regexp "\\)"))
- (t regexp)))
-
-
-(defvar rx-parent)
-;; dynamically bound in some functions.
-
-
-(defun rx-and (form)
- "Parse and produce code from FORM.
-FORM is of the form `(and FORM1 ...)'."
- (rx-check form)
- (rx-group-if
- (rx--subforms (cdr form) ':)
- (and (memq rx-parent '(* t)) rx-parent)))
-
-
-(defun rx-or (form)
- "Parse and produce code from FORM, which is `(or FORM1 ...)'."
- (rx-check form)
- (rx-group-if
- (cond
- ((null (cdr form)) regexp-unmatchable)
- ((cl-every #'stringp (cdr form))
- (regexp-opt (cdr form) nil t))
- (t (rx--subforms (cdr form) '| "\\|")))
- (and (memq rx-parent '(: * t)) rx-parent)))
-
-
-(defun rx-anything (form)
- "Match any character."
- (if (consp form)
- (error "rx `anything' syntax error: %s" form))
- (rx-or (list 'or 'not-newline ?\n)))
-
-
-(defun rx-any-delete-from-range (char ranges)
- "Delete by side effect character CHAR from RANGES.
-Only both edges of each range is checked."
- (let (m)
- (cond
- ((memq char ranges) (setq ranges (delq char ranges)))
- ((setq m (assq char ranges))
- (if (eq (1+ char) (cdr m))
- (setcar (memq m ranges) (1+ char))
- (setcar m (1+ char))))
- ((setq m (rassq char ranges))
- (if (eq (1- char) (car m))
- (setcar (memq m ranges) (1- char))
- (setcdr m (1- char)))))
- ranges))
-
-
-(defun rx-any-condense-range (args)
- "Condense by side effect ARGS as range for Rx `any'."
- (let (str
- l)
- ;; set STR list of all strings
- ;; set L list of all ranges
- (mapc (lambda (e) (cond ((stringp e) (push e str))
- ((numberp e) (push (cons e e) l))
- ;; Ranges between ASCII and raw bytes are split,
- ;; to prevent accidental inclusion of Unicode
- ;; characters later on.
- ((and (<= (car e) #x7f)
- (>= (cdr e) #x3fff80))
- (push (cons (car e) #x7f) l)
- (push (cons #x3fff80 (cdr e)) l))
- (t (push e l))))
- args)
- ;; condense overlapped ranges in L
- (let ((tail (setq l (sort l #'car-less-than-car)))
- d)
- (while (setq d (cdr tail))
- (if (>= (cdar tail) (1- (caar d)))
- (progn
- (setcdr (car tail) (max (cdar tail) (cdar d)))
- (setcdr tail (cdr d)))
- (setq tail d))))
- ;; Separate small ranges to single number, and delete dups.
- (nconc
- (apply #'nconc
- (mapcar (lambda (e)
- (cond
- ((= (car e) (cdr e)) (list (car e)))
- ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
- ((list e))))
- l))
- (delete-dups str))))
-
-
-(defun rx-check-any-string (str)
- "Turn the `any' argument string STR into a list of characters.
-The original order is not preserved. Ranges, \"A-Z\", become pairs, (?A . ?Z)."
+ ((null body) ; No items: a never-matching regexp.
+ (rx--empty))
+ ((null (cdr body)) ; Single item.
+ (rx--translate (car body)))
+ ((rx--every #'stringp body) ; All strings.
+ (cons (list (regexp-opt body nil t))
+ t))
+ (t
+ (cons (append (car (rx--translate (car body)))
+ (mapcan (lambda (item)
+ (cons "\\|" (car (rx--translate item))))
+ (cdr body)))
+ nil))))
+
+(defun rx--string-to-intervals (str)
+ "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
+character X becomes (?X . ?X). Return the intervals in a list."
+ ;; We could just do string-to-multibyte on the string and work with
+ ;; that instead of this `decode-char' workaround.
(let ((decode-char
- ;; Make sure raw bytes are decoded as such, to avoid confusion with
- ;; U+0080..U+00FF.
(if (multibyte-string-p str)
#'identity
- (lambda (c) (if (<= #x80 c #xff)
- (+ c #x3fff00)
- c))))
+ #'unibyte-char-to-multibyte))
(len (length str))
(i 0)
- (ret nil))
- (if (= 0 len)
- (error "String arg for Rx `any' must not be empty"))
+ (intervals nil))
(while (< i len)
(cond ((and (< i (- len 2))
- (= (aref str (+ i 1)) ?-))
+ (= (aref str (1+ i)) ?-))
;; Range.
(let ((start (funcall decode-char (aref str i)))
(end (funcall decode-char (aref str (+ i 2)))))
- (cond ((< start end) (push (cons start end) ret))
- ((= start end) (push start ret))
+ (cond ((and (<= start #x7f) (>= end #x3fff80))
+ ;; Ranges between ASCII and raw bytes are split to
+ ;; avoid having them absorb Unicode characters
+ ;; caught in-between.
+ (push (cons start #x7f) intervals)
+ (push (cons #x3fff80 end) intervals))
+ ((<= start end)
+ (push (cons start end) intervals))
(t
- (error "Rx character range `%c-%c' is reversed"
- start end)))
+ (error "Invalid rx `any' range: %s"
+ (substring str i 3))))
(setq i (+ i 3))))
(t
;; Single character.
- (push (funcall decode-char (aref str i)) ret)
+ (let ((char (funcall decode-char (aref str i))))
+ (push (cons char char) intervals))
(setq i (+ i 1)))))
- ret))
-
-
-(defun rx-check-any (arg)
- "Check arg ARG for Rx `any'."
- (cond
- ((integerp arg) (list arg))
- ((symbolp arg)
- (let ((translation (condition-case nil
- (rx-form arg)
- (error nil))))
- (if (or (null translation)
- (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
- (error "Invalid char class `%s' in Rx `any'" arg))
- (list (substring translation 1 -1)))) ; strip outer brackets
- ((and (characterp (car-safe arg)) (characterp (cdr-safe arg)))
- (unless (<= (car arg) (cdr arg))
- (error "Rx character range `%c-%c' is reversed"
- (car arg) (cdr arg)))
- (list arg))
- ((stringp arg) (rx-check-any-string arg))
- ((error
- "rx `any' requires string, character, char pair or char class args"))))
-
-
-(defun rx-any (form)
- "Parse and produce code from FORM, which is `(any ARG ...)'.
-ARG is optional."
- (rx-check form)
- (let* ((args (rx-any-condense-range
- (apply
- #'nconc
- (mapcar #'rx-check-any (cdr form)))))
- m
- s)
- (cond
- ;; single close bracket
- ;; => "[]...-]" or "[]...--.]"
- ((memq ?\] args)
- ;; set ] at the beginning
- (setq args (cons ?\] (delq ?\] args)))
- ;; set - at the end
- (if (or (memq ?- args) (assq ?- args))
- (setq args (nconc (rx-any-delete-from-range ?- args)
- (list ?-)))))
- ;; close bracket starts a range
- ;; => "[]-....-]" or "[]-.--....]"
- ((setq m (assq ?\] args))
- ;; bring it to the beginning
- (setq args (cons m (delq m args)))
- (cond ((memq ?- args)
- ;; to the end
- (setq args (nconc (delq ?- args) (list ?-))))
- ((setq m (assq ?- args))
- ;; next to the bracket's range, make the second range
- (setcdr args (cons m (delq m (cdr args)))))))
- ;; bracket in the end range
- ;; => "[]...-]"
- ((setq m (rassq ?\] args))
- ;; set ] at the beginning
- (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
- ;; set - at the end
- (if (or (memq ?- args) (assq ?- args))
- (setq args (nconc (rx-any-delete-from-range ?- args)
- (list ?-)))))
- ;; {no close bracket appears}
- ;;
- ;; bring single bar to the beginning
- ((memq ?- args)
- (setq args (cons ?- (delq ?- args))))
- ;; bar start a range, bring it to the beginning
- ((setq m (assq ?- args))
- (setq args (cons m (delq m args))))
- ;;
- ;; hat at the beginning?
- ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
- (setq args (if (cdr args)
- `(,(cadr args) ,(car args) ,@(cddr args))
- (nconc (rx-any-delete-from-range ?^ args)
- (list ?^))))))
- ;; some 1-char?
- (if (and (null (cdr args)) (numberp (car args))
- (or (= 1 (length
- (setq s (regexp-quote (string (car args))))))
- (and (equal (car args) ?^) ;; unnecessary predicate?
- (null (eq rx-parent '!)))))
- s
- (concat "["
- (mapconcat
- (lambda (e) (cond
- ((numberp e) (string e))
- ((consp e)
- (if (and (= (1+ (car e)) (cdr e))
- ;; rx-any-condense-range should
- ;; prevent this case from happening.
- (null (memq (car e) '(?\] ?-)))
- (null (memq (cdr e) '(?\] ?-))))
- (string (car e) (cdr e))
- (string (car e) ?- (cdr e))))
- (e)))
- args
- nil)
- "]"))))
-
-
-(defun rx-check-not (arg)
- "Check arg ARG for Rx `not'."
- (unless (or (and (symbolp arg)
- (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
- (condition-case nil
- (rx-form arg)
- (error ""))))
- (eq arg 'word-boundary)
- (and (consp arg)
- (memq (car arg) '(not any in syntax category))))
- (error "rx `not' syntax error: %s" arg))
- t)
-
-
-(defun rx-not (form)
- "Parse and produce code from FORM. FORM is `(not ...)'."
- (rx-check form)
- (let ((result (rx-form (cadr form) '!))
- case-fold-search)
- (cond ((string-match "\\`\\[\\^" result)
- (cond
- ((equal result "[^]") "[^^]")
- ((and (= (length result) 4) (null (eq rx-parent '!)))
- (regexp-quote (substring result 2 3)))
- ((concat "[" (substring result 2)))))
- ((eq ?\[ (aref result 0))
- (concat "[^" (substring result 1)))
- ((string-match "\\`\\\\[scbw]" result)
- (concat (upcase (substring result 0 2))
- (substring result 2)))
- ((string-match "\\`\\\\[SCBW]" result)
- (concat (downcase (substring result 0 2))
- (substring result 2)))
- (t
- (concat "[^" result "]")))))
-
-
-(defun rx-not-char (form)
- "Parse and produce code from FORM. FORM is `(not-char ...)'."
- (rx-check form)
- (rx-not `(not (in ,@(cdr form)))))
-
-
-(defun rx-not-syntax (form)
- "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'."
- (rx-check form)
- (rx-not `(not (syntax ,@(cdr form)))))
-
-
-(defun rx-trans-forms (form &optional skip)
- "If FORM's length is greater than two, transform it to length two.
-A form (HEAD REST ...) becomes (HEAD (and REST ...)).
-If SKIP is non-nil, allow that number of items after the head, i.e.
-`(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
- (unless skip (setq skip 0))
- (let ((tail (nthcdr (1+ skip) form)))
- (if (= (length tail) 1)
- form
- (let ((form (copy-sequence form)))
- (setcdr (nthcdr skip form) (list (cons 'and tail)))
- form))))
-
-
-(defun rx-= (form)
- "Parse and produce code from FORM `(= N ...)'."
- (rx-check form)
- (setq form (rx-trans-forms form 1))
- (unless (and (integerp (nth 1 form))
- (> (nth 1 form) 0))
- (error "rx `=' requires positive integer first arg"))
- (let ((subform (rx-form (nth 2 form) '*)))
- (if (stringp subform)
- (format "%s\\{%d\\}" subform (nth 1 form))
- `(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
-
-
-(defun rx->= (form)
- "Parse and produce code from FORM `(>= N ...)'."
- (rx-check form)
- (setq form (rx-trans-forms form 1))
- (unless (and (integerp (nth 1 form))
- (> (nth 1 form) 0))
- (error "rx `>=' requires positive integer first arg"))
- (let ((subform (rx-form (nth 2 form) '*)))
- (if (stringp subform)
- (format "%s\\{%d,\\}" subform (nth 1 form))
- `(,@subform ,(format "\\{%d,\\}" (nth 1 form))))))
-
-
-(defun rx-** (form)
- "Parse and produce code from FORM `(** N M ...)'."
- (rx-check form)
- (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*))
-
-
-(defun rx-repeat (form)
- "Parse and produce code from FORM.
-FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
- (rx-check form)
- (if (> (length form) 4)
- (setq form (rx-trans-forms form 2)))
- (if (null (nth 2 form))
- (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form)))))
- (cond ((= (length form) 3)
- (unless (and (integerp (nth 1 form))
- (> (nth 1 form) 0))
- (error "rx `repeat' requires positive integer first arg"))
- (let ((subform (rx-form (nth 2 form) '*)))
- (if (stringp subform)
- (format "%s\\{%d\\}" subform (nth 1 form))
- `(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
- ((or (not (integerp (nth 2 form)))
- (< (nth 2 form) 0)
- (not (integerp (nth 1 form)))
- (< (nth 1 form) 0)
- (< (nth 2 form) (nth 1 form)))
- (error "rx `repeat' range error"))
- (t
- (let ((subform (rx-form (nth 3 form) '*)))
- (if (stringp subform)
- (format "%s\\{%d,%d\\}" subform (nth 1 form) (nth 2 form))
- `(,@subform ,(format "\\{%d,%d\\}" (nth 1 form) (nth 2 form))))))))
-
-
-(defun rx-submatch (form)
- "Parse and produce code from FORM, which is `(submatch ...)'."
- (let ((subforms (rx--subforms (cdr form) ':)))
- (if (stringp subforms)
- (concat "\\(" subforms "\\)")
- `("\\(" ,@subforms "\\)"))))
-
-(defun rx-submatch-n (form)
- "Parse and produce code from FORM, which is `(submatch-n N ...)'."
- (let ((n (nth 1 form))
- (subforms (rx--subforms (cddr form) ':)))
- (unless (and (integerp n) (> n 0))
- (error "rx `submatch-n' argument must be positive"))
- (if (stringp subforms)
- (concat "\\(?" (number-to-string n) ":" subforms "\\)")
- `("\\(?" ,(number-to-string n) ":" ,@subforms "\\)"))))
-
-(defun rx-backref (form)
- "Parse and produce code from FORM, which is `(backref N)'."
- (rx-check form)
- (format "\\%d" (nth 1 form)))
-
-(defun rx-check-backref (arg)
- "Check arg ARG for Rx `backref'."
- (or (and (integerp arg) (>= arg 1) (<= arg 9))
- (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
-
-(defun rx-kleene (form)
- "Parse and produce code from FORM.
-FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
-`zero-or-more' etc. operators.
-If OP is one of `*', `+', `?', produce a greedy regexp.
-If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
-If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
-is non-nil."
- (rx-check form)
- (setq form (rx-trans-forms form))
- (let ((suffix (cond ((memq (car form) '(* + \? ?\s)) "")
- ((memq (car form) '(*? +? \?? ??)) "?")
- (rx-greedy-flag "")
- (t "?")))
- (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
- ((memq (car form) '(+ +? 1+ one-or-more)) "+")
- (t "?")))
- (subform (rx-form (cadr form) '*)))
- (rx-group-if
- (if (stringp subform)
- (concat subform op suffix)
- `(,@subform ,(concat op suffix)))
- (and (memq rx-parent '(t *)) rx-parent))))
-
-
-(defun rx-atomic-p (r &optional lax)
- "Return non-nil if regexp string R is atomic.
-An atomic regexp R is one such that a suffix operator
-appended to R will apply to all of R. For example, \"a\"
-\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
-\"[ab]c\", and \"ab\\|ab*c\" are not atomic.
-
-This function may return false negatives, but it will not
-return false positives. It is nevertheless useful in
-situations where an efficiency shortcut can be taken only if a
-regexp is atomic. The function can be improved to detect
-more cases of atomic regexps. Presently, this function
-detects the following categories of atomic regexp;
-
- a group or shy group: \\(...\\)
- a character class: [...]
- a single character: a
-
-On the other hand, false negatives will be returned for
-regexps that are atomic but end in operators, such as
-\"a+\". I think these are rare. Probably such cases could
-be detected without much effort. A guarantee of no false
-negatives would require a theoretic specification of the set
-of all atomic regexps."
- (if (and rx--compile-to-lisp
- (not (stringp r)))
- nil ;; Runtime value, we must assume non-atomic.
- (let ((l (length r)))
+ intervals))
+
+(defun rx--condense-intervals (intervals)
+ "Merge adjacent and overlapping intervals by mutation, preserving the order.
+INTERVALS is a list of (START . END) with START ≤ END, sorted by START."
+ (let ((tail intervals)
+ d)
+ (while (setq d (cdr tail))
+ (if (>= (cdar tail) (1- (caar d)))
+ (progn
+ (setcdr (car tail) (max (cdar tail) (cdar d)))
+ (setcdr tail (cdr d)))
+ (setq tail d)))
+ intervals))
+
+(defun rx--translate-any (negated body)
+ "Translate an (any ...) construct. Return (REGEXP . PRECEDENCE).
+If NEGATED, negate the sense."
+ (let ((classes nil)
+ (strings nil)
+ (conses nil))
+ ;; Collect strings, conses and characters, and classes in separate bins.
+ (dolist (arg body)
+ (cond ((stringp arg)
+ (push arg strings))
+ ((and (consp arg)
+ (characterp (car arg))
+ (characterp (cdr arg))
+ (<= (car arg) (cdr arg)))
+ ;; Copy the cons, in case we need to modify it.
+ (push (cons (car arg) (cdr arg)) conses))
+ ((characterp arg)
+ (push (cons arg arg) conses))
+ ((and (symbolp arg)
+ (let ((class (cdr (assq arg rx--char-classes))))
+ (and class (push class classes)))))
+ (t (error "Invalid rx `any' argument: %s" arg))))
+ (let ((items
+ ;; Translate strings and conses into nonoverlapping intervals,
+ ;; and add classes as symbols at the end.
+ (append
+ (rx--condense-intervals
+ (sort (append conses
+ (mapcan #'rx--string-to-intervals strings))
+ #'car-less-than-car))
+ (reverse classes))))
+
+ ;; Move lone ] and range ]-x to the start.
+ (let ((rbrac-l (assq ?\] items)))
+ (when rbrac-l
+ (setq items (cons rbrac-l (delq rbrac-l items)))))
+
+ ;; Split x-] and move the lone ] to the start.
+ (let ((rbrac-r (rassq ?\] items)))
+ (when (and rbrac-r (not (eq (car rbrac-r) ?\])))
+ (setcdr rbrac-r ?\\)
+ (setq items (cons '(?\] . ?\]) items))))
+
+ ;; Split ,-- (which would end up as ,- otherwise).
+ (let ((dash-r (rassq ?- items)))
+ (when (eq (car dash-r) ?,)
+ (setcdr dash-r ?,)
+ (setq items (nconc items '((?- . ?-))))))
+
+ ;; Remove - (lone or at start of interval)
+ (let ((dash-l (assq ?- items)))
+ (when dash-l
+ (if (eq (cdr dash-l) ?-)
+ (setq items (delq dash-l items)) ; Remove lone -
+ (setcar dash-l ?.)) ; Reduce --x to .-x
+ (setq items (nconc items '((?- . ?-))))))
+
+ ;; Deal with leading ^ and range ^-x.
+ (when (and (consp (car items))
+ (eq (caar items) ?^)
+ (cdr items))
+ ;; Move ^ and ^-x to second place.
+ (setq items (cons (cadr items)
+ (cons (car items) (cddr items)))))
+
(cond
- ((<= l 1))
- ((= l 2) (= (aref r 0) ?\\))
- ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
- ((null lax)
- (cond
- ((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r))
- ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r))))))))
-
-
-(defun rx-syntax (form)
- "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
- (rx-check form)
- (let* ((sym (cadr form))
- (syntax (cdr (assq sym rx-syntax))))
+ ;; Empty set: if negated, any char, otherwise match-nothing.
+ ((null items)
+ (if negated
+ (rx--translate-symbol 'anything)
+ (rx--empty)))
+ ;; Single non-negated character.
+ ((and (null (cdr items))
+ (consp (car items))
+ (eq (caar items) (cdar items))
+ (not negated))
+ (cons (list (regexp-quote (char-to-string (caar items))))
+ t))
+ ;; At least one character or class, possibly negated.
+ (t
+ (cons
+ (list
+ (concat
+ "["
+ (and negated "^")
+ (mapconcat (lambda (item)
+ (cond ((symbolp item)
+ (format "[:%s:]" item))
+ ((eq (car item) (cdr item))
+ (char-to-string (car item)))
+ ((eq (1+ (car item)) (cdr item))
+ (string (car item) (cdr item)))
+ (t
+ (string (car item) ?- (cdr item)))))
+ items nil)
+ "]"))
+ t))))))
+
+(defun rx--translate-not (negated body)
+ "Translate a (not ...) construct. Return (REGEXP . PRECEDENCE).
+If NEGATED, negate the sense (thus making it positive)."
+ (unless (and body (null (cdr body)))
+ (error "rx `not' form takes exactly one argument"))
+ (let ((arg (car body)))
+ (cond
+ ((consp arg)
+ (pcase (car arg)
+ ((or 'any 'in 'char) (rx--translate-any (not negated) (cdr arg)))
+ ('syntax (rx--translate-syntax (not negated) (cdr arg)))
+ ('category (rx--translate-category (not negated) (cdr arg)))
+ ('not (rx--translate-not (not negated) (cdr arg)))
+ (_ (error "Illegal argument to rx `not': %S" arg))))
+ ((eq arg 'word-boundary)
+ (rx--translate-symbol
+ (if negated 'word-boundary 'not-word-boundary)))
+ (t
+ (let ((class (cdr (assq arg rx--char-classes))))
+ (if class
+ (rx--translate-any (not negated) (list class))
+ (error "Illegal argument to rx `not': %s" arg)))))))
+
+(defun rx--atomic-regexp (item)
+ "ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t."
+ (if (eq (cdr item) t)
+ (car item)
+ (rx--bracket (car item))))
+
+(defun rx--translate-counted-repetition (min-count max-count body)
+ (let ((operand (rx--translate-seq body)))
+ (if (car operand)
+ (cons (append
+ (rx--atomic-regexp operand)
+ (list (concat "\\{"
+ (number-to-string min-count)
+ (cond ((null max-count) ",")
+ ((< min-count max-count)
+ (concat "," (number-to-string max-count))))
+ "\\}")))
+ t)
+ operand)))
+
+(defun rx--check-repeat-arg (name min-args body)
+ (unless (>= (length body) min-args)
+ (error "rx `%s' requires at least %d argument%s"
+ name min-args (if (= min-args 1) "" "s")))
+ ;; There seems to be no reason to disallow zero counts.
+ (unless (natnump (car body))
+ (error "rx `%s' first argument must be nonnegative" name)))
+
+(defun rx--translate-bounded-repetition (name body)
+ (let ((min-count (car body))
+ (max-count (cadr body))
+ (items (cddr body)))
+ (unless (and (natnump min-count)
+ (natnump max-count)
+ (<= min-count max-count))
+ (error "rx `%s' range error" name))
+ (rx--translate-counted-repetition min-count max-count items)))
+
+(defun rx--translate-repeat (body)
+ (rx--check-repeat-arg 'repeat 2 body)
+ (if (= (length body) 2)
+ (rx--translate-counted-repetition (car body) (car body) (cdr body))
+ (rx--translate-bounded-repetition 'repeat body)))
+
+(defun rx--translate-** (body)
+ (rx--check-repeat-arg '** 2 body)
+ (rx--translate-bounded-repetition '** body))
+
+(defun rx--translate->= (body)
+ (rx--check-repeat-arg '>= 1 body)
+ (rx--translate-counted-repetition (car body) nil (cdr body)))
+
+(defun rx--translate-= (body)
+ (rx--check-repeat-arg '= 1 body)
+ (rx--translate-counted-repetition (car body) (car body) (cdr body)))
+
+(defvar rx--greedy t)
+
+(defun rx--translate-rep (op-string greedy body)
+ "Translate a repetition; OP-STRING is one of \"*\", \"+\" or \"?\".
+GREEDY is a boolean. Return (REGEXP . PRECEDENCE)."
+ (let ((operand (rx--translate-seq body)))
+ (if (car operand)
+ (cons (append (rx--atomic-regexp operand)
+ (list (concat op-string (unless greedy "?"))))
+ ;; The result has precedence seq to avoid (? (* "a")) -> "a*?"
+ 'seq)
+ operand)))
+
+(defun rx--control-greedy (greedy body)
+ "Translate the sequence BODY with greediness GREEDY.
+Return (REGEXP . PRECEDENCE)."
+ (let ((rx--greedy greedy))
+ (rx--translate-seq body)))
+
+(defun rx--translate-group (body)
+ "Translate the `group' form. Return (REGEXP . PRECEDENCE)."
+ (cons (rx--enclose "\\("
+ (car (rx--translate-seq body))
+ "\\)")
+ t))
+
+(defun rx--translate-group-n (body)
+ "Translate the `group-n' form. Return (REGEXP . PRECEDENCE)."
+ (unless (and (integerp (car body)) (> (car body) 0))
+ (error "rx `group-n' requires a positive number as first argument"))
+ (cons (rx--enclose (concat "\\(?" (number-to-string (car body)) ":")
+ (car (rx--translate-seq (cdr body)))
+ "\\)")
+ t))
+
+(defun rx--translate-backref (body)
+ "Translate the `backref' form. Return (REGEXP . PRECEDENCE)."
+ (unless (and (= (length body) 1) (integerp (car body)) (<= 1 (car body) 9))
+ (error "rx `backref' requires an argument in the range 1..9"))
+ (cons (list "\\" (number-to-string (car body))) t))
+
+(defconst rx--syntax-codes
+ '((whitespace . ?-) ; SPC also accepted
+ (punctuation . ?.)
+ (word . ?w) ; W also accepted
+ (symbol . ?_)
+ (open-parenthesis . ?\()
+ (close-parenthesis . ?\))
+ (expression-prefix . ?\')
+ (string-quote . ?\")
+ (paired-delimiter . ?$)
+ (escape . ?\\)
+ (character-quote . ?/)
+ (comment-start . ?<)
+ (comment-end . ?>)
+ (string-delimiter . ?|)
+ (comment-delimiter . ?!)))
+
+(defun rx--translate-syntax (negated body)
+ "Translate the `syntax' form. Return (REGEXP . PRECEDENCE)."
+ (unless (and body (null (cdr body)))
+ (error "rx `syntax' form takes exactly one argument"))
+ (let* ((sym (car body))
+ (syntax (cdr (assq sym rx--syntax-codes))))
(unless syntax
- ;; Try sregex compatibility.
(cond
- ((characterp sym) (setq syntax sym))
+ ;; Syntax character directly (sregex compatibility)
+ ((and (characterp sym) (rassq sym rx--syntax-codes))
+ (setq syntax sym))
+ ;; Syntax character as symbol (sregex compatibility)
((symbolp sym)
(let ((name (symbol-name sym)))
- (if (= 1 (length name))
- (setq syntax (aref name 0))))))
+ (when (= (length name) 1)
+ (let ((char (string-to-char name)))
+ (when (rassq char rx--syntax-codes)
+ (setq syntax char)))))))
(unless syntax
- (error "Unknown rx syntax `%s'" sym)))
- (format "\\s%c" syntax)))
-
-
-(defun rx-check-category (form)
- "Check the argument FORM of a `(category FORM)'."
- (unless (or (integerp form)
- (cdr (assq form rx-categories)))
- (error "Unknown category `%s'" form))
- t)
-
-
-(defun rx-category (form)
- "Parse and produce code from FORM, which is `(category SYMBOL)'."
- (rx-check form)
- (let ((char (if (integerp (cadr form))
- (cadr form)
- (cdr (assq (cadr form) rx-categories)))))
- (format "\\c%c" char)))
-
-
-(defun rx-eval (form)
- "Parse and produce code from FORM, which is `(eval FORM)'."
- (rx-check form)
- (rx-form (eval (cadr form)) rx-parent))
-
-
-(defun rx-greedy (form)
- "Parse and produce code from FORM.
-If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
-`+', and `?' operators will be used in FORM1. If FORM is
-`(maximal-match FORM1)', greedy operators will be used."
- (rx-check form)
- (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
- (rx-form (cadr form) rx-parent)))
-
-
-(defun rx-regexp (form)
- "Parse and produce code from FORM, which is `(regexp STRING)'."
- (cond ((stringp (cadr form))
- (rx-group-if (cadr form) rx-parent))
- (rx--compile-to-lisp
- ;; Always group non-string forms, since we can't be sure they
- ;; are atomic.
- (rx-group-if (cdr form) t))
- (t (rx-check form))))
-
-(defun rx-literal (form)
- "Parse and produce code from FORM, which is `(literal STRING-EXP)'."
- (cond ((stringp (cadr form))
- ;; This is allowed, but makes little sense, you could just
- ;; use STRING directly.
- (rx-group-if (regexp-quote (cadr form)) rx-parent))
- (rx--compile-to-lisp
- (rx-group-if `((regexp-quote ,(cadr form))) rx-parent))
- (t (rx-check form))))
-
-(defun rx-form (form &optional parent)
- "Parse and produce code for regular expression FORM.
-FORM is a regular expression in sexp form.
-PARENT shows which type of expression calls and controls putting of
-shy groups around the result and some more in other functions."
- (let ((rx-parent parent))
- (cond
- ((stringp form)
- (rx-group-if (regexp-quote form)
- (if (and (eq parent '*) (< 1 (length form)))
- parent)))
- ((integerp form)
- (regexp-quote (char-to-string form)))
- ((symbolp form)
- (let ((info (rx-info form nil)))
- (cond ((stringp info)
- info)
- ((null info)
- (error "Unknown rx form `%s'" form))
- (t
- (funcall (nth 0 info) form)))))
- ((consp form)
- (let ((info (rx-info (car form) 'head)))
- (unless (consp info)
- (error "Unknown rx form `%s'" (car form)))
- (funcall (nth 0 info) form)))
- (t
- (error "rx syntax error at `%s'" form)))))
-
-(defun rx--subforms (subforms &optional parent separator)
- "Produce code for regular expressions SUBFORMS.
-SUBFORMS is a list of regular expression sexps.
-PARENT controls grouping, as in `rx-form'.
-Insert SEPARATOR between the code from each of SUBFORMS."
- (if (null (cdr subforms))
- ;; Zero or one forms, no need for grouping.
- (and subforms (rx-form (car subforms)))
- (let ((listify (lambda (x)
- (if (listp x) (copy-sequence x)
- (list x)))))
- (setq subforms (mapcar (lambda (x) (rx-form x parent)) subforms))
- (cond ((or (not rx--compile-to-lisp)
- (cl-every #'stringp subforms))
- (mapconcat #'identity subforms separator))
- (separator
- (nconc (funcall listify (car subforms))
- (mapcan (lambda (x)
- (cons separator (funcall listify x)))
- (cdr subforms))))
- (t (mapcan listify subforms))))))
+ (error "Unknown rx syntax name `%s'" sym)))
+ (cons (list (string ?\\ (if negated ?S ?s) syntax))
+ t)))
+
+(defconst rx--categories
+ '((space-for-indent . ?\s)
+ (base . ?.)
+ (consonant . ?0)
+ (base-vowel . ?1)
+ (upper-diacritical-mark . ?2)
+ (lower-diacritical-mark . ?3)
+ (tone-mark . ?4)
+ (symbol . ?5)
+ (digit . ?6)
+ (vowel-modifying-diacritical-mark . ?7)
+ (vowel-sign . ?8)
+ (semivowel-lower . ?9)
+ (not-at-end-of-line . ?<)
+ (not-at-beginning-of-line . ?>)
+ (alpha-numeric-two-byte . ?A)
+ (chinese-two-byte . ?C)
+ (chinse-two-byte . ?C) ; A typo in Emacs 21.1-24.3.
+ (greek-two-byte . ?G)
+ (japanese-hiragana-two-byte . ?H)
+ (indian-two-byte . ?I)
+ (japanese-katakana-two-byte . ?K)
+ (strong-left-to-right . ?L)
+ (korean-hangul-two-byte . ?N)
+ (strong-right-to-left . ?R)
+ (cyrillic-two-byte . ?Y)
+ (combining-diacritic . ?^)
+ (ascii . ?a)
+ (arabic . ?b)
+ (chinese . ?c)
+ (ethiopic . ?e)
+ (greek . ?g)
+ (korean . ?h)
+ (indian . ?i)
+ (japanese . ?j)
+ (japanese-katakana . ?k)
+ (latin . ?l)
+ (lao . ?o)
+ (tibetan . ?q)
+ (japanese-roman . ?r)
+ (thai . ?t)
+ (vietnamese . ?v)
+ (hebrew . ?w)
+ (cyrillic . ?y)
+ (can-break . ?|)))
+
+(defun rx--translate-category (negated body)
+ "Translate the `category' form. Return (REGEXP . PRECEDENCE)."
+ (unless (and body (null (cdr body)))
+ (error "rx `category' form takes exactly one argument"))
+ (let* ((arg (car body))
+ (category
+ (cond ((symbolp arg)
+ (let ((cat (assq arg rx--categories)))
+ (unless cat
+ (error "Unknown rx category `%s'" arg))
+ (cdr cat)))
+ ((characterp arg) arg)
+ (t (error "Invalid rx `category' argument `%s'" arg)))))
+ (cons (list (string ?\\ (if negated ?C ?c) category))
+ t)))
+
+(defvar rx--delayed-evaluation nil
+ "Whether to allow certain forms to be evaluated at runtime.")
+
+(defun rx--translate-literal (body)
+ "Translate the `literal' form. Return (REGEXP . PRECEDENCE)."
+ (unless (and body (null (cdr body)))
+ (error "rx `literal' form takes exactly one argument"))
+ (let ((arg (car body)))
+ (cond ((stringp arg)
+ (cons (list (regexp-quote arg)) (if (= (length arg) 1) t 'seq)))
+ (rx--delayed-evaluation
+ (cons (list (list 'regexp-quote arg)) 'seq))
+ (t (error "rx `literal' form with non-string argument")))))
+
+(defun rx--translate-eval (body)
+ "Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
+ (unless (and body (null (cdr body)))
+ (error "rx `eval' form takes exactly one argument"))
+ (rx--translate (eval (car body))))
+
+(defvar rx--regexp-atomic-regexp nil)
+
+(defun rx--translate-regexp (body)
+ "Translate the `regexp' form. Return (REGEXP . PRECEDENCE)."
+ (unless (and body (null (cdr body)))
+ (error "rx `regexp' form takes exactly one argument"))
+ (let ((arg (car body)))
+ (cond ((stringp arg)
+ ;; Generate the regexp when needed, since rx isn't
+ ;; necessarily present in the byte-compilation environment.
+ (unless rx--regexp-atomic-regexp
+ (setq rx--regexp-atomic-regexp
+ ;; Match atomic (precedence t) regexps: may give
+ ;; false negatives but no false positives, assuming
+ ;; the target string is syntactically correct.
+ (rx-to-string
+ '(seq
+ bos
+ (or (seq "["
+ (opt "^")
+ (opt "]")
+ (* (or (seq "[:" (+ (any "a-z")) ":]")
+ (not (any "]"))))
+ "]")
+ anything
+ (seq "\\"
+ (or anything
+ (seq (any "sScC_") anything)
+ (seq "("
+ (* (or (not (any "\\"))
+ (seq "\\" (not (any ")")))))
+ "\\)"))))
+ eos)
+ t)))
+ (cons (list arg)
+ (if (string-match-p rx--regexp-atomic-regexp arg) t nil)))
+ (rx--delayed-evaluation
+ (cons (list arg) nil))
+ (t (error "rx `regexp' form with non-string argument")))))
+
+(defun rx--translate-compat-form (def form)
+ "Translate a compatibility form from `rx-constituents'.
+DEF is the definition tuple. Return (REGEXP . PRECEDENCE)."
+ (let* ((fn (nth 0 def))
+ (min-args (nth 1 def))
+ (max-args (nth 2 def))
+ (predicate (nth 3 def))
+ (nargs (1- (length form))))
+ (when (< nargs min-args)
+ (error "The `%s' form takes at least %d argument(s)"
+ (car form) min-args))
+ (when (and max-args (> nargs max-args))
+ (error "The `%s' form takes at most %d argument(s)"
+ (car form) max-args))
+ (when (and predicate (not (rx--every predicate (cdr form))))
+ (error "The `%s' form requires arguments satisfying `%s'"
+ (car form) predicate))
+ (let ((regexp (funcall fn form)))
+ (unless (stringp regexp)
+ (error "The `%s' form did not expand to a string" (car form)))
+ (cons (list regexp) nil))))
+
+(defun rx--translate-form (form)
+ "Translate an rx form (list structure). Return (REGEXP . PRECEDENCE)."
+ (let ((body (cdr form)))
+ (pcase (car form)
+ ((or 'seq : 'and 'sequence) (rx--translate-seq body))
+ ((or 'or '|) (rx--translate-or body))
+ ((or 'any 'in 'char) (rx--translate-any nil body))
+ ('not-char (rx--translate-any t body))
+ ('not (rx--translate-not nil body))
+
+ ('repeat (rx--translate-repeat body))
+ ('= (rx--translate-= body))
+ ('>= (rx--translate->= body))
+ ('** (rx--translate-** body))
+
+ ((or 'zero-or-more '0+) (rx--translate-rep "*" rx--greedy body))
+ ((or 'one-or-more '1+) (rx--translate-rep "+" rx--greedy body))
+ ((or 'zero-or-one 'opt 'optional) (rx--translate-rep "?" rx--greedy body))
+
+ ('* (rx--translate-rep "*" t body))
+ ('+ (rx--translate-rep "+" t body))
+ ((or '\? ?\s) (rx--translate-rep "?" t body))
+
+ ('*? (rx--translate-rep "*" nil body))
+ ('+? (rx--translate-rep "+" nil body))
+ ((or '\?? ??) (rx--translate-rep "?" nil body))
+
+ ('minimal-match (rx--control-greedy nil body))
+ ('maximal-match (rx--control-greedy t body))
+
+ ((or 'group 'submatch) (rx--translate-group body))
+ ((or 'group-n 'submatch-n) (rx--translate-group-n body))
+ ('backref (rx--translate-backref body))
+
+ ('syntax (rx--translate-syntax nil body))
+ ('not-syntax (rx--translate-syntax t body))
+ ('category (rx--translate-category nil body))
+
+ ('literal (rx--translate-literal body))
+ ('eval (rx--translate-eval body))
+ ((or 'regexp 'regex) (rx--translate-regexp body))
+
+ (op
+ (unless (symbolp op)
+ (error "Bad rx operator `%S'" op))
+
+ ;; For compatibility with old rx.
+ (let ((entry (assq op rx-constituents)))
+ (if (progn
+ (while (and entry (not (consp (cdr entry))))
+ (setq entry
+ (if (symbolp (cdr entry))
+ ;; Alias for another entry.
+ (assq (cdr entry) rx-constituents)
+ ;; Wrong type, try further down the list.
+ (assq (car entry)
+ (cdr (memq entry rx-constituents))))))
+ entry)
+ (rx--translate-compat-form (cdr entry) form)
+ (error "Unknown rx form `%s'" op)))))))
+
+;; Defined here rather than in re-builder to lower the odds that it
+;; will be kept in sync with changes.
+(defconst rx--builtin-forms
+ '(seq sequence : and or | any in char not-char not
+ repeat = >= **
+ zero-or-more 0+ *
+ one-or-more 1+ +
+ zero-or-one opt optional \?
+ *? +? \??
+ minimal-match maximal-match
+ group submatch group-n submatch-n backref
+ syntax not-syntax category
+ literal eval regexp regex)
+ "List of built-in rx forms. For use in re-builder only.")
+
+(defun rx--translate (item)
+ "Translate the rx-expression ITEM. Return (REGEXP . PRECEDENCE)."
+ (cond
+ ((stringp item)
+ (if (= (length item) 0)
+ (cons nil 'seq)
+ (cons (list (regexp-quote item)) (if (= (length item) 1) t 'seq))))
+ ((characterp item)
+ (cons (list (regexp-quote (char-to-string item))) t))
+ ((symbolp item)
+ (rx--translate-symbol item))
+ ((consp item)
+ (rx--translate-form item))
+ (t (error "Bad rx expression: %S" item))))
;;;###autoload
(defun rx-to-string (form &optional no-group)
- "Parse and produce code for regular expression FORM.
-FORM is a regular expression in sexp form.
-NO-GROUP non-nil means don't put shy groups around the result.
-
-In contrast to the `rx' macro, subforms `literal' and `regexp'
-will not accept non-string arguments, i.e., (literal STRING)
-becomes just a more verbose version of STRING."
- (rx-group-if (rx-form form) (null no-group)))
+ "Translate FORM from `rx' sexp syntax into a string regexp.
+The arguments to `literal' and `regexp' forms inside FORM must be
+constant strings.
+If NO-GROUP is non-nil, don't bracket the result in a non-capturing
+group."
+ (let* ((item (rx--translate form))
+ (exprs (if no-group
+ (car item)
+ (rx--atomic-regexp item))))
+ (apply #'concat exprs)))
+
+(defun rx--to-expr (form)
+ "Translate the rx-expression FORM to a Lisp expression yielding a regexp."
+ (let* ((rx--delayed-evaluation t)
+ (elems (car (rx--translate form)))
+ (args nil))
+ ;; Merge adjacent strings.
+ (while elems
+ (let ((strings nil))
+ (while (and elems (stringp (car elems)))
+ (push (car elems) strings)
+ (setq elems (cdr elems)))
+ (let ((s (apply #'concat (nreverse strings))))
+ (unless (zerop (length s))
+ (push s args))))
+ (when elems
+ (push (car elems) args)
+ (setq elems (cdr elems))))
+ (cond ((null args) "") ; 0 args
+ ((cdr args) (cons 'concat (nreverse args))) ; ≥2 args
+ (t (car args))))) ; 1 arg
;;;###autoload
(literal EXPR) Match the literal string from evaluating EXPR at run time.
(regexp EXPR) Match the string regexp from evaluating EXPR at run time.
-(eval EXPR) Match the rx sexp from evaluating EXPR at compile time."
- (let* ((rx--compile-to-lisp t)
- (re (cond ((null regexps)
- (error "No regexp"))
- ((cdr regexps)
- (rx-to-string `(and ,@regexps) t))
- (t
- (rx-to-string (car regexps) t)))))
- (if (stringp re)
- re
- `(concat ,@re))))
-
+(eval EXPR) Match the rx sexp from evaluating EXPR at compile time.
+
+\(fn REGEXPS...)"
+ (rx--to-expr (cons 'seq regexps)))
+
+
+;; During `rx--pcase-transform', list of defined variables in right-to-left
+;; order.
+(defvar rx--pcase-vars)
+
+(defun rx--pcase-transform (rx)
+ "Transform RX, an rx-expression augmented with `let' and named `backref',
+into a plain rx-expression, collecting names into `rx--pcase-vars'."
+ (pcase rx
+ (`(let ,name . ,body)
+ (let* ((index (length (memq name rx--pcase-vars)))
+ (i (if (zerop index)
+ (length (push name rx--pcase-vars))
+ index)))
+ `(group-n ,i ,(rx--pcase-transform (cons 'seq body)))))
+ ((and `(backref ,ref)
+ (guard (symbolp ref)))
+ (let ((index (length (memq ref rx--pcase-vars))))
+ (when (zerop index)
+ (error "rx `backref' variable must be one of: %s"
+ (mapconcat #'symbol-name rx--pcase-vars " ")))
+ `(backref ,index)))
+ ((and `(,head . ,rest)
+ (guard (and (symbolp head)
+ (not (memq head '(literal regexp regex eval))))))
+ (cons head (mapcar #'rx--pcase-transform rest)))
+ (_ rx)))
(pcase-defmacro rx (&rest regexps)
- "Build a `pcase' pattern matching `rx' REGEXPS in sexp form.
-The REGEXPS are interpreted as in `rx'. The pattern matches any
-string that is a match for the regular expression so constructed,
-as if by `string-match'.
+ "A pattern that matches strings against `rx' REGEXPS in sexp form.
+REGEXPS are interpreted as in `rx'. The pattern matches any
+string that is a match for REGEXPS, as if by `string-match'.
-In addition to the usual `rx' constructs, REGEXPS can contain the
+In addition to the usual `rx' syntax, REGEXPS can contain the
following constructs:
- (let REF SEXP...) creates a new explicitly named reference to
- a submatch that matches regular expressions
- SEXP, and binds the match to REF.
- (backref REF) creates a backreference to the submatch
- introduced by a previous (let REF ...)
- construct. REF can be the same symbol
- in the first argument of the corresponding
- (let REF ...) construct, or it can be a
- submatch number. It matches the referenced
- submatch.
-
-The REFs are associated with explicitly named submatches starting
-from 1. Multiple occurrences of the same REF refer to the same
-submatch.
-
-If a case matches, the match data is modified as usual so you can
-use it in the case body, but you still have to pass the correct
-string as argument to `match-string'."
- (let* ((vars ())
- (rx-constituents
- `((let
- ,(lambda (form)
- (rx-check form)
- (let ((var (cadr form)))
- (cl-check-type var symbol)
- (let ((i (or (cl-position var vars :test #'eq)
- (prog1 (length vars)
- (setq vars `(,@vars ,var))))))
- (rx-form `(submatch-n ,(1+ i) ,@(cddr form))))))
- 1 nil)
- (backref
- ,(lambda (form)
- (rx-check form)
- (rx-backref
- `(backref ,(let ((var (cadr form)))
- (if (integerp var) var
- (1+ (cl-position var vars :test #'eq)))))))
- 1 1
- ,(lambda (var)
- (cond ((integerp var) (rx-check-backref var))
- ((memq var vars) t)
- (t (error "rx `backref' variable must be one of %s: %s"
- vars var)))))
- ,@rx-constituents))
- (regexp (rx-to-string `(seq ,@regexps) :no-group)))
+ (let REF RX...) binds the symbol REF to a submatch that matches
+ the regular expressions RX. REF is bound in
+ CODE to the string of the submatch or nil, but
+ can also be used in `backref'.
+ (backref REF) matches whatever the submatch REF matched.
+ REF can be a number, as usual, or a name
+ introduced by a previous (let REF ...)
+ construct."
+ (let* ((rx--pcase-vars nil)
+ (regexp (rx--to-expr (rx--pcase-transform (cons 'seq regexps)))))
`(and (pred (string-match ,regexp))
- ,@(cl-loop for i from 1
- for var in vars
- collect `(app (match-string ,i) ,var)))))
-\f
+ ,@(let ((i 0))
+ (mapcar (lambda (name)
+ (setq i (1+ i))
+ `(app (match-string ,i) ,name))
+ (reverse rx--pcase-vars))))))
+
(provide 'rx)
;;; rx.el ends here
-;;; rx-tests.el --- test for rx.el functions -*- lexical-binding: t -*-
+;;; rx-tests.el --- tests for rx.el -*- lexical-binding: t -*-
;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
-;;; Commentary:
-
(require 'ert)
(require 'rx)
-;;; Code:
+(ert-deftest rx-seq ()
+ (should (equal (rx "a.b" "*" "c")
+ "a\\.b\\*c"))
+ (should (equal (rx (seq "a" (: "b" (and "c" (sequence "d" nonl)
+ "e")
+ "f")
+ "g"))
+ "abcd.efg"))
+ (should (equal (rx "a$" "b")
+ "a\\$b"))
+ (should (equal (rx bol "a" "b" ?c eol)
+ "^abc$"))
+ (should (equal (rx "a" "" "b")
+ "ab"))
+ (should (equal (rx (seq))
+ ""))
+ (should (equal (rx "" (or "ab" nonl) "")
+ "ab\\|.")))
+
+(ert-deftest rx-or ()
+ (should (equal (rx (or "ab" (| "c" nonl) "de"))
+ "ab\\|c\\|.\\|de"))
+ (should (equal (rx (or "ab" "abc" "a"))
+ "\\(?:ab\\|abc\\|a\\)"))
+ (should (equal (rx (| nonl "a") (| "b" blank))
+ "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
+ (should (equal (rx (|))
+ "\\`a\\`")))
(ert-deftest rx-char-any ()
"Test character alternatives with `]' and `-' (Bug#25123)."
- (should (string-match
+ (should (equal
(rx string-start (1+ (char (?\] . ?\{) (?< . ?\]) (?- . ?:)))
string-end)
- (apply #'string (nconc (number-sequence ?\] ?\{)
- (number-sequence ?< ?\])
- (number-sequence ?- ?:))))))
+ "\\`[.-:<-{-]+\\'")))
(ert-deftest rx-char-any-range-nl ()
"Test character alternatives with LF as a range endpoint."
(should (equal (rx (any "\a-\n"))
"[\a-\n]")))
-(ert-deftest rx-char-any-range-bad ()
- (should-error (rx (any "0-9a-Z")))
- (should-error (rx (any (?0 . ?9) (?a . ?Z)))))
-
(ert-deftest rx-char-any-raw-byte ()
"Test raw bytes in character alternatives."
+
+ ;; The multibyteness of the rx return value sometimes depends on whether
+ ;; the test had been byte-compiled or not, so we add explicit conversions.
+
;; Separate raw characters.
- (should (equal (string-match-p (rx (any "\326A\333B"))
- "X\326\333")
- 1))
+ (should (equal (string-to-multibyte (rx (any "\326A\333B")))
+ (string-to-multibyte "[AB\326\333]")))
;; Range of raw characters, unibyte.
- (should (equal (string-match-p (rx (any "\200-\377"))
- "ÿA\310B")
- 2))
+ (should (equal (string-to-multibyte (rx (any "\200-\377")))
+ (string-to-multibyte "[\200-\377]")))
+
;; Range of raw characters, multibyte.
- (should (equal (string-match-p (rx (any "Å\211\326-\377\177"))
- "XY\355\177\327")
- 2))
+ (should (equal (rx (any "Å\211\326-\377\177"))
+ "[\177Å\211\326-\377]"))
;; Split range; \177-\377ÿ should not be optimised to \177-\377.
- (should (equal (string-match-p (rx (any "\177-\377" ?ÿ))
- "ÿA\310B")
- 0)))
+ (should (equal (rx (any "\177-\377" ?ÿ))
+ "[\177ÿ\200-\377]")))
+
+(ert-deftest rx-any ()
+ (should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS"))
+ "[ACDF-HJ-S]"))
+ (should (equal (rx (in "a!f" ?c) (char "q-z" "0-3")
+ (not-char "a-e1-5") (not (in "A-M" ?q)))
+ "[!acf][0-3q-z][^1-5a-e][^A-Mq]"))
+ (should (equal (rx (any "^") (any "]") (any "-")
+ (not (any "^")) (not (any "]")) (not (any "-")))
+ "\\^]-[^^][^]][^-]"))
+ (should (equal (rx (any "]" "^") (any "]" "-") (any "-" "^")
+ (not (any "]" "^")) (not (any "]" "-"))
+ (not (any "-" "^")))
+ "[]^][]-][-^][^]^][^]-][^-^]"))
+ (should (equal (rx (any "]" "^" "-") (not (any "]" "^" "-")))
+ "[]^-][^]^-]"))
+ (should (equal (rx (any "-" ascii) (any "^" ascii) (any "]" ascii))
+ "[[:ascii:]-][[:ascii:]^][][:ascii:]]"))
+ (should (equal (rx (not (any "-" ascii)) (not (any "^" ascii))
+ (not (any "]" ascii)))
+ "[^[:ascii:]-][^[:ascii:]^][^][:ascii:]]"))
+ (should (equal (rx (any "-]" ascii) (any "^]" ascii) (any "-^" ascii))
+ "[][:ascii:]-][]^[:ascii:]][[:ascii:]^-]"))
+ (should (equal (rx (not (any "-]" ascii)) (not (any "^]" ascii))
+ (not (any "-^" ascii)))
+ "[^][:ascii:]-][^]^[:ascii:]][^[:ascii:]^-]"))
+ (should (equal (rx (any "-]^" ascii) (not (any "-]^" ascii)))
+ "[]^[:ascii:]-][^]^[:ascii:]-]"))
+ (should (equal (rx (any "^" lower upper) (not (any "^" lower upper)))
+ "[[:lower:]^[:upper:]][^[:lower:]^[:upper:]]"))
+ (should (equal (rx (any "-" lower upper) (not (any "-" lower upper)))
+ "[[:lower:][:upper:]-][^[:lower:][:upper:]-]"))
+ (should (equal (rx (any "]" lower upper) (not (any "]" lower upper)))
+ "[][:lower:][:upper:]][^][:lower:][:upper:]]"))
+ (should (equal (rx (any "-a" "c-" "f-f" "--/*--"))
+ "[*-/acf]"))
+ (should (equal (rx (any "]-a" ?-) (not (any "]-a" ?-)))
+ "[]-a-][^]-a-]"))
+ (should (equal (rx (any "--]") (not (any "--]"))
+ (any "-" "^-a") (not (any "-" "^-a")))
+ "[].-\\-][^].-\\-][-^-a][^-^-a]"))
+ (should (equal (rx (not (any "!a" "0-8" digit nonascii)))
+ "[^!0-8a[:digit:][:nonascii:]]"))
+ (should (equal (rx (any) (not (any)))
+ "\\`a\\`\\(?:.\\|\n\\)"))
+ (should (equal (rx (any "") (not (any "")))
+ "\\`a\\`\\(?:.\\|\n\\)")))
(ert-deftest rx-pcase ()
(should (equal (pcase "a 1 2 3 1 1 b"
(backref u) space
(backref 1))
(list u v)))
- '("1" "3"))))
+ '("1" "3")))
+ (let ((k "blue"))
+ (should (equal (pcase "<blue>"
+ ((rx "<" (literal k) ">") 'ok))
+ 'ok))))
(ert-deftest rx-kleene ()
"Test greedy and non-greedy repetition operators."
(should (equal (rx (maximal-match
(seq (* "a") (+ "b") (\? "c") (?\s "d")
(*? "e") (+? "f") (\?? "g") (?? "h"))))
- "a*b+c?d?e*?f+?g??h??")))
+ "a*b+c?d?e*?f+?g??h??"))
+ (should (equal (rx "a" (*) (+ (*)) (? (*) (+)) "b")
+ "ab")))
-(ert-deftest rx-or ()
- ;; Test or-pattern reordering (Bug#34641).
- (let ((s "abc"))
- (should (equal (and (string-match (rx (or "abc" "ab" "a")) s)
- (match-string 0 s))
- "abc"))
- (should (equal (and (string-match (rx (or "ab" "abc" "a")) s)
- (match-string 0 s))
- "ab"))
- (should (equal (and (string-match (rx (or "a" "ab" "abc")) s)
- (match-string 0 s))
- "a")))
- ;; Test zero-argument `or'.
- (should (equal (rx (or)) regexp-unmatchable)))
+(ert-deftest rx-repeat ()
+ (should (equal (rx (= 3 "a") (>= 51 "b")
+ (** 2 11 "c") (repeat 6 "d") (repeat 4 8 "e"))
+ "a\\{3\\}b\\{51,\\}c\\{2,11\\}d\\{6\\}e\\{4,8\\}"))
+ (should (equal (rx (= 0 "k") (>= 0 "l") (** 0 0 "m") (repeat 0 "n")
+ (repeat 0 0 "o"))
+ "k\\{0\\}l\\{0,\\}m\\{0\\}n\\{0\\}o\\{0\\}"))
+ (should (equal (rx (opt (0+ "a")))
+ "\\(?:a*\\)?"))
+ (should (equal (rx (opt (= 4 "a")))
+ "a\\{4\\}?"))
+ (should (equal (rx "a" (** 3 7) (= 4) (>= 3) (= 4 (>= 7) (= 2)) "b")
+ "ab")))
+
+(ert-deftest rx-atoms ()
+ (should (equal (rx anything)
+ ".\\|\n"))
+ (should (equal (rx line-start not-newline nonl any line-end)
+ "^...$"))
+ (should (equal (rx bol string-start string-end buffer-start buffer-end
+ bos eos bot eot eol)
+ "^\\`\\'\\`\\'\\`\\'\\`\\'$"))
+ (should (equal (rx point word-start word-end bow eow symbol-start symbol-end
+ word-boundary not-word-boundary not-wordchar)
+ "\\=\\<\\>\\<\\>\\_<\\_>\\b\\B\\W"))
+ (should (equal (rx digit numeric num control cntrl)
+ "[[:digit:]][[:digit:]][[:digit:]][[:cntrl:]][[:cntrl:]]"))
+ (should (equal (rx hex-digit hex xdigit blank)
+ "[[:xdigit:]][[:xdigit:]][[:xdigit:]][[:blank:]]"))
+ (should (equal (rx graph graphic print printing)
+ "[[:graph:]][[:graph:]][[:print:]][[:print:]]"))
+ (should (equal (rx alphanumeric alnum letter alphabetic alpha)
+ "[[:alnum:]][[:alnum:]][[:alpha:]][[:alpha:]][[:alpha:]]"))
+ (should (equal (rx ascii nonascii lower lower-case)
+ "[[:ascii:]][[:nonascii:]][[:lower:]][[:lower:]]"))
+ (should (equal (rx punctuation punct space whitespace white)
+ "[[:punct:]][[:punct:]][[:space:]][[:space:]][[:space:]]"))
+ (should (equal (rx upper upper-case word wordchar)
+ "[[:upper:]][[:upper:]][[:word:]][[:word:]]"))
+ (should (equal (rx unibyte multibyte)
+ "[[:unibyte:]][[:multibyte:]]")))
+
+(ert-deftest rx-syntax ()
+ (should (equal (rx (syntax whitespace) (syntax punctuation)
+ (syntax word) (syntax symbol)
+ (syntax open-parenthesis) (syntax close-parenthesis))
+ "\\s-\\s.\\sw\\s_\\s(\\s)"))
+ (should (equal (rx (syntax string-quote) (syntax paired-delimiter)
+ (syntax escape) (syntax character-quote)
+ (syntax comment-start) (syntax comment-end)
+ (syntax string-delimiter) (syntax comment-delimiter))
+ "\\s\"\\s$\\s\\\\s/\\s<\\s>\\s|\\s!")))
+
+(ert-deftest rx-category ()
+ (should (equal (rx (category space-for-indent) (category base)
+ (category consonant) (category base-vowel)
+ (category upper-diacritical-mark)
+ (category lower-diacritical-mark)
+ (category tone-mark) (category symbol)
+ (category digit)
+ (category vowel-modifying-diacritical-mark)
+ (category vowel-sign) (category semivowel-lower)
+ (category not-at-end-of-line)
+ (category not-at-beginning-of-line))
+ "\\c \\c.\\c0\\c1\\c2\\c3\\c4\\c5\\c6\\c7\\c8\\c9\\c<\\c>"))
+ (should (equal (rx (category alpha-numeric-two-byte)
+ (category chinese-two-byte) (category greek-two-byte)
+ (category japanese-hiragana-two-byte)
+ (category indian-two-byte)
+ (category japanese-katakana-two-byte)
+ (category strong-left-to-right)
+ (category korean-hangul-two-byte)
+ (category strong-right-to-left)
+ (category cyrillic-two-byte)
+ (category combining-diacritic))
+ "\\cA\\cC\\cG\\cH\\cI\\cK\\cL\\cN\\cR\\cY\\c^"))
+ (should (equal (rx (category ascii) (category arabic) (category chinese)
+ (category ethiopic) (category greek) (category korean)
+ (category indian) (category japanese)
+ (category japanese-katakana) (category latin)
+ (category lao) (category tibetan))
+ "\\ca\\cb\\cc\\ce\\cg\\ch\\ci\\cj\\ck\\cl\\co\\cq"))
+ (should (equal (rx (category japanese-roman) (category thai)
+ (category vietnamese) (category hebrew)
+ (category cyrillic) (category can-break))
+ "\\cr\\ct\\cv\\cw\\cy\\c|"))
+ (should (equal (rx (category ?g) (not (category ?~)))
+ "\\cg\\C~")))
+
+(ert-deftest rx-not ()
+ (should (equal (rx (not word-boundary))
+ "\\B"))
+ (should (equal (rx (not ascii) (not lower-case) (not wordchar))
+ "[^[:ascii:]][^[:lower:]][^[:word:]]"))
+ (should (equal (rx (not (syntax punctuation)) (not (syntax escape)))
+ "\\S.\\S\\"))
+ (should (equal (rx (not (category tone-mark)) (not (category lao)))
+ "\\C4\\Co")))
+
+(ert-deftest rx-group ()
+ (should (equal (rx (group nonl) (submatch "x")
+ (group-n 3 "y") (submatch-n 13 "z") (backref 1))
+ "\\(.\\)\\(x\\)\\(?3:y\\)\\(?13:z\\)\\1"))
+ (should (equal (rx (group) (group-n 2))
+ "\\(\\)\\(?2:\\)")))
+
+(ert-deftest rx-regexp ()
+ (should (equal (rx (regexp "abc") (regex "[de]"))
+ "\\(?:abc\\)[de]"))
+ (let ((x "a*"))
+ (should (equal (rx (regexp x) "b")
+ "\\(?:a*\\)b"))
+ (should (equal (rx "" (regexp x) (eval ""))
+ "a*"))))
+
+(ert-deftest rx-eval ()
+ (should (equal (rx (eval (list 'syntax 'symbol)))
+ "\\s_"))
+ (should (equal (rx "a" (eval (concat)) "b")
+ "ab")))
+
+(ert-deftest rx-literal ()
+ (should (equal (rx (literal (char-to-string 42)) nonl)
+ "\\*."))
+ (let ((x "a+b"))
+ (should (equal (rx (opt (literal (upcase x))))
+ "\\(?:A\\+B\\)?"))))
+
+(ert-deftest rx-to-string ()
+ (should (equal (rx-to-string '(or nonl "\nx"))
+ "\\(?:.\\|\nx\\)"))
+ (should (equal (rx-to-string '(or nonl "\nx") t)
+ ".\\|\nx")))
+
+
+(ert-deftest rx-constituents ()
+ (let ((rx-constituents
+ (append '((beta . gamma)
+ (gamma . "a*b")
+ (delta . ((lambda (form)
+ (regexp-quote (format "<%S>" form)))
+ 1 nil symbolp))
+ (epsilon . delta))
+ rx-constituents)))
+ (should (equal (rx-to-string '(seq (+ beta) nonl gamma) t)
+ "\\(?:a*b\\)+.\\(?:a*b\\)"))
+ (should (equal (rx-to-string '(seq (delta a b c) (* (epsilon d e))) t)
+ "\\(?:<(delta a b c)>\\)\\(?:<(epsilon d e)>\\)*"))))
-(ert-deftest rx-seq ()
- ;; Test zero-argument `seq'.
- (should (equal (rx (seq)) "")))
-
-(defmacro rx-tests--match (regexp string &optional match)
- (macroexp-let2 nil strexp string
- `(ert-info ((format "Matching %S to %S" ',regexp ,strexp))
- (should (string-match ,regexp ,strexp))
- ,@(when match
- `((should (equal (match-string 0 ,strexp) ,match)))))))
-
-(ert-deftest rx-nonstring-expr ()
- (let ((bee "b")
- (vowel "[aeiou]"))
- (rx-tests--match (rx "a" (literal bee) "c") "abc")
- (rx-tests--match (rx "a" (regexp bee) "c") "abc")
- (rx-tests--match (rx "a" (or (regexp bee) "xy") "c") "abc")
- (rx-tests--match (rx "a" (or "xy" (regexp bee)) "c") "abc")
- (should-not (string-match (rx (or (regexp bee) "xy")) ""))
- (rx-tests--match (rx "a" (= 3 (regexp bee)) "c") "abbbc")
- (rx-tests--match (rx "x" (= 3 (regexp vowel)) "z") "xeoez")
- (should-not (string-match (rx "x" (= 3 (regexp vowel)) "z") "xe[]z"))
- (rx-tests--match (rx "x" (= 3 (literal vowel)) "z")
- "x[aeiou][aeiou][aeiou]z")
- (rx-tests--match (rx "x" (repeat 1 (regexp vowel)) "z") "xaz")
- (rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xaz")
- (rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xauz")
- (rx-tests--match (rx "x" (>= 1 (regexp vowel)) "z") "xaiiz")
- (rx-tests--match (rx "x" (** 1 2 (regexp vowel)) "z") "xaiz")
- (rx-tests--match (rx "x" (group (regexp vowel)) "z") "xaz")
- (rx-tests--match (rx "x" (group-n 1 (regexp vowel)) "z") "xaz")
- (rx-tests--match (rx "x" (? (regexp vowel)) "z") "xz")))
-
-(ert-deftest rx-nonstring-expr-non-greedy ()
- "`rx's greediness can't affect runtime regexp parts."
- (let ((ad-min "[ad]*?")
- (ad-max "[ad]*")
- (ad "[ad]"))
- (rx-tests--match (rx "c" (regexp ad-min) "a") "cdaaada" "cda")
- (rx-tests--match (rx "c" (regexp ad-max) "a") "cdaaada" "cdaaada")
- (rx-tests--match (rx "c" (minimal-match (regexp ad-max)) "a") "cdaaada" "cdaaada")
- (rx-tests--match (rx "c" (maximal-match (regexp ad-min)) "a") "cdaaada" "cda")
- (rx-tests--match (rx "c" (minimal-match (0+ (regexp ad))) "a") "cdaaada" "cda")
- (rx-tests--match (rx "c" (maximal-match (0+ (regexp ad))) "a") "cdaaada" "cdaaada")))
-
-(ert-deftest rx-to-string-lisp-forms ()
- (rx-tests--match (rx-to-string '(seq "a" (literal "b") "c")) "abc")
- (rx-tests--match (rx-to-string '(seq "a" (regexp "b") "c")) "abc"))
(provide 'rx-tests)
-;; rx-tests.el ends here.