From: Stefan Monnier Date: Sun, 26 Dec 2010 23:17:09 +0000 (-0500) Subject: * lisp/emacs-lisp/rx.el: Make it a superset of sregex. X-Git-Tag: emacs-pretest-24.0.90~104^2~618^2~1322^2~278^2~64 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=723ee192a5b3c6ebf589d325d5d004e57ce42652;p=emacs.git * lisp/emacs-lisp/rx.el: Make it a superset of sregex. (rx-constituents): Add `any => "."', mark `repeat' as taking any number of args, add `regex' alias. (rx-info): Add arg to distinguish head and standalone forms. (rx-check, rx-form): Pass the corresponding arg. (rx-**): Simplify. (rx-repeat): Make it work for any number of args. (rx-syntax): Make it accept syntax chars as is. * lisp/obsolete/sregex.el: Move from emacs-lisp/. * lisp/emacs-lisp/re-builder.el: Remove sregex support. * lisp/emacs-lisp/edebug.el (sregexq, rx): Remove redundant defs. --- diff --git a/etc/NEWS b/etc/NEWS index f7288de8b13..f21028adc8c 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -541,6 +541,8 @@ listing object name completions when being sent text via *** An API for manipulating SQL product definitions has been added. +** sregex.el is now obsolete, since rx.el is a strict superset. + ** s-region.el is now declared obsolete, superceded by shift-select-mode enabled by default in 23.1. diff --git a/lisp/ChangeLog b/lisp/ChangeLog index ccf5b5c40ea..21d90eee903 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,3 +1,17 @@ +2010-12-26 Stefan Monnier + + * emacs-lisp/rx.el: Make it a superset of sregex. + (rx-constituents): Add `any => "."', mark `repeat' as taking any number + of args, add `regex' alias. + (rx-info): Add arg to distinguish head and standalone forms. + (rx-check, rx-form): Pass the corresponding arg. + (rx-**): Simplify. + (rx-repeat): Make it work for any number of args. + (rx-syntax): Make it accept syntax chars as is. + * obsolete/sregex.el: Move from emacs-lisp/. + * emacs-lisp/re-builder.el: Remove sregex support. 
+ * emacs-lisp/edebug.el (sregexq, rx): Remove redundant defs. + 2010-12-25 Eli Zaretskii * mouse.el (mouse-yank-primary): On MS-Windows, try the (emulated) @@ -19,8 +33,8 @@ 2010-12-21 Daiki Ueno * obsolete/pgg-parse.el, obsolete/pgg-pgp5.el, obsolete/pgg-pgp.el, - * obsolete/pgg-gpg.el, obsolete/pgg-def.el, obsolete/pgg.el: Move - from lisp/. + * obsolete/pgg-gpg.el, obsolete/pgg-def.el, obsolete/pgg.el: + Move from lisp/. 2010-12-20 Leo diff --git a/lisp/emacs-lisp/edebug.el b/lisp/emacs-lisp/edebug.el index 77953b37021..d4af24aaaff 100644 --- a/lisp/emacs-lisp/edebug.el +++ b/lisp/emacs-lisp/edebug.el @@ -2131,8 +2131,6 @@ expressions; a `progn' form will be returned enclosing these forms." (def-edebug-spec with-custom-print body) -(def-edebug-spec sregexq (&rest sexp)) -(def-edebug-spec rx (&rest sexp)) ;;; The debugger itself diff --git a/lisp/emacs-lisp/re-builder.el b/lisp/emacs-lisp/re-builder.el index 1845effd5bb..eacabf72c95 100644 --- a/lisp/emacs-lisp/re-builder.el +++ b/lisp/emacs-lisp/re-builder.el @@ -60,8 +60,8 @@ ;; even the auto updates go all the way. Forcing an update overrides ;; this limit allowing an easy way to see all matches. -;; Currently `re-builder' understands five different forms of input, -;; namely `read', `string', `rx', and `sregex' syntax. Read +;; Currently `re-builder' understands three different forms of input, +;; namely `read', `string', and `rx' syntax. Read ;; syntax and string syntax are both delimited by `"'s and behave ;; according to their name. With the `string' syntax there's no need ;; to escape the backslashes and double quotes simplifying the editing @@ -75,7 +75,7 @@ ;; When editing a symbolic regular expression, only the first ;; expression in the RE Builder buffer is considered, which helps ;; limiting the extent of the expression like the `"'s do for the text -;; modes. For the `sregex' syntax the function `sregex' is applied to +;; modes. 
For the `rx' syntax the function `rx-to-string' is applied to ;; the evaluated expression read. So you can use quoted arguments ;; with something like '("findme") or you can construct arguments to ;; your hearts delight with a valid ELisp expression. (The compiled @@ -126,11 +126,10 @@ (defcustom reb-re-syntax 'read "Syntax for the REs in the RE Builder. -Can either be `read', `string', `sregex', or `rx'." +Can either be `read', `string', or `rx'." :group 're-builder :type '(choice (const :tag "Read syntax" read) (const :tag "String syntax" string) - (const :tag "`sregex' syntax" sregex) (const :tag "`rx' syntax" rx))) (defcustom reb-auto-match-limit 200 @@ -279,10 +278,8 @@ Except for Lisp syntax this is the same as `reb-regexp'.") emacs-lisp-mode "RE Builder Lisp" "Major mode for interactively building symbolic Regular Expressions." ;; Pull in packages as needed - (cond ((eq reb-re-syntax 'sregex) ; sregex is not autoloaded - (require 'sregex)) ; right now.. - ((eq reb-re-syntax 'rx) ; rx-to-string is autoloaded - (require 'rx))) ; require rx anyway + (cond ((memq reb-re-syntax '(sregex rx)) ; rx-to-string is autoloaded + (require 'rx))) ; require rx anyway (reb-mode-common)) ;; Use the same "\C-c" keymap as `reb-mode' and use font-locking from @@ -612,9 +609,7 @@ optional fourth argument FORCE is non-nil." (defun reb-cook-regexp (re) "Return RE after processing it according to `reb-re-syntax'." - (cond ((eq reb-re-syntax 'sregex) - (apply 'sregex (eval (car (read-from-string re))))) - ((eq reb-re-syntax 'rx) + (cond ((memq reb-re-syntax '(sregex rx)) (rx-to-string (eval (car (read-from-string re))))) (t re))) diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 522d452c2dc..b3b88c3ce4f 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -120,19 +120,17 @@ (nonl . not-newline) ; SRE (anything . (rx-anything 0 nil)) (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE + (any . ".") ; sregex (in . any) (char . 
any) ; sregex (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex (not . (rx-not 1 1 rx-check-not)) - ;; Partially consistent with sregex, whose `repeat' is like our - ;; `**'. (`repeat' with optional max arg and multiple sexp forms - ;; is ambiguous.) - (repeat . (rx-repeat 2 3)) + (repeat . (rx-repeat 2 nil)) (= . (rx-= 2 nil)) ; SRE (>= . (rx->= 2 nil)) ; SRE (** . (rx-** 2 nil)) ; SRE (submatch . (rx-submatch 1 nil)) ; SRE - (group . submatch) + (group . submatch) ; sregex (zero-or-more . (rx-kleene 1 nil)) (one-or-more . (rx-kleene 1 nil)) (zero-or-one . (rx-kleene 1 nil)) @@ -175,6 +173,7 @@ (category . (rx-category 1 1 rx-check-category)) (eval . (rx-eval 1 1)) (regexp . (rx-regexp 1 1 stringp)) + (regex . regexp) ; sregex (digit . "[[:digit:]]") (numeric . digit) ; SRE (num . digit) ; SRE @@ -295,15 +294,27 @@ regular expression strings.") `zero-or-more', and `one-or-more'. Dynamically bound.") -(defun rx-info (op) +(defun rx-info (op head) "Return parsing/code generation info for OP. If OP is the space character ASCII 32, return info for the symbol `?'. If OP is the character `?', return info for the symbol `??'. -See also `rx-constituents'." +See also `rx-constituents'. +If HEAD is non-nil, then OP is the head of a sexp, otherwise it's +a standalone symbol." (cond ((eq op ? ) (setq op '\?)) ((eq op ??) (setq op '\??))) - (while (and (not (null op)) (symbolp op)) - (setq op (cdr (assq op rx-constituents)))) + (let (old-op) + (while (and (not (null op)) (symbolp op)) + (setq old-op op) + (setq op (cdr (assq op rx-constituents))) + (when (if head (stringp op) (consp op)) + ;; We found something but of the wrong kind. Let's look for an + ;; alternate definition for the other case. + (let ((new-op + (cdr (assq old-op (cdr (memq (assq old-op rx-constituents) + rx-constituents)))))) + (if (and new-op (not (if head (stringp new-op) (consp new-op)))) + (setq op new-op)))))) op) @@ -311,7 +322,7 @@ See also `rx-constituents'." 
"Check FORM according to its car's parsing info." (unless (listp form) (error "rx `%s' needs argument(s)" form)) - (let* ((rx (rx-info (car form))) + (let* ((rx (rx-info (car form) 'head)) (nargs (1- (length form))) (min-args (nth 1 rx)) (max-args (nth 2 rx)) @@ -643,14 +654,17 @@ If SKIP is non-nil, allow that number of items after the head, i.e. (defun rx-** (form) "Parse and produce code from FORM `(** N M ...)'." (rx-check form) - (setq form (cons 'repeat (cdr (rx-trans-forms form 2)))) - (rx-form form '*)) + (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*)) (defun rx-repeat (form) "Parse and produce code from FORM. -FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'." +FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'." (rx-check form) + (if (> (length form) 4) + (setq form (rx-trans-forms form 2))) + (if (null (nth 2 form)) + (setq form (list* (nth 0 form) (nth 1 form) (nthcdr 3 form)))) (cond ((= (length form) 3) (unless (and (integerp (nth 1 form)) (> (nth 1 form) 0)) @@ -749,15 +763,18 @@ of all atomic regexps." "Parse and produce code from FORM, which is `(syntax SYMBOL)'." (rx-check form) (let* ((sym (cadr form)) - (syntax (assq sym rx-syntax))) + (syntax (cdr (assq sym rx-syntax)))) (unless syntax ;; Try sregex compatibility. - (let ((name (symbol-name sym))) - (if (= 1 (length name)) - (setq syntax (rassq (aref name 0) rx-syntax)))) + (cond + ((character sym) (setq syntax sym)) + ((symbolp sym) + (let ((name (symbol-name sym))) + (if (= 1 (length name)) + (setq syntax (aref name 0)))))) (unless syntax - (error "Unknown rx syntax `%s'" (cadr form)))) - (format "\\s%c" (cdr syntax)))) + (error "Unknown rx syntax `%s'" sym))) + (format "\\s%c" syntax))) (defun rx-check-category (form) @@ -811,7 +828,7 @@ shy groups around the result and some more in other functions." 
(cond ((integerp form) (regexp-quote (char-to-string form))) ((symbolp form) - (let ((info (rx-info form))) + (let ((info (rx-info form nil))) (cond ((stringp info) info) ((null info) @@ -819,7 +836,7 @@ shy groups around the result and some more in other functions." (t (funcall (nth 0 info) form))))) ((consp form) - (let ((info (rx-info (car form)))) + (let ((info (rx-info (car form) 'head))) (unless (consp info) (error "Unknown rx form `%s'" (car form))) (funcall (nth 0 info) form))) diff --git a/lisp/emacs-lisp/sregex.el b/lisp/emacs-lisp/sregex.el deleted file mode 100644 index f5e3aac231c..00000000000 --- a/lisp/emacs-lisp/sregex.el +++ /dev/null @@ -1,608 +0,0 @@ -;;; sregex.el --- symbolic regular expressions - -;; Copyright (C) 1997, 1998, 2000, 2001, 2002, 2003, 2004, -;; 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. - -;; Author: Bob Glickstein -;; Maintainer: Bob Glickstein -;; Keywords: extensions - -;; This file is part of GNU Emacs. - -;; GNU Emacs is free software: you can redistribute it and/or modify -;; it under the terms of the GNU General Public License as published by -;; the Free Software Foundation, either version 3 of the License, or -;; (at your option) any later version. - -;; GNU Emacs is distributed in the hope that it will be useful, -;; but WITHOUT ANY WARRANTY; without even the implied warranty of -;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;; GNU General Public License for more details. - -;; You should have received a copy of the GNU General Public License -;; along with GNU Emacs. If not, see . - -;;; Commentary: - -;; This package allows you to write regular expressions using a -;; totally new, Lisp-like syntax. - -;; A "symbolic regular expression" (sregex for short) is a Lisp form -;; that, when evaluated, produces the string form of the specified -;; regular expression. 
Here's a simple example: - -;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert" - -;; As you can see, an sregex is specified by placing one or more -;; special clauses in a call to `sregexq'. The clause in this case is -;; the `or' of two strings (not to be confused with the Lisp function -;; `or'). The list of allowable clauses appears below. - -;; With sregex, it is never necessary to "escape" magic characters -;; that are meant to be taken literally; that happens automatically. -;; For example: - -;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H" - -;; It is also unnecessary to "group" parts of the expression together -;; to overcome operator precedence; that also happens automatically. -;; For example: - -;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?" - -;; It *is* possible to group parts of the expression in order to refer -;; to them with numbered backreferences: - -;; (sregexq (group (or "Go" "Run")) -;; ", Spot, " -;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1" - -;; `sregexq' is a macro. Each time it is used, it constructs a simple -;; Lisp expression that then invokes a moderately complex engine to -;; interpret the sregex and render the string form. Because of this, -;; I don't recommend sprinkling calls to `sregexq' throughout your -;; code, the way one normally does with string regexes (which are -;; cheap to evaluate). Instead, it's wiser to precompute the regexes -;; you need wherever possible instead of repeatedly constructing the -;; same ones over and over. Example: - -;; (let ((field-regex (sregexq (opt "resent-") -;; (or "to" "cc" "bcc")))) -;; ... -;; (while ... -;; ... -;; (re-search-forward field-regex ...) -;; ...)) - -;; The arguments to `sregexq' are automatically quoted, but the -;; flipside of this is that it is not straightforward to include -;; computed (i.e., non-constant) values in `sregexq' expressions. So -;; `sregex' is a function that is like `sregexq' but which does not -;; automatically quote its values. 
Literal sregex clauses must be -;; explicitly quoted like so: - -;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert" - -;; but computed clauses can be included easily, allowing for the reuse -;; of common clauses: - -;; (let ((dotstar '(0+ any)) -;; (whitespace '(1+ (syntax ?-))) -;; (digits '(1+ (char (?0 . ?9))))) -;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+" - -;; To use this package in a Lisp program, simply (require 'sregex). - -;; Here are the clauses allowed in an `sregex' or `sregexq' -;; expression: - -;; - a string -;; This stands for the literal string. If it contains -;; metacharacters, they will be escaped in the resulting regex -;; (using `regexp-quote'). - -;; - the symbol `any' -;; This stands for ".", a regex matching any character except -;; newline. - -;; - the symbol `bol' -;; Stands for "^", matching the empty string at the beginning of a line - -;; - the symbol `eol' -;; Stands for "$", matching the empty string at the end of a line - -;; - (group CLAUSE ...) -;; Groups the given CLAUSEs using "\\(" and "\\)". - -;; - (sequence CLAUSE ...) - -;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)". -;; Clauses grouped by `sequence' do not count for purposes of -;; numbering backreferences. Use `sequence' in situations like -;; this: - -;; (sregexq (or "dog" "cat" -;; (sequence (opt "sea ") "monkey"))) -;; => "dog\\|cat\\|\\(?:sea \\)?monkey" - -;; where a single `or' alternate needs to contain multiple -;; subclauses. - -;; - (backref N) -;; Matches the same string previously matched by the Nth "group" in -;; the same sregex. N is a positive integer. - -;; - (or CLAUSE ...) -;; Matches any one of the CLAUSEs by separating them with "\\|". - -;; - (0+ CLAUSE ...) -;; Concatenates the given CLAUSEs and matches zero or more -;; occurrences by appending "*". - -;; - (1+ CLAUSE ...) -;; Concatenates the given CLAUSEs and matches one or more -;; occurrences by appending "+". - -;; - (opt CLAUSE ...) 
-;; Concatenates the given CLAUSEs and matches zero or one occurrence -;; by appending "?". - -;; - (repeat MIN MAX CLAUSE ...) -;; Concatenates the given CLAUSEs and constructs a regex matching at -;; least MIN occurrences and at most MAX occurrences. MIN must be a -;; non-negative integer. MAX must be a non-negative integer greater -;; than or equal to MIN; or MAX can be nil to mean "infinity." - -;; - (char CHAR-CLAUSE ...) -;; Creates a "character class" matching one character from the given -;; set. See below for how to construct a CHAR-CLAUSE. - -;; - (not-char CHAR-CLAUSE ...) -;; Creates a "character class" matching any one character not in the -;; given set. See below for how to construct a CHAR-CLAUSE. - -;; - the symbol `bot' -;; Stands for "\\`", matching the empty string at the beginning of -;; text (beginning of a string or of a buffer). - -;; - the symbol `eot' -;; Stands for "\\'", matching the empty string at the end of text. - -;; - the symbol `point' -;; Stands for "\\=", matching the empty string at point. - -;; - the symbol `word-boundary' -;; Stands for "\\b", matching the empty string at the beginning or -;; end of a word. - -;; - the symbol `not-word-boundary' -;; Stands for "\\B", matching the empty string not at the beginning -;; or end of a word. - -;; - the symbol `bow' -;; Stands for "\\<", matching the empty string at the beginning of a -;; word. - -;; - the symbol `eow' -;; Stands for "\\>", matching the empty string at the end of a word. - -;; - the symbol `wordchar' -;; Stands for the regex "\\w", matching a word-constituent character -;; (as determined by the current syntax table) - -;; - the symbol `not-wordchar' -;; Stands for the regex "\\W", matching a non-word-constituent -;; character. - -;; - (syntax CODE) -;; Stands for the regex "\\sCODE", where CODE is a syntax table code -;; (a single character). Matches any character with the requested -;; syntax. 
- -;; - (not-syntax CODE) -;; Stands for the regex "\\SCODE", where CODE is a syntax table code -;; (a single character). Matches any character without the -;; requested syntax. - -;; - (regex REGEX) -;; This is a "trapdoor" for including ordinary regular expression -;; strings in the result. Some regular expressions are clearer when -;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for -;; instance. However, see the note under "Bugs," below. - -;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) -;; has one of the following forms: - -;; - a character -;; Adds that character to the set. - -;; - a string -;; Adds all the characters in the string to the set. - -;; - A pair (MIN . MAX) -;; Where MIN and MAX are characters, adds the range of characters -;; from MIN through MAX to the set. - -;;; To do: - -;; An earlier version of this package could optionally translate the -;; symbolic regex into other languages' syntaxes, e.g. Perl. For -;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would -;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore -;; such a facility. - -;; - handle multibyte chars in sregex--char-aux -;; - add support for character classes ([:blank:], ...) -;; - add support for non-greedy operators *? and +? -;; - bug: (sregexq (opt (opt ?a))) returns "a??" which is a non-greedy "a?" 
- -;;; Bugs: - -;;; Code: - -(eval-when-compile (require 'cl)) - -;; Compatibility code for when we didn't have shy-groups -(defvar sregex--current-sregex nil) -(defun sregex-info () nil) -(defmacro sregex-save-match-data (&rest forms) (cons 'save-match-data forms)) -(defun sregex-replace-match (r &optional f l str subexp x) - (replace-match r f l str subexp)) -(defun sregex-match-string (c &optional i x) (match-string c i)) -(defun sregex-match-string-no-properties (count &optional in-string sregex) - (match-string-no-properties count in-string)) -(defun sregex-match-beginning (count &optional sregex) (match-beginning count)) -(defun sregex-match-end (count &optional sregex) (match-end count)) -(defun sregex-match-data (&optional sregex) (match-data)) -(defun sregex-backref-num (n &optional sregex) n) - - -(defun sregex (&rest exps) - "Symbolic regular expression interpreter. -This is exactly like `sregexq' (q.v.) except that it evaluates all its -arguments, so literal sregex clauses must be quoted. For example: - - (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" - -An argument-evaluating sregex interpreter lets you reuse sregex -subexpressions: - - (let ((dotstar '(0+ any)) - (whitespace '(1+ (syntax ?-))) - (digits '(1+ (char (?0 . ?9))))) - (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"" - (sregex--sequence exps nil)) - -(defmacro sregexq (&rest exps) - "Symbolic regular expression interpreter. -This macro allows you to specify a regular expression (regexp) in -symbolic form, and converts it into the string form required by Emacs's -regex functions such as `re-search-forward' and `looking-at'. Here is -a simple example: - - (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" - -As you can see, an sregex is specified by placing one or more special -clauses in a call to `sregexq'. The clause in this case is the `or' -of two strings (not to be confused with the Lisp function `or'). The -list of allowable clauses appears below. 
- -With `sregex', it is never necessary to \"escape\" magic characters -that are meant to be taken literally; that happens automatically. -For example: - - (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\" - -It is also unnecessary to \"group\" parts of the expression together -to overcome operator precedence; that also happens automatically. -For example: - - (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\" - -It *is* possible to group parts of the expression in order to refer -to them with numbered backreferences: - - (sregexq (group (or \"Go\" \"Run\")) - \", Spot, \" - (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\" - -If `sregexq' needs to introduce its own grouping parentheses, it will -automatically renumber your backreferences: - - (sregexq (opt \"resent-\") - (group (or \"to\" \"cc\" \"bcc\")) - \": \" - (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\" - -`sregexq' is a macro. Each time it is used, it constructs a simple -Lisp expression that then invokes a moderately complex engine to -interpret the sregex and render the string form. Because of this, I -don't recommend sprinkling calls to `sregexq' throughout your code, -the way one normally does with string regexes (which are cheap to -evaluate). Instead, it's wiser to precompute the regexes you need -wherever possible instead of repeatedly constructing the same ones -over and over. Example: - - (let ((field-regex (sregexq (opt \"resent-\") - (or \"to\" \"cc\" \"bcc\")))) - ... - (while ... - ... - (re-search-forward field-regex ...) - ...)) - -The arguments to `sregexq' are automatically quoted, but the -flipside of this is that it is not straightforward to include -computed (i.e., non-constant) values in `sregexq' expressions. So -`sregex' is a function that is like `sregexq' but which does not -automatically quote its values. 
Literal sregex clauses must be -explicitly quoted like so: - - (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" - -but computed clauses can be included easily, allowing for the reuse -of common clauses: - - (let ((dotstar '(0+ any)) - (whitespace '(1+ (syntax ?-))) - (digits '(1+ (char (?0 . ?9))))) - (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\" - -Here are the clauses allowed in an `sregex' or `sregexq' expression: - -- a string - This stands for the literal string. If it contains - metacharacters, they will be escaped in the resulting regex - (using `regexp-quote'). - -- the symbol `any' - This stands for \".\", a regex matching any character except - newline. - -- the symbol `bol' - Stands for \"^\", matching the empty string at the beginning of a line - -- the symbol `eol' - Stands for \"$\", matching the empty string at the end of a line - -- (group CLAUSE ...) - Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\". - -- (sequence CLAUSE ...) - - Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\". - Clauses grouped by `sequence' do not count for purposes of - numbering backreferences. Use `sequence' in situations like - this: - - (sregexq (or \"dog\" \"cat\" - (sequence (opt \"sea \") \"monkey\"))) - => \"dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\" - - where a single `or' alternate needs to contain multiple - subclauses. - -- (backref N) - Matches the same string previously matched by the Nth \"group\" in - the same sregex. N is a positive integer. - -- (or CLAUSE ...) - Matches any one of the CLAUSEs by separating them with \"\\\\|\". - -- (0+ CLAUSE ...) - Concatenates the given CLAUSEs and matches zero or more - occurrences by appending \"*\". - -- (1+ CLAUSE ...) - Concatenates the given CLAUSEs and matches one or more - occurrences by appending \"+\". - -- (opt CLAUSE ...) - Concatenates the given CLAUSEs and matches zero or one occurrence - by appending \"?\". - -- (repeat MIN MAX CLAUSE ...) 
- Concatenates the given CLAUSEs and constructs a regex matching at - least MIN occurrences and at most MAX occurrences. MIN must be a - non-negative integer. MAX must be a non-negative integer greater - than or equal to MIN; or MAX can be nil to mean \"infinity.\" - -- (char CHAR-CLAUSE ...) - Creates a \"character class\" matching one character from the given - set. See below for how to construct a CHAR-CLAUSE. - -- (not-char CHAR-CLAUSE ...) - Creates a \"character class\" matching any one character not in the - given set. See below for how to construct a CHAR-CLAUSE. - -- the symbol `bot' - Stands for \"\\\\`\", matching the empty string at the beginning of - text (beginning of a string or of a buffer). - -- the symbol `eot' - Stands for \"\\\\'\", matching the empty string at the end of text. - -- the symbol `point' - Stands for \"\\\\=\\=\", matching the empty string at point. - -- the symbol `word-boundary' - Stands for \"\\\\b\", matching the empty string at the beginning or - end of a word. - -- the symbol `not-word-boundary' - Stands for \"\\\\B\", matching the empty string not at the beginning - or end of a word. - -- the symbol `bow' - Stands for \"\\\\=\\<\", matching the empty string at the beginning of a - word. - -- the symbol `eow' - Stands for \"\\\\=\\>\", matching the empty string at the end of a word. - -- the symbol `wordchar' - Stands for the regex \"\\\\w\", matching a word-constituent character - (as determined by the current syntax table) - -- the symbol `not-wordchar' - Stands for the regex \"\\\\W\", matching a non-word-constituent - character. - -- (syntax CODE) - Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code - (a single character). Matches any character with the requested - syntax. - -- (not-syntax CODE) - Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code - (a single character). Matches any character without the - requested syntax. 
- -- (regex REGEX) - This is a \"trapdoor\" for including ordinary regular expression - strings in the result. Some regular expressions are clearer when - written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for - instance. - -Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) -has one of the following forms: - -- a character - Adds that character to the set. - -- a string - Adds all the characters in the string to the set. - -- A pair (MIN . MAX) - Where MIN and MAX are characters, adds the range of characters - from MIN through MAX to the set." - `(apply 'sregex ',exps)) - -(defun sregex--engine (exp combine) - (cond - ((stringp exp) - (if (and combine - (eq combine 'suffix) - (/= (length exp) 1)) - (concat "\\(?:" (regexp-quote exp) "\\)") - (regexp-quote exp))) - ((symbolp exp) - (ecase exp - (any ".") - (bol "^") - (eol "$") - (wordchar "\\w") - (not-wordchar "\\W") - (bot "\\`") - (eot "\\'") - (point "\\=") - (word-boundary "\\b") - (not-word-boundary "\\B") - (bow "\\<") - (eow "\\>"))) - ((consp exp) - (funcall (intern (concat "sregex--" - (symbol-name (car exp)))) - (cdr exp) - combine)) - (t (error "Invalid expression: %s" exp)))) - -(defun sregex--sequence (exps combine) - (if (= (length exps) 1) (sregex--engine (car exps) combine) - (let ((re (mapconcat - (lambda (e) (sregex--engine e 'concat)) - exps ""))) - (if (eq combine 'suffix) - (concat "\\(?:" re "\\)") - re)))) - -(defun sregex--or (exps combine) - (if (= (length exps) 1) (sregex--engine (car exps) combine) - (let ((re (mapconcat - (lambda (e) (sregex--engine e 'or)) - exps "\\|"))) - (if (not (eq combine 'or)) - (concat "\\(?:" re "\\)") - re)))) - -(defun sregex--group (exps combine) (concat "\\(" (sregex--sequence exps nil) "\\)")) - -(defun sregex--backref (exps combine) (concat "\\" (int-to-string (car exps)))) -(defun sregex--opt (exps combine) (concat (sregex--sequence exps 'suffix) "?")) -(defun sregex--0+ (exps combine) (concat (sregex--sequence exps 'suffix) 
"*")) -(defun sregex--1+ (exps combine) (concat (sregex--sequence exps 'suffix) "+")) - -(defun sregex--char (exps combine) (sregex--char-aux nil exps)) -(defun sregex--not-char (exps combine) (sregex--char-aux t exps)) - -(defun sregex--syntax (exps combine) (format "\\s%c" (car exps))) -(defun sregex--not-syntax (exps combine) (format "\\S%c" (car exps))) - -(defun sregex--regex (exps combine) - (if combine (concat "\\(?:" (car exps) "\\)") (car exps))) - -(defun sregex--repeat (exps combine) - (let* ((min (or (pop exps) 0)) - (minstr (number-to-string min)) - (max (pop exps))) - (concat (sregex--sequence exps 'suffix) - (concat "\\{" minstr "," - (when max (number-to-string max)) "\\}")))) - -(defun sregex--char-range (start end) - (let ((startc (char-to-string start)) - (endc (char-to-string end))) - (cond - ((> end (+ start 2)) (concat startc "-" endc)) - ((> end (+ start 1)) (concat startc (char-to-string (1+ start)) endc)) - ((> end start) (concat startc endc)) - (t startc)))) - -(defun sregex--char-aux (complement args) - ;; regex-opt does the same, we should join effort. - (let ((chars (make-bool-vector 256 nil))) ; Yeah, right! 
- (dolist (arg args) - (cond ((integerp arg) (aset chars arg t)) - ((stringp arg) (mapc (lambda (c) (aset chars c t)) arg)) - ((consp arg) - (let ((start (car arg)) - (end (cdr arg))) - (when (> start end) - (let ((tmp start)) (setq start end) (setq end tmp))) - ;; now start <= end - (let ((i start)) - (while (<= i end) - (aset chars i t) - (setq i (1+ i)))))))) - ;; now chars is a map of the characters in the class - (let ((caret (aref chars ?^)) - (dash (aref chars ?-)) - (class (if (aref chars ?\]) "]" ""))) - (aset chars ?^ nil) - (aset chars ?- nil) - (aset chars ?\] nil) - - (let (start end) - (dotimes (i 256) - (if (aref chars i) - (progn - (unless start (setq start i)) - (setq end i) - (aset chars i nil)) - (when start - (setq class (concat class (sregex--char-range start end))) - (setq start nil)))) - (if start - (setq class (concat class (sregex--char-range start end))))) - - (if (> (length class) 0) - (setq class (concat class (if caret "^") (if dash "-"))) - (setq class (concat class (if dash "-") (if caret "^")))) - (if (and (not complement) (= (length class) 1)) - (regexp-quote class) - (concat "[" (if complement "^") class "]"))))) - -(provide 'sregex) - -;; arch-tag: 460c1f5a-eb6e-42ec-a451-ffac78bdf492 -;;; sregex.el ends here diff --git a/lisp/obsolete/sregex.el b/lisp/obsolete/sregex.el new file mode 100644 index 00000000000..ef4700c15f8 --- /dev/null +++ b/lisp/obsolete/sregex.el @@ -0,0 +1,609 @@ +;;; sregex.el --- symbolic regular expressions + +;; Copyright (C) 1997, 1998, 2000, 2001, 2002, 2003, 2004, +;; 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. + +;; Author: Bob Glickstein +;; Maintainer: Bob Glickstein +;; Keywords: extensions +;; Obsolete-since: 24.1 + +;; This file is part of GNU Emacs. 
+ +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see . + +;;; Commentary: + +;; This package allows you to write regular expressions using a +;; totally new, Lisp-like syntax. + +;; A "symbolic regular expression" (sregex for short) is a Lisp form +;; that, when evaluated, produces the string form of the specified +;; regular expression. Here's a simple example: + +;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert" + +;; As you can see, an sregex is specified by placing one or more +;; special clauses in a call to `sregexq'. The clause in this case is +;; the `or' of two strings (not to be confused with the Lisp function +;; `or'). The list of allowable clauses appears below. + +;; With sregex, it is never necessary to "escape" magic characters +;; that are meant to be taken literally; that happens automatically. +;; For example: + +;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H" + +;; It is also unnecessary to "group" parts of the expression together +;; to overcome operator precedence; that also happens automatically. +;; For example: + +;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?" + +;; It *is* possible to group parts of the expression in order to refer +;; to them with numbered backreferences: + +;; (sregexq (group (or "Go" "Run")) +;; ", Spot, " +;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1" + +;; `sregexq' is a macro. 
Each time it is used, it constructs a simple +;; Lisp expression that then invokes a moderately complex engine to +;; interpret the sregex and render the string form. Because of this, +;; I don't recommend sprinkling calls to `sregexq' throughout your +;; code, the way one normally does with string regexes (which are +;; cheap to evaluate). Instead, it's wiser to precompute the regexes +;; you need wherever possible instead of repeatedly constructing the +;; same ones over and over. Example: + +;; (let ((field-regex (sregexq (opt "resent-") +;; (or "to" "cc" "bcc")))) +;; ... +;; (while ... +;; ... +;; (re-search-forward field-regex ...) +;; ...)) + +;; The arguments to `sregexq' are automatically quoted, but the +;; flipside of this is that it is not straightforward to include +;; computed (i.e., non-constant) values in `sregexq' expressions. So +;; `sregex' is a function that is like `sregexq' but which does not +;; automatically quote its values. Literal sregex clauses must be +;; explicitly quoted like so: + +;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert" + +;; but computed clauses can be included easily, allowing for the reuse +;; of common clauses: + +;; (let ((dotstar '(0+ any)) +;; (whitespace '(1+ (syntax ?-))) +;; (digits '(1+ (char (?0 . ?9))))) +;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+" + +;; To use this package in a Lisp program, simply (require 'sregex). + +;; Here are the clauses allowed in an `sregex' or `sregexq' +;; expression: + +;; - a string +;; This stands for the literal string. If it contains +;; metacharacters, they will be escaped in the resulting regex +;; (using `regexp-quote'). + +;; - the symbol `any' +;; This stands for ".", a regex matching any character except +;; newline. + +;; - the symbol `bol' +;; Stands for "^", matching the empty string at the beginning of a line + +;; - the symbol `eol' +;; Stands for "$", matching the empty string at the end of a line + +;; - (group CLAUSE ...) 
+;; Groups the given CLAUSEs using "\\(" and "\\)". + +;; - (sequence CLAUSE ...) + +;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)". +;; Clauses grouped by `sequence' do not count for purposes of +;; numbering backreferences. Use `sequence' in situations like +;; this: + +;; (sregexq (or "dog" "cat" +;; (sequence (opt "sea ") "monkey"))) +;; => "dog\\|cat\\|\\(?:sea \\)?monkey" + +;; where a single `or' alternate needs to contain multiple +;; subclauses. + +;; - (backref N) +;; Matches the same string previously matched by the Nth "group" in +;; the same sregex. N is a positive integer. + +;; - (or CLAUSE ...) +;; Matches any one of the CLAUSEs by separating them with "\\|". + +;; - (0+ CLAUSE ...) +;; Concatenates the given CLAUSEs and matches zero or more +;; occurrences by appending "*". + +;; - (1+ CLAUSE ...) +;; Concatenates the given CLAUSEs and matches one or more +;; occurrences by appending "+". + +;; - (opt CLAUSE ...) +;; Concatenates the given CLAUSEs and matches zero or one occurrence +;; by appending "?". + +;; - (repeat MIN MAX CLAUSE ...) +;; Concatenates the given CLAUSEs and constructs a regex matching at +;; least MIN occurrences and at most MAX occurrences. MIN must be a +;; non-negative integer. MAX must be a non-negative integer greater +;; than or equal to MIN; or MAX can be nil to mean "infinity." + +;; - (char CHAR-CLAUSE ...) +;; Creates a "character class" matching one character from the given +;; set. See below for how to construct a CHAR-CLAUSE. + +;; - (not-char CHAR-CLAUSE ...) +;; Creates a "character class" matching any one character not in the +;; given set. See below for how to construct a CHAR-CLAUSE. + +;; - the symbol `bot' +;; Stands for "\\`", matching the empty string at the beginning of +;; text (beginning of a string or of a buffer). + +;; - the symbol `eot' +;; Stands for "\\'", matching the empty string at the end of text. 
+ +;; - the symbol `point' +;; Stands for "\\=", matching the empty string at point. + +;; - the symbol `word-boundary' +;; Stands for "\\b", matching the empty string at the beginning or +;; end of a word. + +;; - the symbol `not-word-boundary' +;; Stands for "\\B", matching the empty string not at the beginning +;; or end of a word. + +;; - the symbol `bow' +;; Stands for "\\<", matching the empty string at the beginning of a +;; word. + +;; - the symbol `eow' +;; Stands for "\\>", matching the empty string at the end of a word. + +;; - the symbol `wordchar' +;; Stands for the regex "\\w", matching a word-constituent character +;; (as determined by the current syntax table) + +;; - the symbol `not-wordchar' +;; Stands for the regex "\\W", matching a non-word-constituent +;; character. + +;; - (syntax CODE) +;; Stands for the regex "\\sCODE", where CODE is a syntax table code +;; (a single character). Matches any character with the requested +;; syntax. + +;; - (not-syntax CODE) +;; Stands for the regex "\\SCODE", where CODE is a syntax table code +;; (a single character). Matches any character without the +;; requested syntax. + +;; - (regex REGEX) +;; This is a "trapdoor" for including ordinary regular expression +;; strings in the result. Some regular expressions are clearer when +;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for +;; instance. However, see the note under "Bugs," below. + +;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) +;; has one of the following forms: + +;; - a character +;; Adds that character to the set. + +;; - a string +;; Adds all the characters in the string to the set. + +;; - A pair (MIN . MAX) +;; Where MIN and MAX are characters, adds the range of characters +;; from MIN through MAX to the set. + +;;; To do: + +;; An earlier version of this package could optionally translate the +;; symbolic regex into other languages' syntaxes, e.g. Perl. 
;; For instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
;; yield "ab|cd" instead of "ab\\|cd".  It might be useful to restore
;; such a facility.

;; - handle multibyte chars in sregex--char-aux
;; - add support for character classes ([:blank:], ...)
;; - add support for non-greedy operators *? and +?

;;; Bugs:

;;; Code:

(eval-when-compile (require 'cl))       ; for `ecase'

;; Compatibility code for when we didn't have shy-groups.
;; These wrappers accept (and ignore) the extra trailing argument that
;; the old API took and simply delegate to the standard match functions.
(defvar sregex--current-sregex nil)

(defun sregex-info ()
  "Compatibility stub; always return nil."
  nil)

(defmacro sregex-save-match-data (&rest forms)
  "Compatibility wrapper: evaluate FORMS inside `save-match-data'."
  (cons 'save-match-data forms))

(defun sregex-replace-match (r &optional f l str subexp x)
  "Compatibility wrapper around `replace-match'.
R, F, L, STR and SUBEXP are passed through unchanged; X is ignored."
  (replace-match r f l str subexp))

(defun sregex-match-string (c &optional i x)
  "Compatibility wrapper around `match-string'.
C and I are passed through unchanged; X is ignored."
  (match-string c i))

(defun sregex-match-string-no-properties (count &optional in-string sregex)
  "Compatibility wrapper around `match-string-no-properties'.
COUNT and IN-STRING are passed through unchanged; SREGEX is ignored."
  (match-string-no-properties count in-string))

(defun sregex-match-beginning (count &optional sregex)
  "Compatibility wrapper around `match-beginning'.
COUNT is passed through unchanged; SREGEX is ignored."
  (match-beginning count))

(defun sregex-match-end (count &optional sregex)
  "Compatibility wrapper around `match-end'.
COUNT is passed through unchanged; SREGEX is ignored."
  (match-end count))

(defun sregex-match-data (&optional sregex)
  "Compatibility wrapper around `match-data'.  SREGEX is ignored."
  (match-data))

(defun sregex-backref-num (n &optional sregex)
  "Compatibility stub: return N unchanged.  SREGEX is ignored."
  n)


(defun sregex (&rest exps)
  "Symbolic regular expression interpreter.
This is exactly like `sregexq' (q.v.) except that it evaluates all its
arguments, so literal sregex clauses must be quoted.  For example:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

An argument-evaluating sregex interpreter lets you reuse sregex
subexpressions:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\""
  (sregex--sequence exps nil))

(defmacro sregexq (&rest exps)
  "Symbolic regular expression interpreter.
This macro allows you to specify a regular expression (regexp) in
symbolic form, and converts it into the string form required by Emacs's
regex functions such as `re-search-forward' and `looking-at'.  Here is
a simple example:

  (sregexq (or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

As you can see, an sregex is specified by placing one or more special
clauses in a call to `sregexq'.  The clause in this case is the `or'
of two strings (not to be confused with the Lisp function `or').  The
list of allowable clauses appears below.

With `sregex', it is never necessary to \"escape\" magic characters
that are meant to be taken literally; that happens automatically.
For example:

  (sregexq \"M*A*S*H\")  =>  \"M\\\\*A\\\\*S\\\\*H\"

It is also unnecessary to \"group\" parts of the expression together
to overcome operator precedence; that also happens automatically.
For example:

  (sregexq (opt (or \"Bob\" \"Robert\")))  =>  \"\\\\(?:Bob\\\\|Robert\\\\)?\"

It *is* possible to group parts of the expression in order to refer
to them with numbered backreferences:

  (sregexq (group (or \"Go\" \"Run\"))
           \", Spot, \"
           (backref 1))  =>  \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"

When `sregexq' needs to introduce its own grouping, it uses shy
groups, which do not count for backreference numbering, so your
backreferences are never renumbered:

  (sregexq (opt \"resent-\")
           (group (or \"to\" \"cc\" \"bcc\"))
           \": \"
           (backref 1))  =>  \"\\\\(?:resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\1\"

`sregexq' is a macro.  Each time it is used, it constructs a simple
Lisp expression that then invokes a moderately complex engine to
interpret the sregex and render the string form.  Because of this, I
don't recommend sprinkling calls to `sregexq' throughout your code,
the way one normally does with string regexes (which are cheap to
evaluate).  Instead, it's wiser to precompute the regexes you need
wherever possible instead of repeatedly constructing the same ones
over and over.  Example:

  (let ((field-regex (sregexq (opt \"resent-\")
                              (or \"to\" \"cc\" \"bcc\"))))
    ...
    (while ...
      ...
      (re-search-forward field-regex ...)
      ...))

The arguments to `sregexq' are automatically quoted, but the
flipside of this is that it is not straightforward to include
computed (i.e., non-constant) values in `sregexq' expressions.  So
`sregex' is a function that is like `sregexq' but which does not
automatically quote its values.  Literal sregex clauses must be
explicitly quoted like so:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

but computed clauses can be included easily, allowing for the reuse
of common clauses:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\"

Here are the clauses allowed in an `sregex' or `sregexq' expression:

- a string
  This stands for the literal string.  If it contains
  metacharacters, they will be escaped in the resulting regex
  (using `regexp-quote').

- the symbol `any'
  This stands for \".\", a regex matching any character except
  newline.

- the symbol `bol'
  Stands for \"^\", matching the empty string at the beginning of a line

- the symbol `eol'
  Stands for \"$\", matching the empty string at the end of a line

- (group CLAUSE ...)
  Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".

- (sequence CLAUSE ...)
  Groups the given CLAUSEs; may or may not use \"\\\\(?:\" and \"\\\\)\".
  Clauses grouped by `sequence' do not count for purposes of
  numbering backreferences.  Use `sequence' in situations like
  this:

    (sregexq (or \"dog\" \"cat\"
                 (sequence (opt \"sea \") \"monkey\")))
                                 =>  \"dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\"

  where a single `or' alternate needs to contain multiple
  subclauses.

- (backref N)
  Matches the same string previously matched by the Nth \"group\" in
  the same sregex.  N is a positive integer.

- (or CLAUSE ...)
  Matches any one of the CLAUSEs by separating them with \"\\\\|\".

- (0+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or more
  occurrences by appending \"*\".

- (1+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches one or more
  occurrences by appending \"+\".

- (opt CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or one occurrence
  by appending \"?\".

- (repeat MIN MAX CLAUSE ...)
  Concatenates the given CLAUSEs and constructs a regex matching at
  least MIN occurrences and at most MAX occurrences.  MIN must be a
  non-negative integer.  MAX must be a non-negative integer greater
  than or equal to MIN; or MAX can be nil to mean \"infinity.\"

- (char CHAR-CLAUSE ...)
  Creates a \"character class\" matching one character from the given
  set.  See below for how to construct a CHAR-CLAUSE.

- (not-char CHAR-CLAUSE ...)
  Creates a \"character class\" matching any one character not in the
  given set.  See below for how to construct a CHAR-CLAUSE.

- the symbol `bot'
  Stands for \"\\\\`\", matching the empty string at the beginning of
  text (beginning of a string or of a buffer).

- the symbol `eot'
  Stands for \"\\\\'\", matching the empty string at the end of text.

- the symbol `point'
  Stands for \"\\\\=\\=\", matching the empty string at point.

- the symbol `word-boundary'
  Stands for \"\\\\b\", matching the empty string at the beginning or
  end of a word.

- the symbol `not-word-boundary'
  Stands for \"\\\\B\", matching the empty string not at the beginning
  or end of a word.

- the symbol `bow'
  Stands for \"\\\\=\\<\", matching the empty string at the beginning of a
  word.

- the symbol `eow'
  Stands for \"\\\\=\\>\", matching the empty string at the end of a word.

- the symbol `wordchar'
  Stands for the regex \"\\\\w\", matching a word-constituent character
  (as determined by the current syntax table)

- the symbol `not-wordchar'
  Stands for the regex \"\\\\W\", matching a non-word-constituent
  character.

- (syntax CODE)
  Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
  (a single character).  Matches any character with the requested
  syntax.

- (not-syntax CODE)
  Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
  (a single character).  Matches any character without the
  requested syntax.

- (regex REGEX)
  This is a \"trapdoor\" for including ordinary regular expression
  strings in the result.  Some regular expressions are clearer when
  written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
  instance.

Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
has one of the following forms:

- a character
  Adds that character to the set.

- a string
  Adds all the characters in the string to the set.

- A pair (MIN . MAX)
  Where MIN and MAX are characters, adds the range of characters
  from MIN through MAX to the set."
  `(apply 'sregex ',exps))

(defun sregex--engine (exp combine)
  "Translate the single sregex clause EXP into a regexp string.
COMBINE tells how the caller will use the result, so that a shy group
is added only where needed:
  nil       the result is used as is (top level, or inside a group);
  `concat'  it is concatenated with sibling regexps;
  `or'      it is one alternative of a \"\\\\|\" alternation;
  `suffix'  a postfix operator will be appended to it.
Signal an error if EXP is not a valid clause."
  (cond
   ((stringp exp)
    ;; A postfix operator only binds to the last character, so a
    ;; string of any other length must be grouped first.
    (if (and (eq combine 'suffix)
             (/= (length exp) 1))
        (concat "\\(?:" (regexp-quote exp) "\\)")
      (regexp-quote exp)))
   ((symbolp exp)
    (ecase exp
      (any ".")
      (bol "^")
      (eol "$")
      (wordchar "\\w")
      (not-wordchar "\\W")
      (bot "\\`")
      (eot "\\'")
      (point "\\=")
      (word-boundary "\\b")
      (not-word-boundary "\\B")
      (bow "\\<")
      (eow "\\>")))
   ((consp exp)
    ;; (NAME ARGS...) is handled by the function `sregex--NAME'.
    ;; `intern-soft' avoids creating a symbol for every misspelled clause.
    (let ((handler (and (symbolp (car exp))
                        (intern-soft (concat "sregex--"
                                             (symbol-name (car exp)))))))
      (unless (and handler (fboundp handler))
        (error "Invalid expression: %s" exp))
      (funcall handler (cdr exp) combine)))
   (t (error "Invalid expression: %s" exp))))

(defun sregex--sequence (exps combine)
  "Return the regexp matching the clauses EXPS one after the other.
A single clause is translated directly with COMBINE.  Several clauses
are concatenated, and wrapped in a shy group when COMBINE is `suffix'."
  (if (= (length exps) 1) (sregex--engine (car exps) combine)
    (let ((re (mapconcat
               (lambda (e) (sregex--engine e 'concat))
               exps "")))
      (if (eq combine 'suffix)
          (concat "\\(?:" re "\\)")
        re))))

(defun sregex--or (exps combine)
  "Return the regexp matching any one of the clauses EXPS.
A single clause is translated directly with COMBINE.  Otherwise the
alternation is wrapped in a shy group, unless COMBINE is nil (nothing
surrounds the result) or `or' (it is spliced into an enclosing
alternation)."
  (if (= (length exps) 1) (sregex--engine (car exps) combine)
    (let ((re (mapconcat
               (lambda (e) (sregex--engine e 'or))
               exps "\\|")))
      ;; "\\|" has the lowest precedence, so grouping is needed as soon
      ;; as the result is concatenated or suffixed, but not when it
      ;; stands alone.
      (if (and combine (not (eq combine 'or)))
          (concat "\\(?:" re "\\)")
        re))))

(defun sregex--group (exps combine)
  "Return the clauses EXPS wrapped in a numbered group.  COMBINE is ignored."
  (concat "\\(" (sregex--sequence exps nil) "\\)"))

(defun sregex--backref (exps combine)
  "Return a backreference to group number (car EXPS).  COMBINE is ignored."
  (concat "\\" (int-to-string (car exps))))

(defun sregex--postfix (exps combine operator)
  "Return the sequence EXPS followed by the postfix string OPERATOR.
When COMBINE is `suffix', another postfix operator is about to be
appended, so the result is wrapped in a shy group.  Otherwise two
stacked operators such as \"a??\" or \"a*?\" would be read as a single
non-greedy operator."
  (let ((re (concat (sregex--sequence exps 'suffix) operator)))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))

(defun sregex--opt (exps combine)
  "Return the regexp matching zero or one occurrence of EXPS.
See `sregex--postfix' for COMBINE."
  (sregex--postfix exps combine "?"))

(defun sregex--0+ (exps combine)
  "Return the regexp matching zero or more occurrences of EXPS.
See `sregex--postfix' for COMBINE."
  (sregex--postfix exps combine "*"))

(defun sregex--1+ (exps combine)
  "Return the regexp matching one or more occurrences of EXPS.
See `sregex--postfix' for COMBINE."
  (sregex--postfix exps combine "+"))

(defun sregex--char (exps combine)
  "Return a character class for the CHAR-CLAUSEs EXPS.  COMBINE is ignored."
  (sregex--char-aux nil exps))

(defun sregex--not-char (exps combine)
  "Return a complemented class for the CHAR-CLAUSEs EXPS.  COMBINE is ignored."
  (sregex--char-aux t exps))

(defun sregex--syntax (exps combine)
  "Return \"\\\\sC\" where C is the character (car EXPS).  COMBINE is ignored."
  (format "\\s%c" (car exps)))

(defun sregex--not-syntax (exps combine)
  "Return \"\\\\SC\" where C is the character (car EXPS).  COMBINE is ignored."
  (format "\\S%c" (car exps)))

(defun sregex--regex (exps combine)
  "Return the raw regexp string (car EXPS).
It is wrapped in a shy group whenever COMBINE is non-nil, since its
contents are opaque."
  (if combine (concat "\\(?:" (car exps) "\\)") (car exps)))

(defun sregex--repeat (exps combine)
  "Return the regexp for a bounded repetition.
EXPS is (MIN MAX CLAUSE...).  A nil MIN is treated as 0 and a nil MAX
leaves the upper bound open.  See `sregex--postfix' for COMBINE."
  (let* ((min (or (pop exps) 0))
         (max (pop exps)))
    (sregex--postfix exps combine
                     (concat "\\{" (number-to-string min) ","
                             (when max (number-to-string max)) "\\}"))))

(defun sregex--char-range (start end)
  "Return the bracket-expression text for the characters START through END.
Runs of up to three characters are spelled out; longer runs are
written as \"START-END\"."
  (let ((startc (char-to-string start))
        (endc (char-to-string end)))
    (cond
     ((> end (+ start 2)) (concat startc "-" endc))
     ((> end (+ start 1)) (concat startc (char-to-string (1+ start)) endc))
     ((> end start) (concat startc endc))
     (t startc))))

(defun sregex--char-aux (complement args)
  "Return a regexp matching one character described by ARGS.
Each element of ARGS is a character, a string, or a pair (MIN . MAX).
If COMPLEMENT is non-nil the class is negated.  A non-complemented
class of exactly one character is returned as a quoted literal
instead of a bracket expression.
Only character codes below 256 are supported."
  ;; regex-opt does the same, we should join effort.
  (let ((chars (make-bool-vector 256 nil))) ; Yeah, right!
    (dolist (arg args)
      (cond ((integerp arg) (aset chars arg t))
            ((stringp arg) (mapc (lambda (c) (aset chars c t)) arg))
            ((consp arg)
             (let ((start (car arg))
                   (end (cdr arg)))
               (when (> start end)
                 (let ((tmp start)) (setq start end) (setq end tmp)))
               ;; now start <= end
               (let ((i start))
                 (while (<= i end)
                   (aset chars i t)
                   (setq i (1+ i))))))))
    ;; now chars is a map of the characters in the class
    ;; "]", "^" and "-" are special inside brackets: "]" must come
    ;; first and "^" must not, so they are pulled out and placed by hand.
    (let ((caret (aref chars ?^))
          (dash (aref chars ?-))
          (class (if (aref chars ?\]) "]" "")))
      (aset chars ?^ nil)
      (aset chars ?- nil)
      (aset chars ?\] nil)

      ;; Collapse consecutive characters into ranges.
      (let (start end)
        (dotimes (i 256)
          (if (aref chars i)
              (progn
                (unless start (setq start i))
                (setq end i)
                (aset chars i nil))
            (when start
              (setq class (concat class (sregex--char-range start end)))
              (setq start nil))))
        (if start
            (setq class (concat class (sregex--char-range start end)))))

      ;; Put "^" first only when nothing else precedes it would be
      ;; wrong, so it goes after the other characters when there are any.
      (if (> (length class) 0)
          (setq class (concat class (if caret "^") (if dash "-")))
        (setq class (concat class (if dash "-") (if caret "^"))))
      (if (and (not complement) (= (length class) 1))
          (regexp-quote class)
        (concat "[" (if complement "^") class "]")))))

(provide 'sregex)

;; arch-tag: 460c1f5a-eb6e-42ec-a451-ffac78bdf492
;;; sregex.el ends here