Add peg.el as a built-in library

author Eric Abrahamsen <eric@ericabrahamsen.net>

Tue, 6 Dec 2022 05:59:03 +0000 (21:59 -0800)

committer Eshel Yaron <me@eshelyaron.com>

Sun, 31 Mar 2024 18:05:29 +0000 (20:05 +0200)
author Eric Abrahamsen <eric@ericabrahamsen.net>
Tue, 6 Dec 2022 05:59:03 +0000 (21:59 -0800)
committer Eshel Yaron <me@eshelyaron.com>
Sun, 31 Mar 2024 18:05:29 +0000 (20:05 +0200)
diff --git a/doc/lispref/Makefile.in b/doc/lispref/Makefile.in

index 0a228271be3731fdf4794b8bfb8afa9a3f4cfd9c..4ceffd7d7d32514ed01726ef845824abb02c2eaa 100644 (file)
--- a/doc/lispref/Makefile.in
+++ b/doc/lispref/Makefile.in
@@ -112,6 +112,7 @@ srcs = \
    $(srcdir)/os.texi \
    $(srcdir)/package.texi \
    $(srcdir)/parsing.texi \
+  $(srcdir)/peg.texi \
    $(srcdir)/positions.texi \
    $(srcdir)/processes.texi \
    $(srcdir)/records.texi \
diff --git a/doc/lispref/elisp.texi b/doc/lispref/elisp.texi

index 71139db43590b1d3240eb293b9dc7a2944c70001..ec93a0b9c8ae7fb69f7488e47a2d623e4c75261b 100644 (file)
--- a/doc/lispref/elisp.texi
+++ b/doc/lispref/elisp.texi
@@ -222,6 +222,7 @@ To view this manual in other formats, click
  * Non-ASCII Characters::    Non-ASCII text in buffers and strings.
  * Searching and Matching::  Searching buffers for strings or regexps.
  * Syntax Tables::           The syntax table controls word and list parsing.
+* Parsing Expression Grammars:: Parsing structured buffer text.
  * Parsing Program Source::  Generate syntax tree for program sources.
  * Abbrevs::                 How Abbrev mode works, and its data structures.
  
@@ -1365,6 +1366,12 @@ Syntax Tables
  * Syntax Table Internals::  How syntax table information is stored.
  * Categories::              Another way of classifying character syntax.
  
+Parsing Expression Grammars
+
+* PEX Definitions::         The syntax of PEX rules
+* Parsing Actions::         Running actions upon successful parsing.
+* Writing PEG Rules::       Tips for writing parsing rules.
+
  Parsing Program Source
  
  * Language Grammar::         Loading tree-sitter language grammar.
@@ -1720,6 +1727,7 @@ Object Internals
  
  @include searching.texi
  @include syntax.texi
+@include peg.texi
  @include parsing.texi
  @include abbrevs.texi
  @include threads.texi
diff --git a/doc/lispref/peg.texi b/doc/lispref/peg.texi

new file mode 100644 (file)

index 0000000..ef4dfa7
--- /dev/null
+++ b/doc/lispref/peg.texi
@@ -0,0 +1,351 @@
+@c -*-texinfo-*-
+@c This is part of the GNU Emacs Lisp Reference Manual.
+@c Copyright (C) 1990--1995, 1998--1999, 2001--2023 Free Software
+@c Foundation, Inc.
+@c See the file elisp.texi for copying conditions.
+@node Parsing Expression Grammars
+@chapter Parsing Expression Grammars
+@cindex text parsing
+@cindex parsing expression grammar
+
+  Emacs Lisp provides several tools for parsing and matching text,
+from regular expressions (@pxref{Regular Expressions}) to full
+@acronym{LL} grammar parsers (@pxref{Top,, Bovine parser
+development,bovine}).  @dfn{Parsing Expression Grammars}
+(@acronym{PEG}) are another approach to text parsing that offer more
+structure and composibility than regular expressions, but less
+complexity than context-free grammars.
+
+A @acronym{PEG} parser is defined as a list of named rules, each of
+which matches text patterns, and/or contains references to other
+rules.  Parsing is initiated with the function @code{peg-run} or the
+macro @code{peg-parse} (see below), and parses text after point in the
+current buffer, using a given set of rules.
+
+@cindex parsing expression
+The definition of each rule is referred to as a @dfn{parsing
+expression} (@acronym{PEX}), and can consist of a literal string, a
+regexp-like character range or set, a peg-specific construct
+resembling an elisp function call, a reference to another rule, or a
+combination of any of these.  A grammar is expressed as a tree of
+rules in which one rule is typically treated as a ``root'' or
+``entry-point'' rule.  For instance:
+
+@example
+@group
+((number sign digit (* digit))
+ (sign   (or "+" "-" ""))
+ (digit  [0-9]))
+@end group
+@end example
+
+Once defined, grammars can be used to parse text after point in the
+current buffer, in the following ways:
+
+@defmac peg-parse &rest pexs
+Match @var{pexs} at point.  If @var{pexs} is a list of PEG rules, the
+first rule is considered the ``entry-point'':
+@end defmac
+
+@example
+@group
+(peg-parse
+  ((number sign digit (* digit))
+   (sign   (or "+" "-" ""))
+   (digit  [0-9])))
+@end group
+@end example
+
+This macro represents the simplest use of the @acronym{PEG} library,
+but also the least flexible, as the rules must be written directly
+into the source code.  A more flexible approach involves use of three
+macros in conjunction: @code{with-peg-rules}, a @code{let}-like
+construct that makes a set of rules available within the macro body;
+@code{peg-run}, which initiates parsing given a single rule; and
+@code{peg}, which is used to wrap the entry-point rule name.  In fact,
+a call to @code{peg-parse} expands to just this set of calls.  The
+above example could be written as:
+
+@example
+@group
+(with-peg-rules
+    ((number sign digit (* digit))
+     (sign   (or "+" "-" ""))
+     (digit  [0-9]))
+  (peg-run (peg number)))
+@end group
+@end example
+
+This allows more explicit control over the ``entry-point'' of parsing,
+and allows the combination of rules from different sources.
+
+Individual rules can also be defined using a more @code{defun}-like
+syntax, using the macro @code{define-peg-rule}:
+
+@example
+(define-peg-rule digit ()
+  [0-9])
+@end example
+
+This also allows for rules that accept an argument (supplied by the
+@code{funcall} PEG rule).
+
+Another possibility is to define a named set of rules with
+@code{define-peg-ruleset}:
+
+@example
+(define-peg-ruleset number-grammar
+        '((number sign digit (* digit))
+          digit  ;; A reference to the definition above.
+          (sign (or "+" "-" ""))))
+@end example
+
+Rules and rulesets defined this way can be referred to by name in
+later calls to @code{peg-run} or @code{with-peg-rules}:
+
+@example
+(with-peg-rules number-grammar
+  (peg-run (peg number)))
+@end example
+
+By default, calls to @code{peg-run} or @code{peg-parse} produce no
+output: parsing simply moves point.  In order to return or otherwise
+act upon parsed strings, rules can include @dfn{actions}, see
+@ref{Parsing Actions}.
+
+@menu
+* PEX Definitions::             The syntax of PEX rules.
+* Parsing Actions::             Running actions upon successful parsing.
+* Writing PEG Rules::           Tips for writing parsing rules.
+@end menu
+
+@node PEX Definitions
+@section PEX Definitions
+
+Parsing expressions can be defined using the following syntax:
+
+@table @code
+@item (and E1 E2 ...)
+A sequence of @acronym{PEX}s that must all be matched.  The @code{and} form is
+optional and implicit.
+
+@item (or E1 E2 ...)
+Prioritized choices, meaning that, as in Elisp, the choices are tried
+in order, and the first successful match is used.  Note that this is
+distinct from context-free grammars, in which selection between
+multiple matches is indeterminate.
+
+@item (any)
+Matches any single character, as the regexp ``.''.
+
+@item @var{string}
+A literal string.
+
+@item (char @var{C})
+A single character @var{C}, as an Elisp character literal.
+
+@item (* @var{E})
+Zero or more instances of expression @var{E}, as the regexp @samp{*}.
+Matching is always ``greedy''.
+
+@item (+ @var{E})
+One or more instances of expression @var{E}, as the regexp @samp{+}.
+Matching is always ``greedy''.
+
+@item (opt @var{E})
+Zero or one instance of expression @var{E}, as the regexp @samp{?}.
+
+@item SYMBOL
+A symbol representing a previously-defined PEG rule.
+
+@item (range CH1 CH2)
+The character range between CH1 and CH2, as the regexp @samp{[CH1-CH2]}.
+
+@item [CH1-CH2 "+*" ?x]
+A character set, which can include ranges, character literals, or
+strings of characters.
+
+@item [ascii cntrl]
+A list of named character classes.
+
+@item (syntax-class @var{NAME})
+A single syntax class.
+
+@item (funcall E ARGS...)
+Call @acronym{PEX} E (previously defined with @code{define-peg-rule})
+with arguments @var{ARGS}.
+
+@item (null)
+The empty string.
+
+@end table
+
+The following expressions are used as anchors or tests -- they do not
+move point, but return a boolean value which can be used to constrain
+matches as a way of controlling the parsing process (@pxref{Writing
+PEG Rules}).
+
+@table @code
+@item (bob)
+Beginning of buffer.
+
+@item (eob)
+End of buffer.
+
+@item (bol)
+Beginning of line.
+
+@item (eol)
+End of line.
+
+@item (bow)
+Beginning of word.
+
+@item (eow)
+End of word.
+
+@item (bos)
+Beginning of symbol.
+
+@item (eos)
+End of symbol.
+
+@item (if E)
+Returns non-@code{nil} if parsing @acronym{PEX} E from point succeeds (point
+is not moved).
+
+@item (not E)
+Returns non-@code{nil} if parsing @acronym{PEX} E from point fails (point
+is not moved).
+
+@item (guard EXP)
+Treats the value of the Lisp expression EXP as a boolean.
+
+@end table
+
+@vindex peg-char-classes
+Character class matching can use the same named character classes as
+in regular expressions (@pxref{Top,, Character Classes,elisp})
+
+@node Parsing Actions
+@section Parsing Actions
+
+@cindex parsing actions
+@cindex parsing stack
+By default the process of parsing simply moves point in the current
+buffer, ultimately returning @code{t} if the parsing succeeds, and
+@code{nil} if it doesn't.  It's also possible to define ``actions''
+that can run arbitrary Elisp at certain points in the parsed text.
+These actions can optionally affect something called the @dfn{parsing
+stack}, which is a list of values returned by the parsing process.
+These actions only run (and only return values) if the parsing process
+ultimately succeeds; if it fails the action code is not run at all.
+
+Actions can be added anywhere in the definition of a rule.  They are
+distinguished from parsing expressions by an initial backquote
+(@samp{`}), followed by a parenthetical form that must contain a pair
+of hyphens (@samp{--}) somewhere within it.  Symbols to the left of
+the hyphens are bound to values popped from the stack (they are
+somewhat analogous to the argument list of a lambda form).  Values
+produced by code to the right are pushed to the stack (analogous to
+the return value of the lambda).  For instance, the previous grammar
+can be augmented with actions to return the parsed number as an actual
+integer:
+
+@example
+(with-peg-rules ((number sign digit (* digit
+                                       `(a b -- (+ (* a 10) b)))
+                         `(sign val -- (* sign val)))
+                 (sign (or (and "+" `(-- 1))
+                           (and "-" `(-- -1))
+                           (and ""  `(-- 1))))
+                 (digit [0-9] `(-- (- (char-before) ?0))))
+  (peg-run (peg number)))
+@end example
+
+There must be values on the stack before they can be popped and
+returned -- if there aren't enough stack values to bind to an action's
+left-hand terms, they will be bound to @code{nil}.  An action with
+only right-hand terms will push values to the stack; an action with
+only left-hand terms will consume (and discard) values from the stack.
+At the end of parsing, stack values are returned as a flat list.
+
+To return the string matched by a @acronym{PEX} (instead of simply
+moving point over it), a rule like this can be used:
+
+@example
+(one-word
+  `(-- (point))
+  (+ [word])
+  `(start -- (buffer-substring start (point))))
+@end example
+
+The first action pushes the initial value of point to the stack.  The
+intervening @acronym{PEX} moves point over the next word.  The second
+action pops the previous value from the stack (binding it to the
+variable @code{start}), and uses that value to extract a substring
+from the buffer and push it to the stack.  This pattern is so common
+that @acronym{PEG} provides a shorthand function that does exactly the
+above, along with a few other shorthands for common scenarios:
+
+@table @code
+@item (substring @var{E})
+Match @acronym{PEX} @var{E} and push the matched string to the stack.
+
+@item (region @var{E})
+Match @var{E} and push the start and end positions of the matched
+region to the stack.
+
+@item (replace @var{E} @var{replacement})
+Match @var{E} and replaced the matched region with the string @var{replacement}.
+
+@item (list @var{E})
+Match @var{E}, collect all values produced by @var{E} (and its
+sub-expressions) into a list, and push that list to the stack.  Stack
+values are typically returned as a flat list; this is a way of
+``grouping'' values together.
+@end table
+
+@node Writing PEG Rules
+@section Writing PEG Rules
+
+Something to be aware of when writing PEG rules is that they are
+greedy.  Rules which can consume a variable amount of text will always
+consume the maximum amount possible, even if that causes a rule that
+might otherwise have matched to fail later on -- there is no
+backtracking.  For instance, this rule will never succeed:
+
+@example
+(forest (+ "tree" (* [blank])) "tree" (eol))
+@end example
+
+The @acronym{PEX} @code{(+ "tree" (* [blank]))} will consume all
+repetitions of the word ``tree'', leaving none to match the final
+@code{"tree"}.
+
+In these situations, the desired result can be obtained by using
+predicates and guards -- namely the @code{not}, @code{if} and
+@code{guard} expressions -- to constrain behavior.  For instance:
+
+@example
+(forest (+ "tree" (* [blank])) (not (eol)) "tree" (eol))
+@end example
+
+The @code{if} and @code{not} operators accept a parsing expression and
+interpret it as a boolean, without moving point.  The contents of a
+@code{guard} operator are evaluated as regular Lisp (not a
+@acronym{PEX}) and should return a boolean value.  A @code{nil} value
+causes the match to fail.
+
+Another potentially unexpected behavior is that parsing will move
+point as far as possible, even if the parsing ultimately fails.  This
+rule:
+
+@example
+(end-game "game" (eob))
+@end example
+
+when run in a buffer containing the text ``game over'' after point,
+will move point to just after ``game'' then halt parsing, returning
+@code{nil}.  Successful parsing will always return @code{t}, or the
+contexts of the parsing stack.
diff --git a/etc/NEWS b/etc/NEWS

index 5bbd4353a8b43c9244264a5c6ea6f74d83bd0462..2ba1268ddad3ebb4917d2f3d923017edd6aeaa4f 100644 (file)
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -1695,6 +1695,14 @@ forwards-compatibility Compat package from GNU ELPA.  This allows
  built-in packages to use the library more effectively, and helps
  preventing the installation of Compat if unnecessary.
  
++++
+** New package PEG.
+Emacs now includes a library for writing (P)arsing (E)xpression
+(G)rammars, an approach to text parsing that provides more structure
+than regular expressions, but less complexity than context-free
+grammars.  The Info manual "(elisp) Parsing Expression Grammars" has
+documentation and examples.
+
  \f
  * Incompatible Lisp Changes in Emacs 30.1
  
diff --git a/lisp/progmodes/peg.el b/lisp/progmodes/peg.el

new file mode 100644 (file)

index 0000000..2eb4a73
--- /dev/null
+++ b/lisp/progmodes/peg.el
@@ -0,0 +1,944 @@
+;;; peg.el --- Parsing Expression Grammars in Emacs Lisp  -*- lexical-binding:t -*-
+
+;; Copyright (C) 2008-2023  Free Software Foundation, Inc.
+;;
+;; Author: Helmut Eller <eller.helmut@gmail.com>
+;; Maintainer: Stefan Monnier <monnier@iro.umontreal.ca>
+;; Version: 1.0.1
+;;
+;; This program is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;;
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with this program.  If not, see <https://www.gnu.org/licenses/>.
+;;
+;;; Commentary:
+;;
+;; This package implements Parsing Expression Grammars for Emacs Lisp.
+
+;; Parsing Expression Grammars (PEG) are a formalism in the spirit of
+;; Context Free Grammars (CFG) with some simplifications which makes
+;; the implementation of PEGs as recursive descent parsers particularly
+;; simple and easy to understand [Ford, Baker].
+;; PEGs are more expressive than regexps and potentially easier to use.
+;;
+;; This file implements the macros `define-peg-rule', `with-peg-rules', and
+;; `peg-parse' which parses the current buffer according to a PEG.
+;; E.g. we can match integers with:
+;;
+;;     (with-peg-rules
+;;         ((number sign digit (* digit))
+;;          (sign   (or "+" "-" ""))
+;;          (digit  [0-9]))
+;;       (peg-run (peg number)))
+;; or
+;;     (define-peg-rule digit ()
+;;       [0-9])
+;;     (peg-parse (number sign digit (* digit))
+;;                (sign   (or "+" "-" "")))
+;;
+;; In contrast to regexps, PEGs allow us to define recursive "rules".
+;; A "grammar" is a set of rules.  A rule is written as (NAME PEX...)
+;; E.g. (sign (or "+" "-" "")) is a rule with the name "sign".
+;; The syntax for PEX (Parsing Expression) is a follows:
+;;
+;;     Description             Lisp            Traditional, as in Ford's paper
+;;     ===========             ====            ===========
+;;     Sequence                        (and E1 E2)     e1 e2
+;;     Prioritized Choice      (or E1 E2)      e1 / e2
+;;     Not-predicate           (not E)         !e
+;;     And-predicate           (if E)          &e
+;;     Any character           (any)           .
+;;     Literal string          "abc"           "abc"
+;;     Character C             (char C)        'c'
+;;     Zero-or-more            (* E)           e*
+;;     One-or-more             (+ E)           e+
+;;     Optional                        (opt E)         e?
+;;     Non-terminal             SYMBOL         A
+;;     Character range         (range A B)     [a-b]
+;;     Character set           [a-b "+*" ?x]   [a-b+*x]   ;Note: it's a vector
+;;     Character classes       [ascii cntrl]
+;;     Boolean-guard           (guard EXP)
+;;     Syntax-Class            (syntax-class NAME)
+;;     Local definitions       (with RULES PEX...)
+;;     Indirect call            (funcall EXP ARGS...)
+;; and
+;;     Empty-string            (null)          ε
+;;     Beginning-of-Buffer     (bob)
+;;     End-of-Buffer           (eob)
+;;     Beginning-of-Line       (bol)
+;;     End-of-Line             (eol)
+;;     Beginning-of-Word       (bow)
+;;     End-of-Word             (eow)
+;;     Beginning-of-Symbol     (bos)
+;;     End-of-Symbol           (eos)
+;;
+;; Rules can refer to other rules, and a grammar is often structured
+;; as a tree, with a root rule referring to one or more "branch
+;; rules", all the way down to the "leaf rules" that deal with actual
+;; buffer text.  Rules can be recursive or mutually referential,
+;; though care must be taken not to create infinite loops.
+;;
+;;;; Named rulesets:
+;;
+;; You can define a set of rules for later use with:
+;;
+;;     (define-peg-ruleset myrules
+;;       (sign  () (or "+" "-" ""))
+;;       (digit () [0-9])
+;;       (nat   () digit (* digit))
+;;       (int   () sign digit (* digit))
+;;       (float () int "." nat))
+;;
+;; and later refer to it:
+;;
+;;     (with-peg-rules
+;;         (myrules
+;;          (complex float "+i" float))
+;;       ... (peg-parse nat "," nat "," complex) ...)
+;;
+;;;; Parsing actions:
+;;
+;; PEXs also support parsing actions, i.e. Lisp snippets which are
+;; executed when a pex matches.  This can be used to construct syntax
+;; trees or for similar tasks.  The most basic form of action is
+;; written as:
+;;
+;;     (action FORM)          ; evaluate FORM for its side-effects
+;;
+;; Actions don't consume input, but are executed at the point of
+;; match.  Another kind of action is called a "stack action", and
+;; looks like this:
+;;
+;;     `(VAR... -- FORM...)   ; stack action
+;;
+;; A stack action takes VARs from the "value stack" and pushes the
+;; results of evaluating FORMs to that stack.
+
+;; The value stack is created during the course of parsing.  Certain
+;; operators (see below) that match buffer text can push values onto
+;; this stack.  "Upstream" rules can then draw values from the stack,
+;; and optionally push new ones back.  For instance, consider this
+;; very simple grammar:
+;;
+;; (with-peg-rules
+;;     ((query (+ term) (eol))
+;;      (term key ":" value (opt (+ [space]))
+;;        `(k v -- (cons (intern k) v)))
+;;      (key (substring (and (not ":") (+ [word]))))
+;;      (value (or string-value number-value))
+;;      (string-value (substring (+ [alpha])))
+;;      (number-value (substring (+ [digit]))
+;;                `(val -- (string-to-number val))))
+;;   (peg-run (peg query)))
+;;
+;; This invocation of `peg-run' would parse this buffer text:
+;;
+;; name:Jane age:30
+;;
+;; And return this Elisp sexp:
+;;
+;; ((age . 30) (name . "Jane"))
+;;
+;; Note that, in complex grammars, some care must be taken to make
+;; sure that the number and type of values drawn from the stack always
+;; match those pushed.  In the example above, both `string-value' and
+;; `number-value' push a single value to the stack.  Since the `value'
+;; rule only includes these two sub-rules, any upstream rule that
+;; makes use of `value' can be confident it will always and only push
+;; a single value to the stack.
+;;
+;; Stack action forms are in a sense analogous to lambda forms: the
+;; symbols before the "--" are the equivalent of lambda arguments,
+;; while the forms after the "--" are return values.  The difference
+;; being that a lambda form can only return a single value, while a
+;; stack action can push multiple values onto the stack.  It's also
+;; perfectly valid to use `(-- FORM...)' or `(VAR... --)': the former
+;; pushes values to the stack without consuming any, and the latter
+;; pops values from the stack and discards them.
+;;
+;;;; Derived Operators:
+;;
+;; The following operators are implemented as combinations of
+;; primitive expressions:
+;;
+;;     (substring E)  ; Match E and push the substring for the matched region.
+;;     (region E)     ; Match E and push the start and end positions.
+;;     (replace E RPL); Match E and replace the matched region with RPL.
+;;     (list E)       ; Match E and push a list of the items that E produced.
+;;
+;; See `peg-ex-parse-int' in `peg-tests.el' for further examples.
+;;
+;; Regexp equivalents:
+;;
+;; Here a some examples for regexps and how those could be written as pex.
+;; [Most are taken from rx.el]
+;;
+;;     "^[a-z]*"
+;;     (and (bol) (* [a-z]))
+;;
+;;     "\n[^ \t]"
+;;     (and "\n" (not [" \t"]) (any))
+;;
+;;     "\\*\\*\\* EOOH \\*\\*\\*\n"
+;;     "*** EOOH ***\n"
+;;
+;;     "\\<\\(catch\\|finally\\)\\>[^_]"
+;;     (and (bow) (or "catch" "finally") (eow) (not "_") (any))
+;;
+;;     "[ \t\n]*:\\([^:]+\\|$\\)"
+;;     (and (* [" \t\n"]) ":" (or (+ (not ":") (any)) (eol)))
+;;
+;;     "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
+;;     (and (bol)
+;;          "content-transfer-encoding:"
+;;          (* (opt "\n") ["\t "])
+;;          "quoted-printable"
+;;          (* (opt "\n") ["\t "]))
+;;
+;;     "\\$[I]d: [^ ]+ \\([^ ]+\\) "
+;;     (and "$Id: " (+ (not " ") (any)) " " (+ (not " ") (any)) " ")
+;;
+;;     "^;;\\s-*\n\\|^\n"
+;;     (or (and (bol) ";;" (* (syntax-class whitespace)) "\n")
+;;         (and (bol) "\n"))
+;;
+;;     "\\\\\\\\\\[\\w+"
+;;     (and "\\\\[" (+ (syntax-class word)))
+;;
+;; See ";;; Examples" in `peg-tests.el' for other examples.
+;;
+;;;; Rule argument and indirect calls:
+;;
+;; Rules can take arguments and those arguments can themselves be PEGs.
+;; For example:
+;;
+;;     (define-peg-rule 2-or-more (peg)
+;;       (funcall peg)
+;;       (funcall peg)
+;;       (* (funcall peg)))
+;;
+;;     ... (peg-parse
+;;          ...
+;;          (2-or-more (peg foo))
+;;          ...
+;;          (2-or-more (peg bar))
+;;          ...)
+;;
+;;;; References:
+;;
+;; [Ford] Bryan Ford. Parsing Expression Grammars: a Recognition-Based
+;; Syntactic Foundation. In POPL'04: Proceedings of the 31st ACM
+;; SIGPLAN-SIGACT symposium on Principles of Programming Languages,
+;; pages 111-122, New York, NY, USA, 2004. ACM Press.
+;; http://pdos.csail.mit.edu/~baford/packrat/
+;;
+;; [Baker] Baker, Henry G. "Pragmatic Parsing in Common Lisp".  ACM Lisp
+;; Pointers 4(2), April--June 1991, pp. 3--15.
+;; http://home.pipeline.com/~hbaker1/Prag-Parse.html
+;;
+;; Roman Redziejowski does good PEG related research
+;; http://www.romanredz.se/pubs.htm
+
+;;;; Todo:
+
+;; - Fix the exponential blowup in `peg-translate-exp'.
+;; - Add a proper debug-spec for PEXs.
+
+;;; News:
+
+;; Since 1.0.1:
+;; - Use OClosures to represent PEG rules when available, and let cl-print
+;;   display their source code.
+;; - New PEX form (with RULES PEX...).
+;; - Named rulesets.
+;; - You can pass arguments to rules.
+;; - New `funcall' rule to call rules indirectly (e.g. a peg you received
+;;   as argument).
+
+;; Version 1.0:
+;; - New official entry points `peg` and `peg-run`.
+
+;;; Code:
+
+(eval-when-compile (require 'cl-lib))
+
+(defvar peg--actions nil
+  "Actions collected along the current parse.
+Used at runtime for backtracking.  It's a list ((POS . THUNK)...).
+Each THUNK is executed at the corresponding POS.  Thunks are
+executed in a postprocessing step, not during parsing.")
+
+(defvar peg--errors nil
+  "Data keeping track of the rightmost parse failure location.
+It's a pair (POSITION . EXPS ...).  POSITION is the buffer position and
+EXPS is a list of rules/expressions that failed.")
+
+;;;; Main entry points
+
+(defmacro peg--when-fboundp (f &rest body)
+  (declare (indent 1) (debug (sexp body)))
+  (when (fboundp f)
+    (macroexp-progn body)))
+
+(peg--when-fboundp oclosure-define
+  (oclosure-define peg-function
+    "Parsing function built from PEG rule."
+    pexs)
+
+  (cl-defmethod cl-print-object ((peg peg-function) stream)
+    (princ "#f<peg " stream)
+    (let ((args (help-function-arglist peg 'preserve-names)))
+      (if args
+          (prin1 args stream)
+        (princ "()" stream)))
+    (princ " " stream)
+    (prin1 (peg-function--pexs peg) stream)
+    (princ ">" stream)))
+
+(defmacro peg--lambda (pexs args &rest body)
+  (declare (indent 2)
+           (debug (&define form lambda-list def-body)))
+  (if (fboundp 'oclosure-lambda)
+      `(oclosure-lambda (peg-function (pexs ,pexs)) ,args . ,body)
+    `(lambda ,args . ,body)))
+
+;; Sometimes (with-peg-rules ... (peg-run (peg ...))) is too
+;; longwinded for the task at hand, so `peg-parse' comes in handy.
+(defmacro peg-parse (&rest pexs)
+  "Match PEXS at point.
+PEXS is a sequence of PEG expressions, implicitly combined with `and'.
+Returns STACK if the match succeed and signals an error on failure,
+moving point along the way.
+PEXS can also be a list of PEG rules, in which case the first rule is used."
+  (if (and (consp (car pexs))
+           (symbolp (caar pexs))
+           (not (ignore-errors (peg-normalize (car pexs)))))
+      ;; `pexs' is a list of rules: use the first rule as entry point.
+      `(with-peg-rules ,pexs (peg-run (peg ,(caar pexs)) #'peg-signal-failure))
+    `(peg-run (peg ,@pexs) #'peg-signal-failure)))
+
+(defmacro peg (&rest pexs)
+  "Return a PEG-matcher that matches PEXS."
+  (pcase (peg-normalize `(and . ,pexs))
+    (`(call ,name) `#',(peg--rule-id name)) ;Optimize this case by η-reduction!
+    (exp `(peg--lambda ',pexs () ,(peg-translate-exp exp)))))
+
+;; There are several "infos we want to return" when parsing a given PEX:
+;; 1- We want to return the success/failure of the parse.
+;; 2- We want to return the data of the successful parse (the stack).
+;; 3- We want to return the diagnostic of the failures.
+;; 4- We want to perform the actions (upon parse success)!
+;; `peg-parse' used an error signal to encode the (1) boolean, which
+;; lets it return all the info conveniently but the error signal was sometimes
+;; inconvenient.  Other times one wants to just know (1) maybe without even
+;; performing (4).
+;; `peg-run' lets you choose all that, and by default gives you
+;; (1) as a simple boolean, while also doing (2), and (4).
+
+(defun peg-run (peg-matcher &optional failure-function success-function)
+  "Parse with PEG-MATCHER at point and run the success/failure function.
+If a match was found, move to the end of the match and call SUCCESS-FUNCTION
+with one argument: a function which will perform all the actions collected
+during the parse and then return the resulting stack (or t if empty).
+If no match was found, move to the (rightmost) point of parse failure and call
+FAILURE-FUNCTION with one argument, which is a list of PEG expressions that
+failed at this point.
+SUCCESS-FUNCTION defaults to `funcall' and FAILURE-FUNCTION
+defaults to `ignore'."
+  (let ((peg--actions '()) (peg--errors '(-1)))
+    (if (funcall peg-matcher)
+        ;; Found a parse: run the actions collected along the way.
+        (funcall (or success-function #'funcall)
+                 (lambda ()
+                   (save-excursion (peg-postprocess peg--actions))))
+      (goto-char (car peg--errors))
+      (when failure-function
+        (funcall failure-function (peg-merge-errors (cdr peg--errors)))))))
+
+(defmacro define-peg-rule (name args &rest pexs)
+  "Define PEG rule NAME as equivalent to PEXS.
+The PEG expressions in PEXS are implicitly combined with the
+sequencing `and' operator of PEG grammars."
+  (declare (indent 1))
+  (let ((inline nil))
+    (while (keywordp (car pexs))
+      (pcase (pop pexs)
+        (:inline (setq inline (car pexs))))
+      (setq pexs (cdr pexs)))
+    (let ((id (peg--rule-id name))
+          (exp (peg-normalize `(and . ,pexs))))
+      `(progn
+         (defalias ',id
+           (peg--lambda ',pexs ,args
+             ,(if inline
+                  ;; Short-circuit to peg--translate in order to skip
+                  ;; the extra failure-recording of `peg-translate-exp'.
+                  ;; It also skips the cycle detection of
+                  ;; `peg--translate-rule-body', which is not the main
+                  ;; purpose but we can live with it.
+                  (apply #'peg--translate exp)
+                (peg--translate-rule-body name exp))))
+         (eval-and-compile
+           ;; FIXME: We shouldn't need this any more since the info is now
+           ;; stored in the function, but sadly we need to find a name's EXP
+           ;; during compilation (i.e. before the `defalias' is executed)
+           ;; as part of cycle-detection!
+           (put ',id 'peg--rule-definition ',exp)
+           ,@(when inline
+               ;; FIXME: Copied from `defsubst'.
+               `(;; Never native-compile defsubsts as we need the byte
+                 ;; definition in `byte-compile-unfold-bcf' to perform the
+                 ;; inlining (Bug#42664, Bug#43280, Bug#44209).
+                 ,(byte-run--set-speed id nil -1)
+                 (put ',id 'byte-optimizer #'byte-compile-inline-expand))))))))
+
+(defmacro define-peg-ruleset (name &rest rules)
+  "Define a set of PEG rules for later use, e.g., in `with-peg-rules'."
+  (declare (indent 1))
+  (let ((defs ())
+        (aliases ()))
+    (dolist (rule rules)
+      (let* ((rname (car rule))
+             (full-rname (format "%s %s" name rname)))
+        (push `(define-peg-rule ,full-rname . ,(cdr rule)) defs)
+        (push `(,(peg--rule-id rname) #',(peg--rule-id full-rname)) aliases)))
+    `(cl-flet ,aliases
+       ,@defs
+       (eval-and-compile (put ',name 'peg--rules ',aliases)))))
+
+(defmacro with-peg-rules (rules &rest body)
+  "Make PEG rules RULES available within the scope of BODY.
+RULES is a list of rules of the form (NAME . PEXS), where PEXS is a sequence
+of PEG expressions, implicitly combined with `and'.
+RULES can also contain symbols in which case these must name
+rulesets defined previously with `define-peg-ruleset'."
+  (declare (indent 1) (debug (sexp form))) ;FIXME: `sexp' is not good enough!
+  (let* ((rulesets nil)
+         (rules
+          ;; First, macroexpand the rules.
+          (delq nil
+                (mapcar (lambda (rule)
+                          (if (symbolp rule)
+                              (progn (push rule rulesets) nil)
+                            (cons (car rule) (peg-normalize `(and . ,(cdr rule))))))
+                        rules)))
+        (ctx (assq :peg-rules macroexpand-all-environment)))
+    (macroexpand-all
+     `(cl-labels
+          ,(mapcar (lambda (rule)
+                    ;; FIXME: Use `peg--lambda' as well.
+                    `(,(peg--rule-id (car rule))
+                      ()
+                      ,(peg--translate-rule-body (car rule) (cdr rule))))
+                  rules)
+        ,@body)
+     `((:peg-rules ,@(append rules (cdr ctx)))
+       ,@macroexpand-all-environment))))
+
+;;;;; Old entry points
+
+(defmacro peg-parse-exp (exp)
+  "Match the parsing expression EXP at point."
+  (declare (obsolete peg-parse "peg-0.9"))
+  `(peg-run (peg ,exp)))
+
+;;;; The actual implementation
+
+(defun peg--lookup-rule (name)
+  (or (cdr (assq name (cdr (assq :peg-rules macroexpand-all-environment))))
+      ;; With `peg-function' objects, we can recover the PEG from which it was
+      ;; defined, but this info is not yet available at compile-time.  :-(
+      ;;(let ((id (peg--rule-id name)))
+      ;;  (peg-function--pexs (symbol-function id)))
+      (get (peg--rule-id name) 'peg--rule-definition)))
+
+(defun peg--rule-id (name)
+  (intern (format "peg-rule %s" name)))
+
+(define-error 'peg-search-failed "Parse error at %d (expecting %S)")
+
+(defun peg-signal-failure (failures)
+  (signal 'peg-search-failed (list (point) failures)))
+
+(defun peg-parse-at-point (peg-matcher)
+  "Parse text at point according to the PEG rule PEG-MATCHER."
+  (declare (obsolete peg-run "peg-1.0"))
+  (peg-run peg-matcher
+           #'peg-signal-failure
+           (lambda (f) (let ((r (funcall f))) (if (listp r) r)))))
+
+;; Internally we use a regularized syntax, e.g. we only have binary OR
+;; nodes.  Regularized nodes are lists of the form (OP ARGS...).
+(cl-defgeneric peg-normalize (exp)
+  "Return a \"normalized\" form of EXP."
+  (error "Invalid parsing expression: %S" exp))
+
+(cl-defmethod peg-normalize ((exp string))
+  (let ((len (length exp)))
+    (cond ((zerop len) '(guard t))
+         ((= len 1) `(char ,(aref exp 0)))
+         (t `(str ,exp)))))
+
+(cl-defmethod peg-normalize ((exp symbol))
+  ;; (peg--lookup-rule exp)
+  `(call ,exp))
+
+(cl-defmethod peg-normalize ((exp vector))
+  (peg-normalize `(set . ,(append exp '()))))
+
+(cl-defmethod peg-normalize ((exp cons))
+  (apply #'peg--macroexpand exp))
+
+(defconst peg-leaf-types '(any call action char range str set
+                          guard syntax-class = funcall))
+
+(cl-defgeneric peg--macroexpand (head &rest args)
+  (cond
+   ((memq head peg-leaf-types) (cons head args))
+   (t `(call ,head ,@args))))
+
+(cl-defmethod peg--macroexpand ((_ (eql or)) &rest args)
+  (cond ((null args) '(guard nil))
+       ((null (cdr args)) (peg-normalize (car args)))
+       (t `(or ,(peg-normalize (car args))
+               ,(peg-normalize `(or . ,(cdr args)))))))
+
+(cl-defmethod peg--macroexpand ((_ (eql and)) &rest args)
+  (cond ((null args) '(guard t))
+       ((null (cdr args)) (peg-normalize (car args)))
+       (t `(and ,(peg-normalize (car args))
+                ,(peg-normalize `(and . ,(cdr args)))))))
+
+(cl-defmethod peg--macroexpand ((_ (eql *)) &rest args)
+  `(* ,(peg-normalize `(and . ,args))))
+
+;; FIXME: this duplicates code; could use some loop to avoid that
+(cl-defmethod peg--macroexpand ((_ (eql +)) &rest args)
+  (let ((e (peg-normalize `(and . ,args))))
+    `(and ,e (* ,e))))
+
+(cl-defmethod peg--macroexpand ((_ (eql opt)) &rest args)
+  (let ((e (peg-normalize `(and . ,args))))
+    `(or ,e (guard t))))
+
+(cl-defmethod peg--macroexpand ((_ (eql if)) &rest args)
+  `(if ,(peg-normalize `(and . ,args))))
+
+(cl-defmethod peg--macroexpand ((_ (eql not)) &rest args)
+  `(not ,(peg-normalize `(and . ,args))))
+
+(cl-defmethod peg--macroexpand ((_ (eql \`)) form)
+  (peg-normalize `(stack-action ,form)))
+
+(cl-defmethod peg--macroexpand ((_ (eql stack-action)) form)
+  (unless (member '-- form)
+    (error "Malformed stack action: %S" form))
+  (let ((args (cdr (member '-- (reverse form))))
+       (values (cdr (member '-- form))))
+    (let ((form `(let ,(mapcar (lambda (var) `(,var (pop peg--stack))) args)
+                  ,@(mapcar (lambda (val) `(push ,val peg--stack)) values))))
+      `(action ,form))))
+
+(defvar peg-char-classes
+  '(ascii alnum alpha blank cntrl digit graph lower multibyte nonascii print
+         punct space unibyte upper word xdigit))
+
+(cl-defmethod peg--macroexpand ((_ (eql set)) &rest specs)
+  (cond ((null specs) '(guard nil))
+       ((and (null (cdr specs))
+             (let ((range (peg-range-designator (car specs))))
+               (and range `(range ,(car range) ,(cdr range))))))
+       (t
+        (let ((chars '()) (ranges '()) (classes '()))
+          (while specs
+            (let* ((spec (pop specs))
+                   (range (peg-range-designator spec)))
+              (cond (range
+                     (push range ranges))
+                    ((peg-characterp spec)
+                     (push spec chars))
+                    ((stringp spec)
+                     (setq chars (append (reverse (append spec ())) chars)))
+                    ((memq spec peg-char-classes)
+                     (push spec classes))
+                    (t (error "Invalid set specifier: %S" spec)))))
+          (setq ranges (reverse ranges))
+          (setq chars (delete-dups (reverse chars)))
+          (setq classes (reverse classes))
+          (cond ((and (null ranges)
+                      (null classes)
+                      (cond ((null chars) '(guard nil))
+                            ((null (cdr chars)) `(char ,(car chars))))))
+                (t `(set ,ranges ,chars ,classes)))))))
+
+(defun peg-range-designator (x)
+  (and (symbolp x)
+       (let ((str (symbol-name x)))
+        (and (= (length str) 3)
+             (eq (aref str 1) ?-)
+             (< (aref str 0) (aref str 2))
+             (cons (aref str 0) (aref str 2))))))
+
+;; characterp is new in Emacs 23.
+(defun peg-characterp (x)
+  (if (fboundp 'characterp)
+      (characterp x)
+    (integerp x)))
+
+(cl-defmethod peg--macroexpand ((_ (eql list)) &rest args)
+  (peg-normalize
+   (let ((marker (make-symbol "magic-marker")))
+     `(and (stack-action (-- ',marker))
+          ,@args
+          (stack-action (--
+                         (let ((l '()))
+                           (while
+                               (let ((e (pop peg--stack)))
+                                 (cond ((eq e ',marker) nil)
+                                       ((null peg--stack)
+                                        (error "No marker on stack"))
+                                       (t (push e l) t))))
+                           l)))))))
+
+(cl-defmethod peg--macroexpand ((_ (eql substring)) &rest args)
+  (peg-normalize
+   `(and `(-- (point))
+        ,@args
+        `(start -- (buffer-substring-no-properties start (point))))))
+
+(cl-defmethod peg--macroexpand ((_ (eql region)) &rest args)
+  (peg-normalize
+   `(and `(-- (point))
+        ,@args
+        `(-- (point)))))
+
+(cl-defmethod peg--macroexpand ((_ (eql replace)) pe replacement)
+  (peg-normalize
+   `(and (stack-action (-- (point)))
+        ,pe
+        (stack-action (start -- (progn
+                                  (delete-region start (point))
+                                  (insert-before-markers ,replacement))))
+        (stack-action (_ --)))))
+
+(cl-defmethod peg--macroexpand ((_ (eql quote)) _form)
+  (error "quote is reserved for future use"))
+
+(cl-defgeneric peg--translate (head &rest args)
+  (error "No translator for: %S" (cons head args)))
+
+(defun peg--translate-rule-body (name exp)
+  (let ((msg (condition-case err
+                 (progn (peg-detect-cycles exp (list name)) nil)
+               (error (error-message-string err))))
+        (code (peg-translate-exp exp)))
+    (cond
+     ((null msg) code)
+     ((fboundp 'macroexp--warn-and-return)
+      (macroexp--warn-and-return msg code))
+     (t
+      (message "%s" msg)
+      code))))
+
+;; This is the main translation function.
+(defun peg-translate-exp (exp)
+  "Return the ELisp code to match the PE EXP."
+  ;; FIXME: This expansion basically duplicates `exp' in the output, which is
+  ;; a serious problem because it's done recursively, so it makes the output
+  ;; code's size exponentially larger than the input!
+  `(or ,(apply #'peg--translate exp)
+       (peg--record-failure ',exp))) ; for error reporting
+
+(define-obsolete-function-alias 'peg-record-failure
+  #'peg--record-failure "peg-1.0")
+(defun peg--record-failure (exp)
+  (cond ((= (point) (car peg--errors))
+        (setcdr peg--errors (cons exp (cdr peg--errors))))
+       ((> (point) (car peg--errors))
+        (setq peg--errors (list (point) exp))))
+  nil)
+
+(cl-defmethod peg--translate ((_ (eql and)) e1 e2)
+  `(and ,(peg-translate-exp e1)
+       ,(peg-translate-exp e2)))
+
+;; Choicepoints are used for backtracking.  At a choicepoint we save
+;; enough state, so that we can continue from there if needed.
+(defun peg--choicepoint-moved-p (choicepoint)
+  `(/= ,(car choicepoint) (point)))
+
+(defun peg--choicepoint-restore (choicepoint)
+  `(progn
+     (goto-char ,(car choicepoint))
+     (setq peg--actions ,(cdr choicepoint))))
+
+(defmacro peg--with-choicepoint (var &rest body)
+  (declare (indent 1) (debug (symbolp form)))
+  `(let ((,var (cons (make-symbol "point") (make-symbol "actions"))))
+     `(let ((,(car ,var) (point))
+           (,(cdr ,var) peg--actions))
+        ,@(list ,@body))))
+
+(cl-defmethod peg--translate ((_ (eql or)) e1 e2)
+  (peg--with-choicepoint cp
+    `(or ,(peg-translate-exp e1)
+        (,@(peg--choicepoint-restore cp)
+         ,(peg-translate-exp e2)))))
+
+(cl-defmethod peg--translate ((_ (eql with)) rules &rest exps)
+  `(with-peg-rules ,rules ,(peg--translate `(and . ,exps))))
+
+(cl-defmethod peg--translate ((_ (eql guard)) exp) exp)
+
+(defvar peg-syntax-classes
+  '((whitespace ?-) (word ?w) (symbol ?s) (punctuation ?.)
+    (open ?\() (close ?\)) (string ?\") (escape ?\\) (charquote ?/)
+    (math ?$) (prefix ?') (comment ?<) (endcomment ?>)
+    (comment-fence ?!) (string-fence ?|)))
+
+(cl-defmethod peg--translate ((_ (eql syntax-class)) class)
+  (let ((probe (assoc class peg-syntax-classes)))
+    (cond (probe `(when (looking-at ,(format "\\s%c" (cadr probe)))
+                    (forward-char)
+                    t))
+         (t (error "Invalid syntax class: %S\nMust be one of: %s" class
+                   (mapcar #'car peg-syntax-classes))))))
+
+(cl-defmethod peg--translate ((_ (eql =)) string)
+  `(let ((str ,string))
+     (when (zerop (length str))
+       (error "Empty strings not allowed for ="))
+     (search-forward str (+ (point) (length str)) t)))
+
+(cl-defmethod peg--translate ((_ (eql *)) e)
+  `(progn (while ,(peg--with-choicepoint cp
+                   `(if ,(peg-translate-exp e)
+                         ;; Just as regexps do for the `*' operator,
+                         ;; we allow the body of `*' loops to match
+                         ;; the empty string, but we don't repeat the loop if
+                         ;; we haven't moved, to avoid inf-loops.
+                         ,(peg--choicepoint-moved-p cp)
+                       ,(peg--choicepoint-restore cp)
+                      nil)))
+         t))
+
+(cl-defmethod peg--translate ((_ (eql if)) e)
+  (peg--with-choicepoint cp
+    `(when ,(peg-translate-exp e)
+       ,(peg--choicepoint-restore cp)
+       t)))
+
+(cl-defmethod peg--translate ((_ (eql not)) e)
+  (peg--with-choicepoint cp
+    `(unless ,(peg-translate-exp e)
+       ,(peg--choicepoint-restore cp)
+       t)))
+
+(cl-defmethod peg--translate ((_ (eql any)) )
+  '(when (not (eobp))
+     (forward-char)
+     t))
+
+(cl-defmethod peg--translate ((_ (eql char)) c)
+  `(when (eq (char-after) ',c)
+     (forward-char)
+     t))
+
+(cl-defmethod peg--translate ((_ (eql set)) ranges chars classes)
+  `(when (looking-at ',(peg-make-charset-regexp ranges chars classes))
+     (forward-char)
+     t))
+
+(defun peg-make-charset-regexp (ranges chars classes)
+  (when (and (not ranges) (not classes) (<= (length chars) 1))
+    (error "Bug"))
+  (let ((rbracket (member ?\] chars))
+       (minus (member ?- chars))
+       (hat (member ?^ chars)))
+    (dolist (c '(?\] ?- ?^))
+      (setq chars (remove c chars)))
+    (format "[%s%s%s%s%s%s]"
+           (if rbracket "]" "")
+           (if minus "-" "")
+           (mapconcat (lambda (x) (format "%c-%c" (car x) (cdr x))) ranges "")
+           (mapconcat (lambda (c) (format "[:%s:]" c)) classes "")
+           (mapconcat (lambda (c) (format "%c" c)) chars "")
+           (if hat "^" ""))))
+
+(cl-defmethod peg--translate ((_ (eql range)) from to)
+  `(when (and (char-after)
+             (<= ',from (char-after))
+             (<= (char-after) ',to))
+     (forward-char)
+     t))
+
+(cl-defmethod peg--translate ((_ (eql str)) str)
+  `(when (looking-at ',(regexp-quote str))
+     (goto-char (match-end 0))
+     t))
+
+(cl-defmethod peg--translate ((_ (eql call)) name &rest args)
+  `(,(peg--rule-id name) ,@args))
+
+(cl-defmethod peg--translate ((_ (eql funcall)) exp &rest args)
+  `(funcall ,exp ,@args))
+
+(cl-defmethod peg--translate ((_ (eql action)) form)
+  `(progn
+     (push (cons (point) (lambda () ,form)) peg--actions)
+     t))
+
+(defvar peg--stack nil)
+(defun peg-postprocess (actions)
+  "Execute \"actions\"."
+  (let  ((peg--stack '())
+         (forw-actions ()))
+    (pcase-dolist (`(,pos . ,thunk) actions)
+      (push (cons (copy-marker pos) thunk) forw-actions))
+    (pcase-dolist (`(,pos . ,thunk) forw-actions)
+      (goto-char pos)
+      (funcall thunk))
+    (or peg--stack t)))
+
+;; Left recursion is presumably a common mistake when using PEGs.
+;; Here we try to detect such mistakes.  Essentially we traverse the
+;; graph as long as we can without consuming input.  When we find a
+;; recursive call we signal an error.
+
+(defun peg-detect-cycles (exp path)
+  "Signal an error on a cycle.
+Otherwise traverse EXP recursively and return T if EXP can match
+without consuming input.  Return nil if EXP definitely consumes
+input.  PATH is the list of rules that we have visited so far."
+  (apply #'peg--detect-cycles path exp))
+
+(cl-defgeneric peg--detect-cycles (head _path &rest args)
+  (error "No detect-cycle method for: %S" (cons head args)))
+
+(cl-defmethod peg--detect-cycles (path (_ (eql call)) name)
+  (if (member name path)
+      (error "Possible left recursion: %s"
+            (mapconcat (lambda (x) (format "%s" x))
+                       (reverse (cons name path)) " -> "))
+    (let ((exp (peg--lookup-rule name)))
+      (if (null exp)
+          ;; If there's no rule by that name, either we'll fail at
+          ;; run-time or it will be defined later.  In any case, at this
+          ;; point there's no evidence of a cycle, and if a cycle appears
+          ;; later we'll hopefully catch it when the rule gets defined.
+          ;; FIXME: In practice, if `name' is part of the cycle, we will
+          ;; indeed detect it when it gets defined, but OTOH if `name'
+          ;; is not part of a cycle but it *enables* a cycle because
+          ;; it matches the empty string (i.e. we should have returned t
+          ;; here), then we may not catch the problem at all :-(
+          nil
+       (peg-detect-cycles exp (cons name path))))))
+
+(cl-defmethod peg--detect-cycles (path (_ (eql and)) e1 e2)
+  (and (peg-detect-cycles e1 path)
+       (peg-detect-cycles e2 path)))
+
+(cl-defmethod peg--detect-cycles (path (_ (eql or)) e1 e2)
+  (or (peg-detect-cycles e1 path)
+      (peg-detect-cycles e2 path)))
+
+(cl-defmethod peg--detect-cycles (path (_ (eql *)) e)
+  (peg-detect-cycles e path)
+  t)
+
+(cl-defmethod peg--detect-cycles (path (_ (eql if)) e)
+  (peg-unary-nullable e path))
+(cl-defmethod peg--detect-cycles (path (_ (eql not)) e)
+  (peg-unary-nullable e path))
+
+(defun peg-unary-nullable (exp path)
+  (peg-detect-cycles exp path)
+  t)
+
+(cl-defmethod peg--detect-cycles (_path (_ (eql any)))           nil)
+(cl-defmethod peg--detect-cycles (_path (_ (eql char)) _c)       nil)
+(cl-defmethod peg--detect-cycles (_path (_ (eql set)) _r _c _k)  nil)
+(cl-defmethod peg--detect-cycles (_path (_ (eql range)) _c1 _c2) nil)
+(cl-defmethod peg--detect-cycles (_path (_ (eql str)) s)         (equal s ""))
+(cl-defmethod peg--detect-cycles (_path (_ (eql guard)) _e)      t)
+(cl-defmethod peg--detect-cycles (_path (_ (eql =)) _s)          nil)
+(cl-defmethod peg--detect-cycles (_path (_ (eql syntax-class)) _n) nil)
+(cl-defmethod peg--detect-cycles (_path (_ (eql action)) _form)  t)
+
+(defun peg-merge-errors (exps)
+  "Build a more readable error message out of failed expression."
+  (let ((merged '()))
+    (dolist (exp exps)
+      (setq merged (peg-merge-error exp merged)))
+    merged))
+
+(defun peg-merge-error (exp merged)
+  (apply #'peg--merge-error merged exp))
+
+(cl-defgeneric peg--merge-error (_merged head &rest args)
+  (error "No merge-error method for: %S" (cons head args)))
+
+(cl-defmethod peg--merge-error (merged (_ (eql or)) e1 e2)
+  (peg-merge-error e2 (peg-merge-error e1 merged)))
+
+(cl-defmethod peg--merge-error (merged (_ (eql and)) e1 _e2)
+  ;; FIXME: Why is `e2' not used?
+  (peg-merge-error e1 merged))
+
+(cl-defmethod peg--merge-error (merged (_ (eql str)) str)
+  ;;(add-to-list 'merged str)
+  (cl-adjoin str merged :test #'equal))
+
+(cl-defmethod peg--merge-error (merged (_ (eql call)) rule)
+  ;; (add-to-list 'merged rule)
+  (cl-adjoin rule merged :test #'equal))
+
+(cl-defmethod peg--merge-error (merged (_ (eql char)) char)
+  ;; (add-to-list 'merged (string char))
+  (cl-adjoin (string char) merged :test #'equal))
+
+(cl-defmethod peg--merge-error (merged (_ (eql set)) r c k)
+  ;; (add-to-list 'merged (peg-make-charset-regexp r c k))
+  (cl-adjoin (peg-make-charset-regexp r c k) merged :test #'equal))
+
+(cl-defmethod peg--merge-error (merged (_ (eql range)) from to)
+  ;; (add-to-list 'merged (format "[%c-%c]" from to))
+  (cl-adjoin (format "[%c-%c]" from to) merged :test #'equal))
+
+(cl-defmethod peg--merge-error (merged (_ (eql *)) exp)
+  (peg-merge-error exp merged))
+
+(cl-defmethod peg--merge-error (merged (_ (eql any)))
+  ;; (add-to-list 'merged '(any))
+  (cl-adjoin '(any) merged :test #'equal))
+
+(cl-defmethod peg--merge-error (merged (_ (eql not)) x)
+  ;; (add-to-list 'merged `(not ,x))
+  (cl-adjoin `(not ,x) merged :test #'equal))
+
+(cl-defmethod peg--merge-error (merged (_ (eql action)) _action) merged)
+(cl-defmethod peg--merge-error (merged (_ (eql null))) merged)
+
+(provide 'peg)
+(require 'peg)
+
+(define-peg-rule null () :inline t (guard t))
+(define-peg-rule fail () :inline t (guard nil))
+(define-peg-rule bob  () :inline t (guard (bobp)))
+(define-peg-rule eob  () :inline t (guard (eobp)))
+(define-peg-rule bol  () :inline t (guard (bolp)))
+(define-peg-rule eol  () :inline t (guard (eolp)))
+(define-peg-rule bow  () :inline t (guard (looking-at "\\<")))
+(define-peg-rule eow  () :inline t (guard (looking-at "\\>")))
+(define-peg-rule bos  () :inline t (guard (looking-at "\\_<")))
+(define-peg-rule eos  () :inline t (guard (looking-at "\\_>")))
+
+;;; peg.el ends here
diff --git a/test/lisp/peg-tests.el b/test/lisp/peg-tests.el

new file mode 100644 (file)

index 0000000..864e09b
--- /dev/null
+++ b/test/lisp/peg-tests.el
@@ -0,0 +1,367 @@
+;;; peg-tests.el --- Tests of PEG parsers            -*- lexical-binding: t; -*-
+
+;; Copyright (C) 2008-2023  Free Software Foundation, Inc.
+
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;; Tests and examples, that used to live in peg.el wrapped inside an `eval'.
+
+;;; Code:
+
+(require 'peg)
+(require 'ert)
+
+;;; Tests:
+
+(defmacro peg-parse-string (pex string &optional noerror)
+  "Parse STRING according to PEX.
+If NOERROR is non-nil, push nil resp. t if the parse failed
+resp. succeeded instead of signaling an error."
+  (let ((oldstyle (consp (car-safe pex)))) ;PEX is really a list of rules.
+    `(with-temp-buffer
+       (insert ,string)
+       (goto-char (point-min))
+       ,(if oldstyle
+            `(with-peg-rules ,pex
+               (peg-run (peg ,(caar pex))
+                        ,(unless noerror '#'peg-signal-failure)))
+          `(peg-run (peg ,pex)
+                    ,(unless noerror '#'peg-signal-failure))))))
+
+(define-peg-rule peg-test-natural ()
+  [0-9] (* [0-9]))
+
+(ert-deftest peg-test ()
+  (should (peg-parse-string peg-test-natural "99 bottles" t))
+  (should (peg-parse-string ((s "a")) "a" t))
+  (should (not (peg-parse-string ((s "a")) "b" t)))
+  (should (peg-parse-string ((s (not "a"))) "b" t))
+  (should (not (peg-parse-string ((s (not "a"))) "a" t)))
+  (should (peg-parse-string ((s (if "a"))) "a" t))
+  (should (not (peg-parse-string ((s (if "a"))) "b" t)))
+  (should (peg-parse-string ((s "ab")) "ab" t))
+  (should (not (peg-parse-string ((s "ab")) "ba" t)))
+  (should (not (peg-parse-string ((s "ab")) "a" t)))
+  (should (peg-parse-string ((s (range ?0 ?9))) "0" t))
+  (should (not (peg-parse-string ((s (range ?0 ?9))) "a" t)))
+  (should (peg-parse-string ((s [0-9])) "0" t))
+  (should (not (peg-parse-string ((s [0-9])) "a" t)))
+  (should (not (peg-parse-string ((s [0-9])) "" t)))
+  (should (peg-parse-string ((s (any))) "0" t))
+  (should (not (peg-parse-string ((s (any))) "" t)))
+  (should (peg-parse-string ((s (eob))) "" t))
+  (should (peg-parse-string ((s (not (eob)))) "a" t))
+  (should (peg-parse-string ((s (or "a" "b"))) "a" t))
+  (should (peg-parse-string ((s (or "a" "b"))) "b" t))
+  (should (not (peg-parse-string ((s (or "a" "b"))) "c" t)))
+  (should (peg-parse-string (and "a" "b") "ab" t))
+  (should (peg-parse-string ((s (and "a" "b"))) "abc" t))
+  (should (not (peg-parse-string (and "a" "b") "ba" t)))
+  (should (peg-parse-string ((s (and "a" "b" "c"))) "abc" t))
+  (should (peg-parse-string ((s (* "a") "b" (eob))) "b" t))
+  (should (peg-parse-string ((s (* "a") "b" (eob))) "ab" t))
+  (should (peg-parse-string ((s (* "a") "b" (eob))) "aaab" t))
+  (should (not (peg-parse-string ((s (* "a") "b" (eob))) "abc" t)))
+  (should (peg-parse-string ((s "")) "abc" t))
+  (should (peg-parse-string ((s "" (eob))) "" t))
+  (should (peg-parse-string ((s (opt "a") "b")) "abc" t))
+  (should (peg-parse-string ((s (opt "a") "b")) "bc" t))
+  (should (not (peg-parse-string ((s (or))) "ab" t)))
+  (should (peg-parse-string ((s (and))) "ab" t))
+  (should (peg-parse-string ((s (and))) "" t))
+  (should (peg-parse-string ((s ["^"])) "^" t))
+  (should (peg-parse-string ((s ["^a"])) "a" t))
+  (should (peg-parse-string ["-"] "-" t))
+  (should (peg-parse-string ((s ["]-"])) "]" t))
+  (should (peg-parse-string ((s ["^]"])) "^" t))
+  (should (peg-parse-string ((s [alpha])) "z" t))
+  (should (not (peg-parse-string ((s [alpha])) "0" t)))
+  (should (not (peg-parse-string ((s [alpha])) "" t)))
+  (should (not (peg-parse-string ((s ["][:alpha:]"])) "z" t)))
+  (should (peg-parse-string ((s (bob))) "" t))
+  (should (peg-parse-string ((s (bos))) "x" t))
+  (should (not (peg-parse-string ((s (bos))) " x" t)))
+  (should (peg-parse-string ((s "x" (eos))) "x" t))
+  (should (peg-parse-string ((s (syntax-class whitespace))) " " t))
+  (should (peg-parse-string ((s (= "foo"))) "foo" t))
+  (should (let ((f "foo")) (peg-parse-string ((s (= f))) "foo" t)))
+  (should (not (peg-parse-string ((s (= "foo"))) "xfoo" t)))
+  (should (equal (peg-parse-string ((s `(-- 1 2))) "") '(2 1)))
+  (should (equal (peg-parse-string ((s `(-- 1 2) `(a b -- a b))) "") '(2 1)))
+  (should (equal (peg-parse-string ((s (or (and (any) s)
+                                          (substring [0-9]))))
+                                  "ab0cd1ef2gh")
+                '("2")))
+  ;; The PEG rule `other' doesn't exist, which will cause a byte-compiler
+  ;; warning, but not an error at run time because the rule is not actually
+  ;; used in this particular case.
+  (should (equal (peg-parse-string ((s (substring (or "a" other)))
+                                    ;; Unused left-recursive rule, should
+                                    ;; cause a byte-compiler warning.
+                                    (r (* "a") r))
+                                   "af")
+                 '("a")))
+  (should (equal (peg-parse-string ((s (list x y))
+                                   (x `(-- 1))
+                                   (y `(-- 2)))
+                                  "")
+                '((1 2))))
+  (should (equal (peg-parse-string ((s (list (* x)))
+                                    (x "" `(-- 'x)))
+                                   "xxx")
+                 ;; The empty loop body should be matched once!
+                 '((x))))
+  (should (equal (peg-parse-string ((s (list (* x)))
+                                    (x "x" `(-- 'x)))
+                                   "xxx")
+                 '((x x x))))
+  (should (equal (peg-parse-string ((s (region (* x)))
+                                   (x "x" `(-- 'x)))
+                                  "xxx")
+                 ;; FIXME: Since string positions start at 0, this should
+                 ;; really be '(3 x x x 0) !!
+                '(4 x x x 1)))
+  (should (equal (peg-parse-string ((s (region (list (* x))))
+                                   (x "x" `(-- 'x 'y)))
+                                  "xxx")
+                '(4 (x y x y x y) 1)))
+  (should (equal (with-temp-buffer
+                  (save-excursion (insert "abcdef"))
+                  (list
+                   (peg-run (peg "a"
+                                 (replace "bc" "x")
+                                 (replace "de" "y")
+                                 "f"))
+                   (buffer-string)))
+                '(t "axyf")))
+  (with-temp-buffer
+    (insert "toro")
+    (goto-char (point-min))
+    (should (peg-run (peg "to")))
+    (should-not (peg-run (peg "to")))
+    (should (peg-run (peg "ro")))
+    (should (eobp)))
+  (with-temp-buffer
+    (insert "   ")
+    (goto-char (point-min))
+    (peg-run (peg (+ (syntax-class whitespace))))
+    (should (eobp)))
+  )
+
+;;; Examples:
+
+;; peg-ex-recognize-int recognizes integers.  An integer begins with a
+;; optional sign, then follows one or more digits.  Digits are all
+;; characters from 0 to 9.
+;;
+;; Notes:
+;; 1) "" matches the empty sequence, i.e. matches without consuming
+;;    input.
+;; 2) [0-9] is the character range from 0 to 9.  This can also be
+;;    written as (range ?0 ?9).  Note that 0-9 is a symbol.
+(defun peg-ex-recognize-int ()
+  (with-peg-rules ((number   sign digit (* digit))
+                  (sign     (or "+" "-" ""))
+                  (digit    [0-9]))
+    (peg-run (peg number))))
+
+;; peg-ex-parse-int recognizes integers and computes the corresponding
+;; value.  The grammar is the same as for `peg-ex-recognize-int'
+;; augmented with parsing actions.  Unfortunaletly, the actions add
+;; quite a bit of clutter.
+;;
+;; The actions for the sign rule push -1 on the stack for a minus sign
+;; and 1 for plus or no sign.
+;;
+;; The action for the digit rule pushes the value for a single digit.
+;;
+;; The action `(a b -- (+ (* a 10) b)), takes two items from the stack
+;; and pushes the first digit times 10 added to the second digit.
+;;
+;; The action `(sign val -- (* sign val)), multiplies val with the
+;; sign (1 or -1).
+(defun peg-ex-parse-int ()
+  (with-peg-rules ((number sign digit (* digit
+                                        `(a b -- (+ (* a 10) b)))
+                          `(sign val -- (* sign val)))
+                   (sign (or (and "+" `(-- 1))
+                            (and "-" `(-- -1))
+                            (and ""  `(-- 1))))
+                  (digit [0-9] `(-- (- (char-before) ?0))))
+    (peg-run (peg number))))
+
+;; Put point after the ) and press C-x C-e
+;; (peg-ex-parse-int)-234234
+
+;; Parse arithmetic expressions and compute the result as side effect.
+(defun peg-ex-arith ()
+  (peg-parse
+   (expr _ sum eol)
+   (sum product (* (or (and "+" _ product `(a b -- (+ a b)))
+                      (and "-" _ product `(a b -- (- a b))))))
+   (product value (* (or (and "*" _ value `(a b -- (* a b)))
+                        (and "/" _ value `(a b -- (/ a b))))))
+   (value (or (and (substring number) `(string -- (string-to-number string)))
+             (and "(" _ sum ")" _)))
+   (number (+ [0-9]) _)
+   (_ (* [" \t"]))
+   (eol (or "\n" "\r\n" "\r"))))
+
+;; (peg-ex-arith)   1 + 2 * 3 * (4 + 5)
+;; (peg-ex-arith)   1 + 2 ^ 3 * (4 + 5)  ; fails to parse
+
+;; Parse URI according to RFC 2396.
+(defun peg-ex-uri ()
+  (peg-parse
+   (URI-reference (or absoluteURI relativeURI)
+                 (or (and "#" (substring fragment))
+                     `(-- nil))
+                 `(scheme user host port path query fragment --
+                          (list :scheme scheme :user user
+                                :host host :port port
+                                :path path :query query
+                                :fragment fragment)))
+   (absoluteURI (substring scheme) ":" (or hier-part opaque-part))
+   (hier-part ;(-- user host port path query)
+    (or net-path
+       (and `(-- nil nil nil)
+            abs-path))
+    (or (and "?" (substring query))
+       `(-- nil)))
+   (net-path "//" authority (or abs-path `(-- nil)))
+   (abs-path "/" path-segments)
+   (path-segments segment (list (* "/" segment)) `(s l -- (cons s l)))
+   (segment (substring (* pchar) (* ";" param)))
+   (param (* pchar))
+   (pchar (or unreserved escaped [":@&=+$,"]))
+   (query (* uric))
+   (fragment (* uric))
+   (relativeURI (or net-path abs-path rel-path) (opt "?" query))
+   (rel-path rel-segment (opt abs-path))
+   (rel-segment (+ unreserved escaped [";@&=+$,"]))
+   (authority (or server reg-name))
+   (server (or (and (or (and (substring userinfo) "@")
+                       `(-- nil))
+                   hostport)
+              `(-- nil nil nil)))
+   (userinfo (* (or unreserved escaped [";:&=+$,"])))
+   (hostport (substring host) (or (and ":" (substring port))
+                                 `(-- nil)))
+   (host (or hostname ipv4address))
+   (hostname (* domainlabel ".") toplabel (opt "."))
+   (domainlabel alphanum
+               (opt (* (or alphanum "-") (if alphanum))
+                    alphanum))
+   (toplabel alpha
+            (* (or alphanum "-") (if alphanum))
+            alphanum)
+   (ipv4address (+ digit) "." (+ digit) "." (+ digit) "." (+ digit))
+   (port (* digit))
+   (scheme alpha (* (or alpha digit ["+-."])))
+   (reg-name (or unreserved escaped ["$,;:@&=+"]))
+   (opaque-part uric-no-slash (* uric))
+   (uric (or reserved unreserved escaped))
+   (uric-no-slash (or unreserved escaped [";?:@&=+$,"]))
+   (reserved (set ";/?:@&=+$,"))
+   (unreserved (or alphanum mark))
+   (escaped "%" hex hex)
+   (hex (or digit [A-F] [a-f]))
+   (mark (set "-_.!~*'()"))
+   (alphanum (or alpha digit))
+   (alpha (or lowalpha upalpha))
+   (lowalpha [a-z])
+   (upalpha [A-Z])
+   (digit [0-9])))
+
+;; (peg-ex-uri)http://luser@www.foo.com:8080/bar/baz.html?x=1#foo
+;; (peg-ex-uri)file:/bar/baz.html?foo=df#x
+
+;; Split STRING where SEPARATOR occurs.
+(defun peg-ex-split (string separator)
+  (peg-parse-string ((s (list (* (* sep) elt)))
+                    (elt (substring (+ (not sep) (any))))
+                    (sep (= separator)))
+                   string))
+
+;; (peg-ex-split "-abc-cd-" "-")
+
+;; Parse a lisp style Sexp.
+;; [To keep the example short, ' and . are handled as ordinary symbol.]
+(defun peg-ex-lisp ()
+  (peg-parse
+   (sexp _ (or string list number symbol))
+   (_ (* (or [" \n\t"] comment)))
+   (comment ";" (* (not (or "\n" (eob))) (any)))
+   (string "\"" (substring  (* (not "\"") (any))) "\"")
+   (number (substring (opt (set "+-")) (+ digit))
+          (if terminating)
+          `(string -- (string-to-number string)))
+   (symbol (substring (and symchar (* (not terminating) symchar)))
+          `(s -- (intern s)))
+   (symchar [a-z A-Z 0-9 "-;!#%&'*+,./:;<=>?@[]^_`{|}~"])
+   (list "("           `(-- (cons nil nil)) `(hd -- hd hd)
+        (* sexp        `(tl e -- (setcdr tl (list e))))
+        _ ")"  `(hd _tl -- (cdr hd)))
+   (digit [0-9])
+   (terminating (or (set " \n\t();\"'") (eob)))))
+
+;; (peg-ex-lisp)
+
+;; We try to detect left recursion and report it as error.
+(defun peg-ex-left-recursion ()
+  (eval '(peg-parse (exp (or term
+                            (and exp "+" exp)))
+                   (term (or digit
+                             (and term "*" term)))
+                   (digit [0-9]))
+        t))
+
+(defun peg-ex-infinite-loop ()
+  (eval '(peg-parse (exp (* (or "x"
+                               "y"
+                               (action (foo))))))
+        t))
+
+;; Some efficiency problems:
+
+;; Find the last digit in a string.
+;; Recursive definition with excessive stack usage.
+(defun peg-ex-last-digit (string)
+  (peg-parse-string ((s (or (and (any) s)
+                           (substring [0-9]))))
+                   string))
+
+;; (peg-ex-last-digit "ab0cd1ef2gh")
+;; (peg-ex-last-digit (make-string 50 ?-))
+;; (peg-ex-last-digit (make-string 1000 ?-))
+
+;; Find the last digit without recursion.  Doesn't run out of stack,
+;; but probably still too inefficient for large inputs.
+(defun peg-ex-last-digit2 (string)
+  (peg-parse-string ((s `(-- nil)
+                       (+ (* (not digit) (any))
+                          (substring digit)
+                          `(_d1 d2 -- d2)))
+                    (digit [0-9]))
+                   string))
+
+;; (peg-ex-last-digit2 "ab0cd1ef2gh")
+;; (peg-ex-last-digit2 (concat (make-string 500000 ?-) "8a9b"))
+;; (peg-ex-last-digit2 (make-string 500000 ?-))
+;; (peg-ex-last-digit2 (make-string 500000 ?5))
+
+(provide 'peg-tests)
+;;; peg-tests.el ends here
author	Eric Abrahamsen <eric@ericabrahamsen.net>
	Tue, 6 Dec 2022 05:59:03 +0000 (21:59 -0800)
committer	Eshel Yaron <me@eshelyaron.com>
	Sun, 31 Mar 2024 18:05:29 +0000 (20:05 +0200)
doc/lispref/Makefile.in		patch \| blob \| history
doc/lispref/elisp.texi		patch \| blob \| history
doc/lispref/peg.texi	[new file with mode: 0644]	patch \| blob
etc/NEWS		patch \| blob \| history
lisp/progmodes/peg.el	[new file with mode: 0644]	patch \| blob
test/lisp/peg-tests.el	[new file with mode: 0644]	patch \| blob