From: Yuan Fu Date: Fri, 4 Nov 2022 08:31:56 +0000 (-0700) Subject: Revise tree-sitter facility for multi-language buffers X-Git-Tag: emacs-29.0.90~1719 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=d9d66764e2a02db9f19f5c76ce34ac7d8bc1edc6;p=emacs.git Revise tree-sitter facility for multi-language buffers * doc/lispref/parsing.texi (Multiple Languages): Extend and update manual. * lisp/treesit.el (treesit-range-functions): Remove variable. (treesit-range-settings): New variable. (treesit-range-rules): New function. (treesit--merge-ranges): New function. (treesit-update-ranges): Use treesit-range-settings instead of treesit-range-functions. (treesit-font-lock-rules): Fix docstring. (treesit-indent) (treesit-indent-region): Only update ranges in a region. * test/src/treesit-tests.el (treesit-range): New test. --- diff --git a/doc/lispref/parsing.texi b/doc/lispref/parsing.texi index e280ac8c645..fa06924f58e 100644 --- a/doc/lispref/parsing.texi +++ b/doc/lispref/parsing.texi @@ -1419,49 +1419,30 @@ Like other query functions, this function raises the @code{treesit-query-error} error if @var{query} is malformed. @end defun -@defvar treesit-range-functions -This variable holds the list of range functions. Font-locking and -indenting code use functions in this list to set correct ranges for -a language parser before using it. - -The signature of each function in the list should be: - -@example -(@var{start} @var{end} &rest @var{_}) -@end example - -@noindent -where @var{start} and @var{end} specify the region that is about to be -used. A range function only needs to (but is not limited to) update -ranges in that region. - -The functions in the list are called in order. -@end defvar - -@defun treesit-update-ranges &optional start end -This function is used by font-lock and indentation to update ranges -before using any parser. Each range function in -@var{treesit-range-functions} is called in-order. 
Arguments -@var{start} and @var{end} are passed to each range function. +@defun treesit-update-ranges &optional beg end +This function is used by fontification and indentation to update +ranges before using any parser. It makes sure the parsers' ranges are +set correctly between @var{beg} and @var{end}, according to +@code{treesit-range-settings}. If omitted, @var{beg} defaults to the +beginning of the buffer, and @var{end} defaults to the end. @end defun @vindex treesit-language-at-point-function @defun treesit-language-at pos This function tries to figure out which language is responsible for the text at buffer position @var{pos}. Under the hood it just calls -@code{treesit-language-at-point-function}. +@code{treesit-language-at-point-function}, which is a function that +takes the same argument as this function. Various Lisp programs use this function. For example, the indentation program uses this function to determine which language's rule to use -in a multi-language buffer. So it is important to provide -@code{treesit-language-at-point-function} for a multi-language major -mode. +in a multi-language buffer. @end defun @heading An example Normally, in a set of languages that can be mixed together, there is a -major language and several embedded languages. A Lisp program usually +main language and several embedded languages. A Lisp program usually first parses the whole document with the major language's parser, sets ranges for the embedded languages, and then parses the embedded languages. @@ -1512,6 +1493,51 @@ We use a query pattern @w{@code{(style_element (raw_text) @@capture)}} to find @acronym{CSS} nodes in the @acronym{HTML} parse tree. For how to write query patterns, @pxref{Pattern Matching}. +Emacs can automate the above process in @code{treesit-update-ranges}. +For it to work, a Lisp program should set +@code{treesit-range-settings} to the output of +@code{treesit-range-rules}, like the following. 
+ +@example +@group +(setq-local treesit-range-settings + (treesit-range-rules + :embed 'javascript + :host 'html + '((script_element (raw_text) @@cap)) +@end group + +@group + :embed 'css + :host 'html + '((style_element (raw_text) @@cap)))) +@end group +@end example + +@defun treesit-range-rules :keyword value query... +This function is used to set @code{treesit-range-settings}. It +takes care of compiling queries and other post-processing, and outputs +a value that @code{treesit-range-settings} accepts. + +It takes a series of @var{queries} in either string, s-expression or +compiled form. Before each @var{query} there must be @var{:keyword} +@var{value} pairs that configure the query (and only that query). + +For each query, @code{:embed} keyword specifies the embedded language, +and @code{:host} keyword specifies the host language. Emacs queries +the @var{query} in the host language and uses the result to set ranges +for the embedded language. + +A @var{query} can also be a function that takes two arguments, +@var{start} and @var{end}, and sets the range for parsers. It only +needs to ensure ranges between @var{start} and @var{end} are correct. +When @var{query} is a function, it doesn't need keywords before it. +@end defun + +In summary, a multi-language major mode should set +@code{treesit-language-at-point-function} and @code{treesit-range-settings} +for Emacs to handle multiple languages in the same buffer. + @node Tree-sitter major modes @section Developing major modes with tree-sitter @cindex major mode, developing with tree-sitter diff --git a/lisp/treesit.el b/lisp/treesit.el index 7d961476cd7..21b19299d87 100644 --- a/lisp/treesit.el +++ b/lisp/treesit.el @@ -373,27 +373,133 @@ captured node. Capture names don't matter." ;;; Range API supplement -(defvar-local treesit-range-functions nil - "A list of range functions. -Font-locking and indenting code uses functions in this list to -set correct ranges for a language parser before using it. 
+(defvar-local treesit-range-settings nil + "A list of range settings. -The signature of each function should be +A list of lists of the form - (start end &rest _) + (QUERY LANGUAGE) -where START and END marks the region that is about to be used. A -range function only need to (but not limited to) update ranges in -that region. +When updating the range of each parser in the buffer, +`treesit-update-ranges' queries each QUERY, and sets LANGUAGE's +range to the range spanned by captured nodes. QUERY must be a +compiled query. -Each function in the list is called in-order.") +QUERY can also be a function, in which case it is called with 2 +arguments, START and END. It should ensure parsers' ranges are +correct in that region. -(defun treesit-update-ranges (&optional start end) +The exact form of the variable is considered internal and subject +to change. Use `treesit-range-rules' to set this variable.") + +(defun treesit-range-rules (&rest args) + "Produce settings for `treesit-range-settings'. + +Take a series of QUERIES in either string, s-expression or +compiled form. + +Before each QUERY there must be :KEYWORD VALUE pairs that +configure the query (and only that query). For example, + + (treesit-range-rules + :embed \\='javascript + :host \\='html + \\='((script_element (raw_text) @cap))) + +For each query, :embed keyword specifies the embedded language, +and :host keyword specifies the host language. Emacs queries the +QUERY in the host language and uses the result to set ranges for +the embedded language. + +QUERY can also be a function that takes two arguments, START and +END, and sets the range for parsers. The function only needs to +ensure ranges between START and END are correct. If QUERY is a +function, it doesn't need to have keywords before it. 
+ +\(:KEYWORD VALUE QUERY...)" + (let (host embed result) + (while args + (pcase (pop args) + (:host (let ((host-lang (pop args))) + (unless (symbolp host-lang) + (signal 'treesit-error (list "Value of :host option should be a symbol" host-lang))) + (setq host host-lang))) + (:embed (let ((embed-lang (pop args))) + (unless (symbolp embed-lang) + (signal 'treesit-error (list "Value of :embed option should be a symbol" embed-lang))) + (setq embed embed-lang))) + (query (if (functionp query) + (push (list query nil nil) result) + (when (null embed) + (signal 'treesit-error (list "Value of :embed option cannot be omitted"))) + (when (null host) + (signal 'treesit-error (list "Value of :host option cannot be omitted"))) + (push (list (treesit-query-compile host query) + embed host) + result)) + (setq host nil embed nil)))) + (nreverse result))) + +(defun treesit--merge-ranges (old-ranges new-ranges start end) + "Merge OLD-RANGES and NEW-RANGES. +Each range is a list of cons of the form (BEG . END). When +merging the two ranges, if a range in OLD-RANGES intersects with +another range in NEW-RANGES, discard the one in OLD-RANGES and +keep the one in NEW-RANGES. Also discard any range in OLD-RANGES +that intersects the region marked by START and END. + +Return the merged range list." + (let ((result nil)) + (while (and old-ranges new-ranges) + (let ((new-beg (caar new-ranges)) + (new-end (cdar new-ranges)) + (old-beg (caar old-ranges)) + (old-end (cdar old-ranges))) + (cond + ;; Old range intersects with START-END, discard. + ((and (< start old-end) + (< old-beg end)) + (setq old-ranges (cdr old-ranges))) + ;; New range and old range don't intersect, new comes + ;; before, push new. + ((<= new-end old-beg) + (push (car new-ranges) result) + (setq new-ranges (cdr new-ranges))) + ;; New range and old range don't intersect, old comes + ;; before, push old. 
+ ((<= old-end new-beg) + (push (car old-ranges) result) + (setq old-ranges (cdr old-ranges))) + (t ;; New and old range intersect, discard old. + (setq old-ranges (cdr old-ranges)))))) + (let ((left-over (or new-ranges old-ranges))) + (dolist (range left-over) + (push range result))) + (nreverse result))) + +(defun treesit-update-ranges (&optional beg end) "Update the ranges for each language in the current buffer. -Calls each range functions in `treesit-range-functions' -in-order. START and END are passed to each range function." - (dolist (range-fn treesit-range-functions) - (funcall range-fn (or start (point-min)) (or end (point-max))))) +If BEG and END are not omitted, only update parser ranges in that +region." + ;; When updating ranges, we want to avoid querying the whole buffer + ;; which could be slow in very large buffers. Instead, we only + ;; query for nodes that intersect with the region between BEG and + ;; END. And we only update the ranges intersecting BEG and END, + ;; outside of that region we inherit old ranges. + (dolist (setting treesit-range-settings) + (let ((query (nth 0 setting)) + (language (nth 1 setting)) + (beg (or beg (point-min))) + (end (or end (point-max)))) + (if (functionp query) (funcall query beg end) + (let* ((host-lang (treesit-query-language query)) + (parser (treesit-parser-create language)) + (old-ranges (treesit-parser-included-ranges parser)) + (new-ranges (treesit-query-range + host-lang query beg end)) + (set-ranges (treesit--merge-ranges + old-ranges new-ranges beg end))) + (treesit-parser-set-included-ranges parser set-ranges)))))) (defun treesit-parser-range-on (parser beg &optional end) "Check if PARSER's range covers the portion between BEG and END. @@ -469,9 +575,8 @@ t, nil, append, prepend, keep. See more in "Return a value suitable for `treesit-font-lock-settings'. Take a series of QUERIES in either string, s-expression or -compiled form. 
Same as in `treesit-font-lock-settings', for each -query, captured nodes are highlighted with the capture name as -its face. +compiled form. For each query, captured nodes are highlighted +with the capture name as its face. Before each QUERY there could be :KEYWORD VALUE pairs that configure the query (and only that query). For example, @@ -1065,7 +1170,8 @@ Return (ANCHOR . OFFSET). This function is used by (defun treesit-indent () "Indent according to the result of `treesit-indent-function'." - (treesit-update-ranges) + (treesit-update-ranges (line-beginning-position) + (line-end-position)) ;; We don't return 'noindent even if no rules match, because ;; `indent-for-tab-command' tries to indent itself when we return ;; 'noindent, which leads to wrong indentation at times. @@ -1092,7 +1198,7 @@ reparse after indenting every single line.") (defun treesit-indent-region (beg end) "Indent the region between BEG and END. Similar to `treesit-indent', but indent a region instead." - (treesit-update-ranges) + (treesit-update-ranges beg end) (let* ((meta-len 2) (vector-len (* meta-len treesit--indent-region-batch-size)) ;; This vector saves the indent meta for each line in the diff --git a/test/src/treesit-tests.el b/test/src/treesit-tests.el index 56ac51ff994..5e4aea3ad41 100644 --- a/test/src/treesit-tests.el +++ b/test/src/treesit-tests.el @@ -321,6 +321,9 @@ visible_end.)" (setq parser (treesit-parser-create 'json)) (setq root-node (treesit-parser-root-node parser))) + + (should (eq (treesit-parser-included-ranges parser) nil)) + (should-error (treesit-parser-set-included-ranges parser '((1 . 6) (5 . 20))) @@ -333,6 +336,32 @@ visible_end.)" (should (equal "(document (array (array (number)) (array (number) (number) (number)) (array (number) (number))))" (treesit-node-string (treesit-parser-root-node parser)))) + + (treesit-parser-set-included-ranges parser nil) + (should (eq (treesit-parser-included-ranges parser) nil)) + + ;; `treesit--merge-ranges'. 
+ (let ((old-ranges '((1 . 10) ; (1) -- before (a) + (20 . 30); (2) -- intersect with (b) + (42 . 46) (47 . 48) ; (3) -- inside (c) + (55 . 65) (70 . 75) ; (4) -- intersect start-end + (80 . 90) ; (4) + )) + (new-ranges '((10 . 15) ; (a) + (18 . 25) (26 . 28) ; (b) + (40 . 50) ; (c) + (90 . 100) ; (d) -- after (4) + )) + (result '((1 . 10) ; (1) + (10 . 15) ; (a) + (18 . 25) (26 . 28) ; (b) + (40 . 50) ; (c) + (80 . 90) ; (4) + (90 . 100) ; (d) + ))) + (should (equal (treesit--merge-ranges + old-ranges new-ranges 60 75) + result))) ;; TODO: More tests. )))