From e171ef933feefd67d7f1b3b3693ce730111660e9 Mon Sep 17 00:00:00 2001 From: Yuan Fu Date: Tue, 14 Jun 2022 11:36:22 -0700 Subject: [PATCH] Support compiled queries in treesit-query-capture Last commit added this new type, this commit adds functionalities. treesit.el only has documentation changes. * lisp/treesit.el (treesit-query-in, treesit-font-lock-settings, treesit-defun-query): Update docstring. * src/treesit.c (make_ts_query): New function. (Ftreesit_query_compile): New function. (Ftreesit_query_capture): Remove code that creates a query object and instead either use make_ts_query or use the give compiled query. Free the query object conditonally. (syms_of_treesit): New symbol. --- lisp/treesit.el | 21 +++++-- src/treesit.c | 143 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 134 insertions(+), 30 deletions(-) diff --git a/lisp/treesit.el b/lisp/treesit.el index 78dfcae7e56..09f750f9d52 100644 --- a/lisp/treesit.el +++ b/lisp/treesit.el @@ -366,9 +366,12 @@ language symbol, use the root node of the first parser for that language; if a parser, use the root node of that parser; if a node, use that node. -QUERY is either a string query or a sexp query. See Info node -`(elisp)Pattern Matching' for how to write a query pattern in either -string or s-expression form. +QUERY is either a string query, a sexp query, or a compiled +query. See Info node `(elisp)Pattern Matching' for how to write +a query in either string or s-expression form. When using +repeatedly, a compiled query is much faster than a string or sexp +one, so it is recommend to compile your queries if it will be +used over and over. BEG and END, if _both_ non-nil, specifies the range in which the query is executed. @@ -442,8 +445,12 @@ Each SETTING controls one parser (often of different language). LANGUAGE is the language symbol. See Info node `(elisp)Language Definitions'. -QUERY is either a string query or a sexp query. -See Info node `(elisp)Pattern Matching' for writing queries. +QUERY is either a string query, a sexp query, or a compiled +query. See Info node `(elisp)Pattern Matching' for how to write +a query in either string or s-expression form. When using +repeatedly, a compiled query is much faster than a string or sexp +one, so it is recommend to compile your queries if it will be +used over and over. Capture names in QUERY should be face names like `font-lock-keyword-face'. The captured node will be fontified @@ -923,7 +930,9 @@ search failed." (defvar-local treesit-defun-query nil "A tree-sitter query that matches function/class definitions. Capture names don't matter. This variable is used by navigation -functions like `treesit-beginning-of-defun'.") +functions like `treesit-beginning-of-defun'. + +It is recommended to use compiled query for this variable.") (defun treesit-beginning-of-defun (&optional arg) "Move backward to the beginning of a defun. diff --git a/src/treesit.c b/src/treesit.c index 3c8edc92131..5b344a2ea1c 100644 --- a/src/treesit.c +++ b/src/treesit.c @@ -88,6 +88,28 @@ along with GNU Emacs. If not, see . */ parser of buffer changes. - lisp/emacs-lisp/cl-preloaded.el & data.c & lisp.h for parser and node type. + + Because it is pretty slow (comparing to other tree-sitter + operations) for tree-sitter to parse the query and produce a query + object, it is very wasteful to reparse the query every time + treesit-query-capture is called, and it completely kills the + performance of querying in a loop for a moderate amount of times + (hundreds of queries takes seconds rather than milliseconds to + complete). Therefore we want some caching. We can either use a + search.c style transparent caching, or simply expose a new type, + compiled-ts-query and let the user to manually compile AOT. I + believe AOT compiling gives users more control, makes the + performance stable and easy to understand (compiled -> fast, + uncompiled -> slow), and avoids some edge cases transparent cache + could have (see below). So I implemented the AOT compilation. + + Problems a transparent cache could have: Suppose we store cache + entries in a fixed-length linked-list, and compare with EQ. 1) + One-off query could kick out useful cache. 2) if the user messed + up and the query doesn't EQ to the cache anymore, the performance + mysteriously drops. 3) what if a user uses so many stuff that the + default cache size (20) is not enough and we end up thrashing? + These are all imagined scenarios but they are not impossible :-) */ /*** Initialization */ @@ -536,6 +558,31 @@ make_ts_node (Lisp_Object parser, TSNode node) return make_lisp_ptr (lisp_node, Lisp_Vectorlike); } +/* Make a compiled query struct. Return NULL if error occurs. QUERY + has to be either a cons or a string. */ +static struct Lisp_TS_Query * +make_ts_query (Lisp_Object query, const TSLanguage *language, + uint32_t *error_offset, TSQueryError *error_type) +{ + if (CONSP (query)) + query = Ftreesit_expand_query (query); + char *source = SSDATA (query); + + TSQuery *ts_query = ts_query_new (language, source, strlen (source), + error_offset, error_type); + TSQueryCursor *ts_cursor = ts_query_cursor_new (); + + if (ts_query == NULL) + return NULL; + + struct Lisp_TS_Query *lisp_query + = ALLOCATE_PLAIN_PSEUDOVECTOR (struct Lisp_TS_Query, + PVEC_TS_COMPILED_QUERY); + lisp_query->query = ts_query; + lisp_query->cursor = ts_cursor; + return lisp_query; +} + DEFUN ("treesit-parser-p", Ftreesit_parser_p, Streesit_parser_p, 1, 1, 0, doc: /* Return t if OBJECT is a tree-sitter parser. */) @@ -1467,6 +1514,39 @@ ts_eval_predicates return pass; } +DEFUN ("treesit-query-compile", + Ftreesit_query_compile, + Streesit_query_compile, 2, 2, 0, + doc: /* Compile QUERY to a compiled query. + +Querying a compiled query is much faster than an uncompiled one. +LANGUAGE is the language this query is for. + +Signals treesit-query-error if QUERY is malformed or something +else goes wrong. */) + (Lisp_Object language, Lisp_Object query) +{ + if (!Ftreesit_query_p (query)) + wrong_type_argument (Qtreesit_query_p, query); + CHECK_SYMBOL (language); + if (TS_COMPILED_QUERY_P (query)) + return query; + + TSLanguage *ts_lang = ts_load_language (language, true); + uint32_t error_offset; + TSQueryError error_type; + + struct Lisp_TS_Query *lisp_query + = make_ts_query (query, ts_lang, &error_offset, &error_type); + + if (lisp_query == NULL) + xsignal2 (Qtreesit_query_error, + build_string (ts_query_error_to_string (error_type)), + make_fixnum (error_offset + 1)); + + return make_lisp_ptr (lisp_query, Lisp_Vectorlike); +} + DEFUN ("treesit-query-capture", Ftreesit_query_capture, Streesit_query_capture, 2, 4, 0, @@ -1475,9 +1555,11 @@ DEFUN ("treesit-query-capture", Return a list of (CAPTURE_NAME . NODE). CAPTURE_NAME is the name assigned to the node in PATTERN. NODE is the captured node. -QUERY is either a string query or a sexp query. See Info node -`(elisp)Pattern Matching' for how to write a query in either string or -s-expression form. +QUERY is either a string query, a sexp query, or a compiled query. +See Info node `(elisp)Pattern Matching' for how to write a query in +either string or s-expression form. When using repeatedly, a compiled +query is much faster than a string or sexp one, so it is recommend to +compile your queries if it will be used over and over. BEG and END, if both non-nil, specifies the range in which the query is executed. @@ -1493,10 +1575,9 @@ else goes wrong. */) if (!NILP (end)) CHECK_INTEGER (end); - if (CONSP (query)) - query = Ftreesit_expand_query (query); - else - CHECK_STRING (query); + if (!(TS_COMPILED_QUERY_P (query) + || CONSP (query) || STRINGP (query))) + wrong_type_argument (Qtreesit_query_p, query); /* Extract C values from Lisp objects. */ TSNode ts_node = XTS_NODE (node)->node; @@ -1505,25 +1586,34 @@ else goes wrong. */) XTS_PARSER (XTS_NODE (node)->parser)->visible_beg; const TSLanguage *lang = ts_parser_language (XTS_PARSER (lisp_parser)->parser); - char *source = SSDATA (query); /* Initialize query objects, and execute query. */ - uint32_t error_offset; - TSQueryError error_type; - /* TODO: We could cache the query object, so that repeatedly - querying with the same query can reuse the query object. It also - saves us from expanding the sexp query into a string. I don't - know how much time that could save though. */ - TSQuery *ts_query = ts_query_new (lang, source, strlen (source), - &error_offset, &error_type); - TSQueryCursor *cursor = ts_query_cursor_new (); - - if (ts_query == NULL) + struct Lisp_TS_Query *lisp_query; + /* If the lisp query is temporary, we need to free it after use. */ + bool lisp_query_temp_p; + if (TS_COMPILED_QUERY_P (query)) { - xsignal2 (Qtreesit_query_error, - build_string (ts_query_error_to_string (error_type)), - make_fixnum (error_offset + 1)); + lisp_query_temp_p = false; + lisp_query = XTS_COMPILED_QUERY (query); } + else + { + lisp_query_temp_p = true; + uint32_t error_offset; + TSQueryError error_type; + lisp_query = make_ts_query (query, lang, + &error_offset, &error_type); + if (lisp_query == NULL) + { + xsignal2 (Qtreesit_query_error, + build_string + (ts_query_error_to_string (error_type)), + make_fixnum (error_offset + 1)); + } + } + TSQuery *ts_query = lisp_query->query; + TSQueryCursor *cursor = lisp_query->cursor; + if (!NILP (beg) && !NILP (end)) { EMACS_INT beg_byte = XFIXNUM (beg); @@ -1578,8 +1668,11 @@ else goes wrong. */) result = prev_result; } } - ts_query_delete (ts_query); - ts_query_cursor_delete (cursor); + if (lisp_query_temp_p) + { + ts_query_delete (ts_query); + ts_query_cursor_delete (cursor); + } return Fnreverse (result); } @@ -1592,6 +1685,7 @@ syms_of_treesit (void) DEFSYM (Qtreesit_parser_p, "treesit-parser-p"); DEFSYM (Qtreesit_node_p, "treesit-node-p"); DEFSYM (Qtreesit_compiled_query_p, "treesit-compiled-query-p"); + DEFSYM (Qtreesit_query_p, "treesit-query-p"); DEFSYM (Qnamed, "named"); DEFSYM (Qmissing, "missing"); DEFSYM (Qextra, "extra"); @@ -1705,5 +1799,6 @@ dynamic libraries, in that order. */); defsubr (&Streesit_expand_pattern); defsubr (&Streesit_expand_query); + defsubr (&Streesit_query_compile); defsubr (&Streesit_query_capture); } -- 2.39.5