From: Eshel Yaron Date: Sun, 15 Sep 2024 07:33:34 +0000 (+0200) Subject: ; Sync treesit.[ch] with upstream X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=a00dd6ffe7f283b1a8f8536a2a33f668460d185b;p=emacs.git ; Sync treesit.[ch] with upstream --- diff --git a/src/treesit.c b/src/treesit.c index 326015cd7aa..3790f5046c1 100644 --- a/src/treesit.c +++ b/src/treesit.c @@ -504,7 +504,7 @@ treesit_debug_print_parser_list (char *msg, Lisp_Object parser) SSDATA (SYMBOL_NAME (Vthis_command)), SSDATA (SYMBOL_NAME (XTS_PARSER (parser)->language_symbol)), buf_name, BUF_BEG (buf), - BUF_BEGV (buf), BUF_Z (buf), BUF_ZV (buf)); + BUF_BEGV (buf), BUF_ZV (buf), BUF_Z (buf)); Lisp_Object tail = BVAR (buf, ts_parser_list); FOR_EACH_TAIL (tail) @@ -968,6 +968,9 @@ treesit_record_change (ptrdiff_t start_byte, ptrdiff_t old_end_byte, } } +static TSRange *treesit_make_ts_ranges (Lisp_Object, Lisp_Object, + uint32_t *); + /* Comment (ref:visible-beg-null) The purpose of visible_beg/end is to keep track of "which part of the buffer does the tree-sitter tree see", in order to update the tree correctly. Visible_beg/end have @@ -1094,8 +1097,87 @@ treesit_sync_visible_region (Lisp_Object parser) XTS_PARSER (parser)->visible_beg = visible_beg; XTS_PARSER (parser)->visible_end = visible_end; + + /* Fix ranges so that the ranges stay within visible_end. Here we + try to do minimal work so that the ranges are minimally correct and + there's no OOB error. Usually treesit-update-ranges should update + the parser with semantically correct ranges. + + We start with the charpos ranges, because for bytepos ranges, after + user edits, the ranges start/end might end up inside a multibyte + char! See (ref:bytepos-range-pitfall) below. 
*/ + Lisp_Object lisp_ranges = XTS_PARSER (parser)->last_set_ranges; + if (NILP (lisp_ranges)) return; + + Lisp_Object new_ranges_head = lisp_ranges; + + FOR_EACH_TAIL_SAFE (lisp_ranges) + { + Lisp_Object range = XCAR (lisp_ranges); + ptrdiff_t beg = XFIXNUM (XCAR (range)); + ptrdiff_t end = XFIXNUM (XCDR (range)); + + if (end <= visible_beg) + /* Even the end is before visible_beg, discard this range. */ + new_ranges_head = XCDR (new_ranges_head); + else if (beg >= visible_end) + { + /* Even the beg is after visible_end, discard this range and all + the ranges after it. */ + XSETCDR (range, Qnil); + break; + } + else + { + /* At this point, the range overlaps with the visible portion of + the buffer in some way (in front / in back / completely + encased / completely encases). */ + if (beg < visible_beg) + XSETCAR (range, make_fixnum (visible_beg)); + if (end > visible_end) + XSETCDR (range, make_fixnum (visible_end)); + } + } + + XTS_PARSER (parser)->last_set_ranges = new_ranges_head; + + if (NILP (new_ranges_head)) + { + bool success; + success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser, + NULL, 0); + eassert (success); + } + else + { + uint32_t len = 0; + TSRange *ts_ranges = treesit_make_ts_ranges (new_ranges_head, parser, + &len); + bool success; + success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser, + ts_ranges, len); + xfree (ts_ranges); + eassert (success); + } } +/* (ref:bytepos-range-pitfall) Suppose we have the following buffer + content ([ ] is a unibyte char, [ ] is a multibyte char): + + [a][b][c][d][e][ f ] + + and the following ranges (denoted by braces): + + [a][b][c][d][e][ f ] + { }{ } + + So far so good, now user deletes a unibyte char at the beginning: + + [b][c][d][e][ f ] + { }{ } + + Oops, now our range cuts into the multibyte char, bad! 
*/ + static void treesit_check_buffer_size (struct buffer *buffer) { @@ -1106,7 +1188,8 @@ treesit_check_buffer_size (struct buffer *buffer) make_fixnum (buffer_size_bytes)); } -static Lisp_Object treesit_make_ranges (const TSRange *, uint32_t, struct buffer *); +static Lisp_Object treesit_make_ranges (const TSRange *, uint32_t, + Lisp_Object, struct buffer *); static void treesit_call_after_change_functions (TSTree *old_tree, TSTree *new_tree, @@ -1120,7 +1203,7 @@ treesit_call_after_change_functions (TSTree *old_tree, TSTree *new_tree, { uint32_t len; TSRange *ranges = ts_tree_get_changed_ranges (old_tree, new_tree, &len); - lisp_ranges = treesit_make_ranges (ranges, len, buf); + lisp_ranges = treesit_make_ranges (ranges, len, parser, buf); xfree (ranges); } else @@ -1147,6 +1230,9 @@ treesit_call_after_change_functions (TSTree *old_tree, TSTree *new_tree, static void treesit_ensure_parsed (Lisp_Object parser) { + if (XTS_PARSER (parser)->within_reparse) return; + XTS_PARSER (parser)->within_reparse = true; + struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer); /* Before we parse, catch up with the narrowing situation. */ @@ -1155,10 +1241,11 @@ treesit_ensure_parsed (Lisp_Object parser) because it might set the flag to true. */ treesit_sync_visible_region (parser); - /* Make sure this comes before everything else, see comment - (ref:notifier-inside-ensure-parsed) for more detail. 
*/ if (!XTS_PARSER (parser)->need_reparse) - return; + { + XTS_PARSER (parser)->within_reparse = false; + return; + } TSParser *treesit_parser = XTS_PARSER (parser)->parser; TSTree *tree = XTS_PARSER (parser)->tree; @@ -1183,14 +1270,10 @@ treesit_ensure_parsed (Lisp_Object parser) XTS_PARSER (parser)->need_reparse = false; XTS_PARSER (parser)->timestamp++; - /* After-change functions should run at the very end, most crucially - after need_reparse is set to false, this way if the function - calls some tree-sitter function which invokes - treesit_ensure_parsed again, it returns early and do not - recursively call the after change functions again. - (ref:notifier-inside-ensure-parsed) */ treesit_call_after_change_functions (tree, new_tree, parser); ts_tree_delete (tree); + + XTS_PARSER (parser)->within_reparse = false; } /* This is the read function provided to tree-sitter to read from a @@ -1231,7 +1314,12 @@ treesit_read_buffer (void *parser, uint32_t byte_index, beg = NULL; len = 0; } - /* Normal case, read a character. */ + /* Normal case, read a character. We can't give tree-sitter the + whole buffer range because we move the gap around, realloc the + buffer, etc; and there's no way to invalidate the previously + given range in tree-sitter. Moreover, benchmarks show there's + very little difference between passing a whole chunk vs passing a + single char at once. The only cost is funcall I guess. */ else { beg = (char *) BUF_BYTE_ADDRESS (buffer, byte_pos); @@ -1274,6 +1362,7 @@ make_treesit_parser (Lisp_Object buffer, TSParser *parser, lisp_parser->timestamp = 0; lisp_parser->deleted = false; lisp_parser->need_to_gc_buffer = false; + lisp_parser->within_reparse = false; eassert (lisp_parser->visible_beg <= lisp_parser->visible_end); return make_lisp_ptr (lisp_parser, Lisp_Vectorlike); } @@ -1765,14 +1854,14 @@ treesit_check_range_argument (Lisp_Object ranges) convert between tree-sitter buffer offset and buffer position. 
*/ static Lisp_Object treesit_make_ranges (const TSRange *ranges, uint32_t len, - struct buffer *buffer) + Lisp_Object parser, struct buffer *buffer) { Lisp_Object list = Qnil; for (int idx = 0; idx < len; idx++) { TSRange range = ranges[idx]; - uint32_t beg_byte = range.start_byte + BUF_BEGV_BYTE (buffer); - uint32_t end_byte = range.end_byte + BUF_BEGV_BYTE (buffer); + uint32_t beg_byte = range.start_byte + XTS_PARSER (parser)->visible_beg; + uint32_t end_byte = range.end_byte + XTS_PARSER (parser)->visible_beg; eassert (BUF_BEGV_BYTE (buffer) <= beg_byte); eassert (beg_byte <= end_byte); eassert (end_byte <= BUF_ZV_BYTE (buffer)); @@ -1785,6 +1874,48 @@ treesit_make_ranges (const TSRange *ranges, uint32_t len, return Fnreverse (list); } +/* Convert lisp ranges to tree-sitter ranges. Set LEN to the length of + the ranges. RANGES must be a valid ranges list, (cons of numbers, no + overlap, etc). PARSER must be a parser. This function doesn't check + for types. Caller must free the returned ranges. */ +static TSRange * +treesit_make_ts_ranges (Lisp_Object ranges, Lisp_Object parser, uint32_t *len) +{ + ptrdiff_t ranges_len = list_length (ranges); + if (ranges_len > UINT32_MAX) + xsignal (Qargs_out_of_range, list2 (ranges, Flength (ranges))); + + *len = (uint32_t) ranges_len; + TSRange *treesit_ranges = xmalloc (sizeof (TSRange) * ranges_len); + + struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer); + + for (int idx = 0; idx < ranges_len; idx++, ranges = XCDR (ranges)) + { + Lisp_Object range = XCAR (ranges); + ptrdiff_t beg_byte = buf_charpos_to_bytepos (buffer, + XFIXNUM (XCAR (range))); + ptrdiff_t end_byte = buf_charpos_to_bytepos (buffer, + XFIXNUM (XCDR (range))); + + /* Shouldn't violate assertion since we just checked for + buffer size at the beginning of this function. */ + eassert (beg_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX); + eassert (end_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX); + + /* We don't care about points, put in dummy values. 
*/ + TSRange rg = + { + {0, 0}, {0, 0}, + (uint32_t) beg_byte - XTS_PARSER (parser)->visible_beg, + (uint32_t) end_byte - XTS_PARSER (parser)->visible_beg + }; + treesit_ranges[idx] = rg; + } + + return treesit_ranges; +} + DEFUN ("treesit-parser-set-included-ranges", Ftreesit_parser_set_included_ranges, Streesit_parser_set_included_ranges, @@ -1818,42 +1949,14 @@ buffer. */) if (NILP (ranges)) { /* If RANGES is nil, make parser to parse the whole document. - To do that we give tree-sitter a 0 length, the range is a - dummy. */ - TSRange treesit_range = {{0, 0}, {0, 0}, 0, 0}; + To do that we give tree-sitter a 0 length. */ success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser, - &treesit_range , 0); + NULL , 0); } else { - /* Set ranges for PARSER. */ - if (list_length (ranges) > UINT32_MAX) - xsignal (Qargs_out_of_range, list2 (ranges, Flength (ranges))); - uint32_t len = (uint32_t) list_length (ranges); - TSRange *treesit_ranges = xmalloc (sizeof (TSRange) * len); - struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer); - - /* We can use XFIXNUM, XCAR, XCDR freely because we have checked - the input by treesit_check_range_argument. */ - - for (int idx = 0; !NILP (ranges); idx++, ranges = XCDR (ranges)) - { - Lisp_Object range = XCAR (ranges); - ptrdiff_t beg_byte = buf_charpos_to_bytepos (buffer, - XFIXNUM (XCAR (range))); - ptrdiff_t end_byte = buf_charpos_to_bytepos (buffer, - XFIXNUM (XCDR (range))); - /* Shouldn't violate assertion since we just checked for - buffer size at the beginning of this function. */ - eassert (beg_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX); - eassert (end_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX); - /* We don't care about start and end points, put in dummy - values. 
*/ - TSRange rg = {{0, 0}, {0, 0}, - (uint32_t) beg_byte - BUF_BEGV_BYTE (buffer), - (uint32_t) end_byte - BUF_BEGV_BYTE (buffer)}; - treesit_ranges[idx] = rg; - } + uint32_t len = 0; + TSRange *treesit_ranges = treesit_make_ts_ranges (ranges, parser, &len); success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser, treesit_ranges, len); xfree (treesit_ranges); @@ -1880,25 +1983,9 @@ See also `treesit-parser-set-included-ranges'. */) treesit_check_parser (parser); treesit_initialize (); - /* When the parser doesn't have a range set and we call - ts_parser_included_ranges on it, it doesn't return an empty list, - but rather return DEFAULT_RANGE. (A single range where start_byte - = 0, end_byte = UINT32_MAX). So we need to track whether the - parser is ranged ourselves. */ - if (NILP (XTS_PARSER (parser)->last_set_ranges)) - return Qnil; - - uint32_t len; - const TSRange *ranges - = ts_parser_included_ranges (XTS_PARSER (parser)->parser, &len); - - /* Our return value depends on the buffer state (BUF_BEGV_BYTE, - etc), so we need to sync up. */ - treesit_check_buffer_size (XBUFFER (XTS_PARSER (parser)->buffer)); treesit_sync_visible_region (parser); - struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer); - return treesit_make_ranges (ranges, len, buffer); + return XTS_PARSER (parser)->last_set_ranges; } DEFUN ("treesit-parser-notifiers", Ftreesit_parser_notifiers, diff --git a/src/treesit.h b/src/treesit.h index e20611eac67..b90f9134542 100644 --- a/src/treesit.h +++ b/src/treesit.h @@ -45,9 +45,23 @@ struct Lisp_TS_Parser same tag. A tag is primarily used to differentiate between parsers for the same language. */ Lisp_Object tag; - /* The Lisp ranges last set. This is use to compare to the new - ranges the users wants to set, and avoid reparse if the new - ranges is the same as the last set one. */ + /* The Lisp ranges last set. 
One purpose for it is to compare to the + new ranges the user wants to set, and avoid reparse if the new + ranges are the same as the current one. Another purpose is to store + the ranges in charpos (ts api returns ranges in bytepos). We need + to use charpos so we don't end up having a range cut into a + multibyte character. (See (ref:bytepos-range-pitfall) in treesit.c + for more detail.) + + treesit-parser-set-included-ranges sets this field; + treesit-parser-included-ranges directly returns this field, and + before each reparse, treesit_sync_visible_region uses this to + calculate a range for the parser that fits in the visible region. + + Trivia: when the parser doesn't have a range set and we call + ts_parser_included_ranges on it, it doesn't return an empty list, + but rather returns DEFAULT_RANGE. (A single range where start_byte + = 0, end_byte = UINT32_MAX). */ Lisp_Object last_set_ranges; /* The buffer associated with this parser. */ Lisp_Object buffer;