feat(cli)!: implement treesitter-based chunking.

Davidyz · Davidyz · commit ca4f347e047d · 2025-03-15T21:40:29.000Z
diff --git a/README.md b/README.md
@@ -56,10 +56,12 @@ and chat plugin available on VSCode and JetBrain products.
 
 ## TODOs
 - [x] query by ~file path~ excluded paths;
-- [ ] chunking support;
+- [x] chunking support;
   - [x] add metadata for files;
   - [x] chunk-size configuration;
-  - [ ] smarter chunking (semantics/syntax based);
+  - [x] smarter chunking (semantics/syntax based), implemented with
+    [py-tree-sitter](https://github.com/tree-sitter/py-tree-sitter) and
+    [tree-sitter-language-pack](https://github.com/Goldziher/tree-sitter-language-pack);
   - [x] configurable document selection from query results.
 - [x] ~NeoVim Lua API with cache to skip the retrieval when a project has not
   been indexed~ Returns empty array instead;
diff --git a/docs/cli.md b/docs/cli.md
@@ -187,7 +187,15 @@ The JSON configuration file may hold the following values:
 - `overlap_ratio`: float between 0 and 1, the ratio of overlapping/shared content 
   between 2 adjacent chunks. A larger ratio improves the coherences of chunks,
   but at the cost of increasing number of entries in the database and hence
-  slowing down the search. Default: `0.2`;
+  slowing down the search. Default: `0.2`. _Starting from 0.4.11, VectorCode
+  will use treesitter to parse languages that it can automatically detect. It
+  uses [pygments](https://github.com/pygments/pygments) to guess the language
+  from filename, and 
+  [tree-sitter-language-pack](https://github.com/Goldziher/tree-sitter-language-pack) 
+  to fetch the correct parser. `overlap_ratio` has no effect when treesitter
+  works. If VectorCode fails to find an appropriate parser, it'll fall back to
+  the legacy naive parser, in which case `overlap_ratio` works exactly in the
+  same way as before;_
 - `query_multplier`: integer, when you use the `query` command to retrieve `n` documents,
   VectorCode will check `n * query_multplier` chunks and return at most `n` 
   documents. A larger value of `query_multplier`
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,9 @@ dependencies = [
     "numpy",
     "psutil",
     "httpx",
+    "tree-sitter",
+    "tree-sitter-language-pack",
+    "pygments",
 ]
 requires-python = ">=3.11,<3.14"
 readme = "README.md"
diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py
@@ -1,6 +1,14 @@
+import os
 from abc import abstractmethod
+from functools import cache
 from io import TextIOWrapper
-from typing import Generator
+from typing import Generator, Optional
+
+from pygments.lexer import Lexer
+from pygments.lexers import guess_lexer_for_filename
+from pygments.util import ClassNotFound
+from tree_sitter import Node
+from tree_sitter_language_pack import get_parser
 
 
 class ChunkerBase:
@@ -59,3 +67,110 @@ def chunk(self, data: TextIOWrapper) -> Generator[str, None, None]:
                 yield output
                 if len(new_chars) < step_size:
                     return
+
+
+class TreeSitterChunker(ChunkerBase):
+    """Chunk a source file along the boundaries of its tree-sitter syntax tree.
+
+    The language is guessed with pygments and the matching parser is fetched
+    from tree-sitter-language-pack. When no parser can be found, the content is
+    chunked by ``StringChunker`` instead.
+    """
+
+    def __init__(self, chunk_size: int = -1, overlap_ratio: float = 0.2):
+        """
+        chunk_size: maximum number of characters in a chunk. On the treesitter
+            path a non-positive value means the file is not split at all.
+        overlap_ratio: only forwarded to the fallback ``StringChunker``.
+        """
+        super().__init__()
+        assert isinstance(chunk_size, int), "chunk_size parameter must be an integer"
+        assert 0 <= overlap_ratio < 1, (
+            "Overlap ratio has to be a float between 0 (inclusive) and 1 (exclusive)."
+        )
+        self.__chunk_size = chunk_size
+        self.__overlap_ratio = overlap_ratio
+
+    def __chunk_node(self, node: Node, source: bytes) -> Generator[str, None, None]:
+        """
+        Yield chunks of at most `chunk_size` characters for the children of `node`.
+
+        node: syntax node whose children are packed into chunks.
+        source: the UTF-8 encoded text that the tree was parsed from. Node
+            offsets are byte offsets, so they must index the bytes, not the str.
+        """
+        limit = self.__chunk_size
+        span_start: Optional[int] = None  # byte range of the chunk being built
+        span_end = 0
+        for child in node.children:
+            if child.end_byte <= child.start_byte:
+                # zero-width node, contributes no text
+                continue
+            child_text = source[child.start_byte : child.end_byte].decode()
+            if len(child_text) > limit:
+                if span_start is not None:
+                    yield source[span_start:span_end].decode()
+                    span_start = None
+                if child.children:
+                    yield from self.__chunk_node(child, source)
+                else:
+                    # an oversized leaf cannot be split by syntax, so cut it by
+                    # length instead of silently dropping its text
+                    for idx in range(0, len(child_text), limit):
+                        yield child_text[idx : idx + limit]
+                continue
+            if span_start is not None:
+                # measure the span including the text between the siblings so
+                # that whitespace and newlines are kept in the chunk
+                if len(source[span_start : child.end_byte].decode()) <= limit:
+                    span_end = child.end_byte
+                    continue
+                yield source[span_start:span_end].decode()
+            span_start, span_end = child.start_byte, child.end_byte
+        if span_start is not None:
+            yield source[span_start:span_end].decode()
+
+    def __guess_type(self, path: str, content: str) -> Optional[Lexer]:
+        """
+        Return the pygments lexer guessed for the file, or None if there is none.
+
+        Not memoised: a cache on a method would keep every instance and every
+        file content alive for the lifetime of the process.
+        """
+        try:
+            return guess_lexer_for_filename(path, content)
+        except ClassNotFound:
+            return None
+
+    def chunk(self, data: str) -> Generator[str, None, None]:
+        """
+        data: path to the file
+        """
+        assert os.path.isfile(data)
+        with open(data) as fin:
+            content = fin.read()
+        parser = None
+        lexer = self.__guess_type(data, content)
+        if lexer is not None:
+            # try the canonical language name first, then the aliases
+            for name in [lexer.name, *lexer.aliases]:
+                try:
+                    parser = get_parser(name.lower())
+                    break
+                except LookupError:
+                    # no parser is registered under this name
+                    pass
+
+        if parser is None:
+            # fall back to naive chunking
+            yield from StringChunker(self.__chunk_size, self.__overlap_ratio).chunk(
+                content
+            )
+        elif self.__chunk_size <= 0:
+            # no usable size limit: emit the file as one chunk (if not empty)
+            if content:
+                yield content
+        else:
+            content_bytes = content.encode()
+            tree = parser.parse(content_bytes)
+            yield from self.__chunk_node(tree.root_node, content_bytes)
diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py
@@ -13,7 +13,7 @@
 from chromadb.api.models.AsyncCollection import AsyncCollection
 from chromadb.api.types import IncludeEnum
 
-from vectorcode.chunking import FileChunker
+from vectorcode.chunking import TreeSitterChunker
 from vectorcode.cli_utils import Config, expand_globs, expand_path
 from vectorcode.common import get_client, get_collection, verify_ef
 
@@ -54,24 +54,23 @@ async def chunked_add(
 
     try:
         async with semaphore:
-            with open(full_path_str) as fin:
-                chunks = list(
-                    FileChunker(configs.chunk_size, configs.overlap_ratio).chunk(fin)
+            chunks = list(
+                TreeSitterChunker(configs.chunk_size, configs.overlap_ratio).chunk(
+                    full_path_str
                 )
-                if len(chunks) == 0 or (len(chunks) == 1 and chunks[0] == ""):
-                    # empty file
-                    return
-                chunks.append(str(os.path.relpath(full_path_str, configs.project_root)))
-                async with collection_lock:
-                    for idx in range(0, len(chunks), max_batch_size):
-                        inserted_chunks = chunks[idx : idx + max_batch_size]
-                        await collection.add(
-                            ids=[get_uuid() for _ in inserted_chunks],
-                            documents=inserted_chunks,
-                            metadatas=[
-                                {"path": full_path_str} for _ in inserted_chunks
-                            ],
-                        )
+            )
+            if len(chunks) == 0 or (len(chunks) == 1 and chunks[0] == ""):
+                # empty file
+                return
+            chunks.append(str(os.path.relpath(full_path_str, configs.project_root)))
+            async with collection_lock:
+                for idx in range(0, len(chunks), max_batch_size):
+                    inserted_chunks = chunks[idx : idx + max_batch_size]
+                    await collection.add(
+                        ids=[get_uuid() for _ in inserted_chunks],
+                        documents=inserted_chunks,
+                        metadatas=[{"path": full_path_str} for _ in inserted_chunks],
+                    )
     except UnicodeDecodeError:
         # probably binary. skip it.
         return
diff --git a/tests/test_chunking.py b/tests/test_chunking.py
@@ -1,4 +1,7 @@
-from vectorcode.chunking import FileChunker, StringChunker
+import os
+import tempfile
+
+from vectorcode.chunking import FileChunker, StringChunker, TreeSitterChunker
 
 
 class TestChunking:
@@ -46,3 +49,53 @@ def test_file_chunker(self):
         )
         for string_chunk, file_chunk in zip(string_chunks, file_chunks):
             assert string_chunk == file_chunk
+
+
+def test_treesitter_chunker():
+    """Test TreeSitterChunker with a sample file using tempfile."""
+    chunker = TreeSitterChunker(chunk_size=30)
+    test_content = r"""
+def foo():
+    return "foo"
+
+def bar():
+    return "bar"
+    """
+
+    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".py") as tmp_file:
+        tmp_file.write(test_content)
+        test_file = tmp_file.name
+
+    # clean up in `finally` so a failing assertion does not leak the temp file
+    try:
+        chunks = list(chunker.chunk(test_file))
+        assert len(chunks) == 2
+        assert all(len(i) <= 30 for i in chunks)
+    finally:
+        os.remove(test_file)
+
+
+def test_treesitter_chunker_fallback():
+    """Test that TreeSitterChunker falls back to StringChunker when no parser is found."""
+    chunk_size = 30
+    overlap_ratio = 0.2
+    tree_sitter_chunker = TreeSitterChunker(
+        chunk_size=chunk_size, overlap_ratio=overlap_ratio
+    )
+    string_chunker = StringChunker(chunk_size=chunk_size, overlap_ratio=overlap_ratio)
+
+    test_content = "This is a test string."
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", delete=False, suffix=".xyz"
+    ) as tmp_file:  # Use an uncommon extension
+        tmp_file.write(test_content)
+        test_file = tmp_file.name
+
+    # clean up in `finally` so a failing assertion does not leak the temp file
+    try:
+        tree_sitter_chunks = list(tree_sitter_chunker.chunk(test_file))
+        string_chunks = list(string_chunker.chunk(test_content))
+        assert tree_sitter_chunks == string_chunks
+    finally:
+        os.remove(test_file)