Skip to content

Commit dbe1abc

Browse files
Jufralice and Davidyz authored
fix(chunking): Fallback to StringChunker for Tree-sitter nodes with no children (#145)
* fix(chunking): Fallback to StringChunker for Tree-sitter nodes with no children

  When a Tree-sitter node has no children, the TreeSitterChunker previously did not yield any chunks for its content. This change adds a check for nodes with no children and falls back to using the StringChunker on the node's text, ensuring the content is processed.

* test and cov

---------

Co-authored-by: Zhe Yu <zcabzyu@ucl.ac.uk>
1 parent d899df3 commit dbe1abc

File tree

2 files changed

+27
-1
lines changed

2 files changed

+27
-1
lines changed

src/vectorcode/chunking.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ def __init__(self, config: Optional[Config] = None):
141141
if config is None:
142142
config = Config()
143143
super().__init__(config)
144+
self._fallback_chunker = StringChunker(config)
144145

145146
def __chunk_node(
146147
self, node: Node, text_bytes: bytes
@@ -153,6 +154,12 @@ def __chunk_node(
153154
prev_node = None
154155
current_start = None
155156

157+
logger.debug("nbr children: %s", len(node.children))
158+
# if node has no children we fallback to the string chunker
159+
if len(node.children) == 0 and node.text:
160+
logger.debug("No children, falling back to string chunker")
161+
yield from self._fallback_chunker.chunk(node.text.decode())
162+
156163
for child in node.children:
157164
child_bytes = text_bytes[child.start_byte : child.end_byte]
158165
child_text = child_bytes.decode()
@@ -307,7 +314,7 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
307314
logger.debug(
308315
"Unable to pick a suitable parser. Fall back to naive chunking"
309316
)
310-
yield from StringChunker(self.config).chunk(content)
317+
yield from self._fallback_chunker.chunk(content)
311318
else:
312319
pattern_str = self.__build_pattern(language=language)
313320
content_bytes = content.encode()

tests/test_chunking.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import tempfile
3+
from unittest.mock import MagicMock
34

45
import pytest
56
from tree_sitter import Point
@@ -159,6 +160,24 @@ def bar():
159160
os.remove(test_file)
160161

161162

163+
def test_treesitter_chunker_fallback_on_long_node():
164+
test_content = r"""
165+
def foo():
166+
return "a very very very very very long string"
167+
"""
168+
config = Config(chunk_size=15)
169+
with (
170+
tempfile.NamedTemporaryFile(
171+
mode="w", delete=False, suffix=".py"
172+
) as temp_py_file,
173+
):
174+
temp_py_file.write(test_content)
175+
ts_chunker = TreeSitterChunker(config)
176+
ts_chunker._fallback_chunker.chunk = MagicMock()
177+
list(ts_chunker.chunk(temp_py_file.name))
178+
ts_chunker._fallback_chunker.chunk.assert_called_once()
179+
180+
162181
def test_treesitter_chunker_python_encoding():
163182
"""Test TreeSitterChunker with a sample file using tempfile."""
164183
chunker = TreeSitterChunker(Config(chunk_size=30, encoding="gbk"))

0 commit comments

Comments
 (0)