Skip to content

Commit 36e1d18

Browse files
authored
feat(cli): Better parser selection for hybrid filetypes (#138)
Fallback to primary language name when tree_sitter_language_pack won't find a parser for 'hybrid' lexer language names (like 'javascript+genshitext')
2 parents 406f451 + e15967d commit 36e1d18

File tree

4 files changed

+185
-17
lines changed

4 files changed

+185
-17
lines changed

docs/cli.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,19 @@ The JSON configuration file may hold the following values:
324324
"hnsw:construction_ef": 100
325325
}
326326
```
327+
- `filetype_map`: `dict[str, list[str]]`, a dictionary where the keys are
328+
[language name](https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages)
329+
and values are lists of [Python regex patterns](https://docs.python.org/3/library/re.html)
330+
that will match file extensions. This allows overriding automatic language
331+
detection and specifying a treesitter parser for certain file types for which the language parser cannot be
332+
correctly identified (e.g., `.phtml` files containing both php and html).
333+
Example configuration:
334+
```json5
335+
"filetype_map": {
336+
"php": ["^phtml$"]
337+
}
338+
```
339+
327340
- `chunk_filters`: `dict[str, list[str]]`, a dictionary where the keys are
328341
[language name](https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages)
329342
and values are lists of [Python regex patterns](https://docs.python.org/3/library/re.html)

src/vectorcode/chunking.py

Lines changed: 53 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from typing import Generator, Optional
99

1010
from pygments.lexer import Lexer
11-
from pygments.lexers import guess_lexer_for_filename
11+
from pygments.lexers import get_lexer_for_filename
1212
from pygments.util import ClassNotFound
1313
from tree_sitter import Node, Point
1414
from tree_sitter_language_pack import get_parser
@@ -240,7 +240,7 @@ def __chunk_node(
240240
@cache
def __guess_type(self, path: str, content: str) -> Optional[Lexer]:
    """Look up the pygments lexer registered for *path*.

    Returns None when pygments knows no lexer for the file name, letting
    the caller fall back to other detection strategies.
    """
    # NOTE(review): @cache on an instance method keeps `self` alive for the
    # cache's lifetime (ruff B019) — confirm this is intended.
    lexer: Optional[Lexer] = None
    try:
        lexer = get_lexer_for_filename(path, content)
    except ClassNotFound:
        # Unknown file name/extension: report "no lexer" instead of raising.
        pass
    return lexer
@@ -279,6 +279,40 @@ def __load_file_lines(self, path: str) -> list[str]:
279279
lines = fin.readlines()
280280
return lines
281281

282+
283+
def __get_parser_from_config(self, file_path: str):
    """
    Get parser based on filetype_map config.

    Matches the extension of *file_path* (without the leading dot) against
    the regex patterns configured per language in
    ``self.config.filetype_map`` and loads the first matching parser.

    Returns:
        The tree-sitter parser for the first matching language, or None
        when the map is empty or no pattern matches.

    Raises:
        re.error: when a configured pattern is not a valid regex.
        LookupError: when a matching language has no tree-sitter parser.
    """
    filetype_map = self.config.filetype_map
    if not filetype_map:
        logger.debug("filetype_map is empty in config.")
        return None

    filename = os.path.basename(file_path)
    # splitext keeps the dot (".kid"); drop it so patterns match bare names.
    extension = os.path.splitext(file_path)[1].removeprefix(".")
    logger.debug(
        "Checking filetype map for extension '%s' in %s", extension, filename
    )
    for _language, patterns in filetype_map.items():
        language = _language.lower()
        for pattern in patterns:
            try:
                if re.search(pattern, extension):
                    logger.debug(
                        "'%s' extension matches pattern '%s' for language"
                        " '%s'. Attempting to load parser.",
                        filename,
                        pattern,
                        language,
                    )
                    parser = get_parser(language)
                    logger.debug(
                        "Found parser for language '%s' from config.", language
                    )
                    return parser
            except re.error as e:
                e.add_note(
                    f"\nInvalid regex pattern '{pattern}' for language '{language}' in filetype_map"
                )
                raise
            except LookupError as e:
                e.add_note(
                    f"\nTreeSitter Parser for language '{language}' not found. Please check your filetype_map config."
                )
                raise

    logger.debug("No matching filetype map entry found for %s.", filename)
    return None
315+
282316
def chunk(self, data: str) -> Generator[Chunk, None, None]:
283317
"""
284318
data: path to the file
@@ -294,21 +328,23 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]:
294328
return
295329
parser = None
296330
language = None
297-
lexer = self.__guess_type(data, content)
298-
if lexer is not None:
299-
lang_names = [lexer.name]
300-
lang_names.extend(lexer.aliases)
301-
for name in lang_names:
302-
try:
303-
parser = get_parser(name.lower())
304-
if parser is not None:
305-
language = name.lower()
306-
logger.debug(
307-
"Detected %s filetype for treesitter chunking.", language
308-
)
309-
break
310-
except LookupError: # pragma: nocover
311-
pass
331+
parser = self.__get_parser_from_config(data)
332+
if parser is None:
333+
lexer = self.__guess_type(data, content)
334+
if lexer is not None:
335+
lang_names = [lexer.name]
336+
lang_names.extend(lexer.aliases)
337+
for name in lang_names:
338+
try:
339+
parser = get_parser(name.lower())
340+
if parser is not None:
341+
language = name.lower()
342+
logger.debug(
343+
"Detected %s filetype for treesitter chunking.", language
344+
)
345+
break
346+
except LookupError: # pragma: nocover
347+
pass
312348

313349
if parser is None:
314350
logger.debug(

src/vectorcode/cli_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class Config:
9494
)
9595
hnsw: dict[str, str | int] = field(default_factory=dict)
9696
chunk_filters: dict[str, list[str]] = field(default_factory=dict)
97+
filetype_map: dict[str, list[str]] = field(default_factory=dict)
9798
encoding: str = "utf8"
9899
hooks: bool = False
99100

@@ -156,6 +157,9 @@ async def import_from(cls, config_dict: dict[str, Any]) -> "Config":
156157
"chunk_filters": config_dict.get(
157158
"chunk_filters", default_config.chunk_filters
158159
),
160+
"filetype_map": config_dict.get(
161+
"filetype_map", default_config.filetype_map
162+
),
159163
"encoding": config_dict.get("encoding", default_config.encoding),
160164
}
161165
)

tests/test_chunking.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,121 @@ def bar():
223223
assert chunks == ['def 测试():\n return "foo"', 'def bar():\n return "bar"']
224224
os.remove(test_file)
225225

226+
def test_treesitter_chunker_javascript():
    """Test TreeSitterChunker with a sample javascript file using tempfile."""
    chunker = TreeSitterChunker(Config(chunk_size=60))

    # NOTE(review): indentation inside this fixture must match the expected
    # chunks below byte-for-byte — the chunker preserves source text.
    sample = r"""
function foo() {
    return "foo";
}

function bar() {
    return "bar";
}
"""

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp:
        tmp.write(sample)
        path = tmp.name

    produced = [str(chunk) for chunk in chunker.chunk(path)]
    assert produced == [
        'function foo() {\n    return "foo";\n}',
        'function bar() {\n    return "bar";\n}',
    ]
    os.remove(path)
247+
248+
def test_treesitter_chunker_javascript_genshi():
    """Test TreeSitterChunker with a sample javascript + genshi file using tempfile. (bypassing lexers via the filetype_map config param)"""
    chunker = TreeSitterChunker(
        Config(chunk_size=60, filetype_map={"javascript": ["^kid$"]})
    )

    # A .kid file that is really javascript: filetype_map must route it to
    # the javascript parser instead of the lexer-based guess.
    sample = r"""
function foo() {
    return `foo with ${genshi}`;
}

function bar() {
    return "bar";
}
"""

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp:
        tmp.write(sample)
        path = tmp.name

    produced = [str(chunk) for chunk in chunker.chunk(path)]
    assert produced == [
        'function foo() {\n    return `foo with ${genshi}`;\n}',
        'function bar() {\n    return "bar";\n}',
    ]
    os.remove(path)
269+
270+
def test_treesitter_chunker_parser_from_config_no_parser_found_error():
    """Test TreeSitterChunker filetype_map: should raise an error if no parser is found"""
    chunker = TreeSitterChunker(
        Config(chunk_size=60, filetype_map={"unknown_parser": ["^kid$"]})
    )

    test_content = r"""
function foo() {
    return `foo with ${genshi}`;
}

function bar() {
    return "bar";
}
"""

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp_file:
        tmp_file.write(test_content)
        test_file = tmp_file.name

    try:
        # The LookupError must propagate from the filetype_map parser lookup.
        # (The previous assert after the raising call was unreachable dead
        # code and has been removed.)
        with pytest.raises(LookupError):
            list(chunker.chunk(test_file))
    finally:
        # Remove the temp file even when the expectation above fails.
        os.remove(test_file)
293+
294+
def test_treesitter_chunker_parser_from_config_regex_error():
    """Test TreeSitterChunker filetype_map: should raise an error if a regex is invalid"""
    # "\" alone is an invalid regex pattern and must surface as an error.
    chunker = TreeSitterChunker(
        Config(chunk_size=60, filetype_map={"javascript": ["\\"]})
    )

    test_content = r"""
function foo() {
    return `foo with ${genshi}`;
}

function bar() {
    return "bar";
}
"""

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".kid") as tmp_file:
        tmp_file.write(test_content)
        test_file = tmp_file.name

    try:
        # The invalid pattern must raise during chunking. (The previous
        # assert after the raising call was unreachable dead code and has
        # been removed.)
        with pytest.raises(Exception):
            list(chunker.chunk(test_file))
    finally:
        # Remove the temp file even when the expectation above fails.
        os.remove(test_file)
317+
318+
def test_treesitter_chunker_parser_from_config_no_language_match():
    """Test TreeSitterChunker filetype_map: should continue with the lexer parser checks if no language matches a regex"""
    # The ".js" extension does not match "^jsx$", so detection must fall
    # back to the lexer-based path and still chunk correctly.
    chunker = TreeSitterChunker(Config(chunk_size=60, filetype_map={"php": ["^jsx$"]}))

    sample = r"""
function foo() {
    return "foo";
}

function bar() {
    return "bar";
}
"""

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".js") as tmp:
        tmp.write(sample)
        path = tmp.name

    produced = [str(chunk) for chunk in chunker.chunk(path)]
    assert produced == [
        'function foo() {\n    return "foo";\n}',
        'function bar() {\n    return "bar";\n}',
    ]
    os.remove(path)
os.remove(test_file)
339+
340+
226341

227342
def test_treesitter_chunker_filter():
228343
chunker = TreeSitterChunker(

0 commit comments

Comments
 (0)