Skip to content

Commit 1333d0d

Browse files
authored
feat(chunking): allow customizing separators for the splitter (#584)
1 parent 3f808f8 commit 1333d0d

File tree

3 files changed

+305
-204
lines changed

3 files changed

+305
-204
lines changed

python/cocoindex/functions.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""All builtin functions."""
22

33
from typing import Annotated, Any, TYPE_CHECKING
4+
import dataclasses
45

56
from .typing import Float32, Vector, TypeAttr
67
from . import op, llm
@@ -14,9 +15,20 @@ class ParseJson(op.FunctionSpec):
1415
"""Parse a text into a JSON object."""
1516

1617

18+
@dataclasses.dataclass
class CustomLanguageSpec:
    """Custom language specification.

    Bundles a language name with the separator regex patterns declared for
    it, plus optional alternative names.
    """

    # Name of the custom language.
    language_name: str
    # Separator patterns, given as regex strings.
    # NOTE(review): ordering/priority semantics are not visible here — confirm
    # against the splitter implementation.
    separators_regex: list[str]
    # Alternative names for the language. default_factory gives every
    # instance its own fresh list instead of a shared mutable default.
    aliases: list[str] = dataclasses.field(default_factory=list)
25+
26+
1727
class SplitRecursively(op.FunctionSpec):
    """Split a document (in string) recursively."""

    # Custom language specifications supplied by the caller; empty by default.
    # NOTE(review): default_factory only yields a per-instance list if
    # subclasses of op.FunctionSpec are processed as dataclasses — confirm in
    # the op module.
    custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
31+
2032

2133
class ExtractByLlm(op.FunctionSpec):
2234
"""Extract information from a text using a LLM."""

0 commit comments

Comments
 (0)