|
2 | 2 |
|
3 | 3 | import dataclasses |
4 | 4 | import functools |
5 | | -from typing import Annotated, Any, Literal |
| 5 | +from typing import Any, Literal |
6 | 6 |
|
7 | 7 | import numpy as np |
8 | 8 | from numpy.typing import NDArray |
9 | 9 |
|
10 | 10 | from . import llm, op |
11 | | -from .typing import TypeAttr, Vector |
| 11 | +from .typing import Vector |
12 | 12 |
|
13 | 13 |
|
14 | 14 | class ParseJson(op.FunctionSpec): |
@@ -40,6 +40,24 @@ class SplitRecursively(op.FunctionSpec): |
40 | 40 | custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list) |
41 | 41 |
|
42 | 42 |
|
| 43 | +class SplitBySeparators(op.FunctionSpec): |
| 44 | + """ |
| 45 | + Split text on the given regex separators only (no chunk-size planning). |
| 46 | + Output schema matches SplitRecursively for drop-in compatibility: |
| 47 | + KTable rows with fields: location (Range), text (Str), start, end. |
| 48 | + Args (behavior lives outside this spec; "presumably" notes need confirming): |
| 49 | + separators_regex: regex patterns to split on, e.g. [r"\\n\\n+"]; defaults to []. |
| 50 | + keep_separator: "none" (default), "left" or "right"; presumably which adjacent chunk keeps the separator. |
| 51 | + include_empty: defaults to False; presumably whether empty chunks are emitted. |
| 52 | + trim: defaults to True; presumably whether surrounding whitespace is stripped from chunks. |
| 53 | + """ |
| 54 | + |
| 55 | + separators_regex: list[str] = dataclasses.field(default_factory=list) |
| 56 | + keep_separator: Literal["none", "left", "right"] = "none" |
| 57 | + include_empty: bool = False |
| 58 | + trim: bool = True |
| 59 | + |
| 60 | + |
43 | 61 | class EmbedText(op.FunctionSpec): |
44 | 62 | """Embed a text into a vector space.""" |
45 | 63 |
|
|
0 commit comments