Skip to content

Commit ad048d1

Browse files
committed
feat(ops): add SplitBySeparators (Rust+Py) + registration
1 parent a3a0c3a commit ad048d1

File tree

4 files changed

+454
-2
lines changed

4 files changed

+454
-2
lines changed

python/cocoindex/functions.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22

33
import dataclasses
44
import functools
5-
from typing import Annotated, Any, Literal
5+
from typing import Any, Literal
66

77
import numpy as np
88
from numpy.typing import NDArray
99

1010
from . import llm, op
11-
from .typing import TypeAttr, Vector
11+
from .typing import Vector
1212

1313

1414
class ParseJson(op.FunctionSpec):
@@ -40,6 +40,24 @@ class SplitRecursively(op.FunctionSpec):
4040
custom_languages: list[CustomLanguageSpec] = dataclasses.field(default_factory=list)
4141

4242

43+
class SplitBySeparators(op.FunctionSpec):
    """Split text on the given regex separators only (no chunk-size planning).

    The output schema matches ``SplitRecursively`` so the two specs are
    drop-in compatible: a KTable whose rows carry the fields ``location``
    (Range), ``text`` (Str), ``start`` and ``end``.

    Attributes:
        separators_regex: Regex patterns to split on, e.g. ``[r"\\n\\n+"]``.
            Defaults to an empty list.
        keep_separator: One of ``"none"``, ``"left"`` or ``"right"``;
            defaults to ``"none"``.
        include_empty: Defaults to ``False``.
        trim: Defaults to ``True``.
    """

    # A default_factory is used so each spec instance gets its own list
    # rather than sharing one mutable default.
    separators_regex: list[str] = dataclasses.field(default_factory=list)
    # NOTE(review): presumably selects which neighbouring chunk (if any) the
    # matched separator stays attached to -- confirm against the executor.
    keep_separator: Literal["none", "left", "right"] = "none"
    # NOTE(review): presumably controls whether zero-length chunks are
    # emitted -- confirm against the executor.
    include_empty: bool = False
    # NOTE(review): presumably strips surrounding whitespace from each chunk
    # -- confirm against the executor.
    trim: bool = True
59+
60+
4361
class EmbedText(op.FunctionSpec):
4462
"""Embed a text into a vector space."""
4563

src/ops/functions/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
pub mod embed_text;
22
pub mod extract_by_llm;
33
pub mod parse_json;
4+
pub mod split_by_separators;
45
pub mod split_recursively;
56

67
#[cfg(test)]

0 commit comments

Comments
 (0)