77from typing import Tuple , List , Any , Callable
88
99from fitframework import fit_logger
10+ from fitframework .api .decorators import fitable
1011from llama_index .core .node_parser import (
1112 SentenceSplitter ,
1213 TokenTextSplitter ,
1718from llama_index .core .schema import Document as LDocument
1819from llama_index .embeddings .openai import OpenAIEmbedding
1920
20- from .callable_registers import register_callable_tool
2121from .node_utils import to_llama_index_document
22+ from .types .semantic_splitter_options import SemanticSplitterOptions
2223
23-
24- def sentence_splitter (text : str , separator : str , chunk_size : int , chunk_overlap : int , ** kwargs ) -> List [str ]:
24+ @ fitable ( "llama.tools.sentence_splitter" , "default" )
25+ def sentence_splitter (text : str , separator : str , chunk_size : int , chunk_overlap : int ) -> List [str ]:
2526 """Parse text with a preference for complete sentences."""
2627 if len (text ) == 0 :
2728 return []
@@ -38,7 +39,8 @@ def sentence_splitter(text: str, separator: str, chunk_size: int, chunk_overlap:
3839 return []
3940
4041
41- def token_text_splitter (text : str , separator : str , chunk_size : int , chunk_overlap : int , ** kwargs ) -> List [str ]:
42+ @fitable ("llama.tools.token_text_splitter" , "default" )
43+ def token_text_splitter (text : str , separator : str , chunk_size : int , chunk_overlap : int ) -> List [str ]:
4244 """Splitting text that looks at word tokens."""
4345 if len (text ) == 0 :
4446 return []
@@ -55,14 +57,15 @@ def token_text_splitter(text: str, separator: str, chunk_size: int, chunk_overla
5557 return []
5658
5759
58- def semantic_splitter (buffer_size : int , breakpoint_percentile_threshold : int , docs : List [LDocument ], ** kwargs ) \
60+ # @fitable("llama.tools.semantic_splitter", "default")
61+ def semantic_splitter (buffer_size : int , breakpoint_percentile_threshold : int , docs : List [LDocument ], options : SemanticSplitterOptions ) \
5962 -> List [BaseNode ]:
6063 """Splitting text that looks at word tokens."""
6164 if len (docs ) == 0 :
6265 return []
63- api_key = kwargs . get ( " api_key" )
64- model_name = kwargs . get ( " model_name" )
65- api_base = kwargs . get ( " api_base" )
66+ api_key = options . api_key
67+ model_name = options . model_name
68+ api_base = options . api_base
6669
6770 embed_model = OpenAIEmbedding (model_name = model_name , api_base = api_base , api_key = api_key , max_tokens = 4096 )
6871
@@ -80,8 +83,9 @@ def semantic_splitter(buffer_size: int, breakpoint_percentile_threshold: int, do
8083 return []
8184
8285
86+ # @fitable("llama.tools.sentence_window_node_parser", "default")
8387def sentence_window_node_parser (window_size : int , window_metadata_key : str , original_text_metadata_key : str ,
84- docs : List [LDocument ], ** kwargs ) -> List [BaseNode ]:
88+ docs : List [LDocument ]) -> List [BaseNode ]:
8589 """Splitting text that looks at word tokens."""
8690 if len (docs ) == 0 :
8791 return []
@@ -96,26 +100,4 @@ def sentence_window_node_parser(window_size: int, window_metadata_key: str, orig
96100 except BaseException :
97101 fit_logger .error ("Invoke semantic splitter failed." )
98102 traceback .print_exc ()
99- return []
100-
101-
102- # Tuple 结构: (tool_func, config_args, return_description)
103- splitter_basic_toolkit : List [Tuple [Callable [..., Any ], List [str ], str ]] = [
104- (sentence_splitter , ["text" , "separator" , "chunk_size" , "chunk_overlap" ], "Split sentences by sentence." ),
105- (token_text_splitter , ["text" , "separator" , "chunk_size" , "chunk_overlap" ], "Split sentences by token." ),
106- (semantic_splitter ,
107- ["docs" , "buffer_size" , "breakpoint_percentile_threshold" , "chunk_overlap" , "model_name" , "api_key" , "api_base" ],
108- "Split sentences by semantic." ),
109- (sentence_window_node_parser , ["docs" , "window_size" , "window_metadata_key" , "original_text_metadata_key" ],
110- "Splits all documents into individual sentences" )
111- ]
112-
113- for tool in splitter_basic_toolkit :
114- register_callable_tool (tool , sentence_splitter .__module__ , "llama_index.rag.toolkit" )
115-
116- if __name__ == '__main__' :
117- import time
118- from .llama_schema_helper import dump_llama_schema
119-
120- current_timestamp = time .strftime ('%Y%m%d%H%M%S' )
121- dump_llama_schema (splitter_basic_toolkit , f"./llama_tool_schema-{ str (current_timestamp )} .json" )
103+ return []