import copy
import logging
from abc import ABC, abstractmethod
+from collections.abc import Collection, Iterable, Sequence, Set
from dataclasses import dataclass
from enum import Enum
from typing import (
-    AbstractSet,
    Any,
    Callable,
-    Collection,
-    Iterable,
-    List,
    Literal,
    Optional,
-    Sequence,
-    Type,
    TypeVar,
    Union,
)
@@ -64,12 +59,12 @@ def __init__(
        self._strip_whitespace = strip_whitespace

    @abstractmethod
-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
        """Split text into multiple components."""

    def create_documents(
        self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None
-    ) -> List[Document]:
+    ) -> list[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
@@ -87,15 +82,15 @@ def create_documents(
            documents.append(new_doc)
        return documents

-    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
+    def split_documents(self, documents: Iterable[Document]) -> list[Document]:
        """Split documents."""
        texts, metadatas = [], []
        for doc in documents:
            texts.append(doc.page_content)
            metadatas.append(doc.metadata)
        return self.create_documents(texts, metadatas=metadatas)

-    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
+    def _join_docs(self, docs: list[str], separator: str) -> Optional[str]:
        text = separator.join(docs)
        if self._strip_whitespace:
            text = text.strip()
@@ -104,13 +99,13 @@ def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        else:
            return text

-    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
+    def _merge_splits(self, splits: Iterable[str], separator: str) -> list[str]:
        # We now want to combine these smaller pieces into medium size
        # chunks to send to the LLM.
        separator_len = self._length_function(separator)

        docs = []
-        current_doc: List[str] = []
+        current_doc: list[str] = []
        total = 0
        for d in splits:
            _len = self._length_function(d)
@@ -169,10 +164,10 @@ def _huggingface_tokenizer_length(text: str) -> int:

    @classmethod
    def from_tiktoken_encoder(
-        cls: Type[TS],
+        cls: type[TS],
        encoding_name: str = "gpt2",
        model_name: Optional[str] = None,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        allowed_special: Union[Literal["all"], Set[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
        **kwargs: Any,
    ) -> TS:
@@ -225,7 +220,7 @@ def __init__(
        self,
        encoding_name: str = "gpt2",
        model_name: Optional[str] = None,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        allowed_special: Union[Literal["all"], Set[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
        **kwargs: Any,
    ) -> None:
@@ -248,7 +243,7 @@ def __init__(
        self._allowed_special = allowed_special
        self._disallowed_special = disallowed_special

-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
        """Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
@@ -264,7 +259,7 @@ def split_text(self, text: str) -> List[str]:
            of the input text based on the tokenization and chunking rules.
        """

-        def _encode(_text: str) -> List[int]:
+        def _encode(_text: str) -> list[int]:
            return self._tokenizer.encode(
                _text,
                allowed_special=self._allowed_special,
@@ -320,15 +315,15 @@ class Tokenizer:
    """Overlap in tokens between chunks"""
    tokens_per_chunk: int
    """Maximum number of tokens per chunk"""
-    decode: Callable[[List[int]], str]
+    decode: Callable[[list[int]], str]
    """ Function to decode a list of token ids to a string"""
-    encode: Callable[[str], List[int]]
+    encode: Callable[[str], list[int]]
    """ Function to encode a string to a list of token ids"""


-def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
+def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]:
    """Split incoming text and return chunks using tokenizer."""
-    splits: List[str] = []
+    splits: list[str] = []
    input_ids = tokenizer.encode(text)
    start_idx = 0
    cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
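
Below is a minimal usage sketch, not part of the diff, showing how the modernized signatures might be exercised. It assumes the Tokenizer dataclass and split_text_on_tokens function above are importable from langchain_text_splitters.base, that tiktoken is installed, and uses illustrative chunk sizes.

# Hypothetical usage sketch; import path and settings are assumptions, not from the diff.
import tiktoken

from langchain_text_splitters.base import Tokenizer, split_text_on_tokens

enc = tiktoken.get_encoding("gpt2")

tokenizer = Tokenizer(
    chunk_overlap=10,      # tokens shared between consecutive chunks
    tokens_per_chunk=100,  # maximum tokens per chunk
    decode=enc.decode,     # Callable[[list[int]], str]
    encode=lambda t: enc.encode(t, allowed_special="all"),  # Callable[[str], list[int]]
)

# Builtin-generic annotations (list[str]) now match the updated signatures.
chunks: list[str] = split_text_on_tokens(text="some long document ...", tokenizer=tokenizer)
print(len(chunks))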