@@ -65,13 +65,13 @@ def _patch_tokenizer_and_max_tokens(self) -> Self:
6565 )
6666 return self
6767
68- def _count_tokens (self , text : Optional [Union [str , list [str ]]]):
68+ def _count_text_tokens (self , text : Optional [Union [str , list [str ]]]):
6969 if text is None :
7070 return 0
7171 elif isinstance (text , list ):
7272 total = 0
7373 for t in text :
74- total += self ._count_tokens (t )
74+ total += self ._count_text_tokens (t )
7575 return total
7676 return len (self ._tokenizer .tokenize (text , max_length = None ))
7777
@@ -80,102 +80,83 @@ class _ChunkLengthInfo(BaseModel):
8080 text_len : int
8181 other_len : int
8282
def _count_chunk_tokens(self, doc_chunk: DocChunk):
    """Return the token count of *doc_chunk* in fully serialized form.

    Serializing first means headings, captions, and delimiters all
    contribute to the count, not just the chunk's raw text.
    """
    serialized = self.serialize(chunk=doc_chunk)
    return len(self._tokenizer.tokenize(text=serialized, max_length=None))
def _doc_chunk_length(self, doc_chunk: DocChunk):
    """Measure *doc_chunk*, splitting its token count into text vs. overhead.

    ``total_len`` is the token count of the fully serialized chunk,
    ``text_len`` covers only ``doc_chunk.text``, and ``other_len`` is the
    difference (headings, captions, delimiters, ...).
    """
    text_tokens = self._count_text_tokens(doc_chunk.text)
    total_tokens = self._count_chunk_tokens(doc_chunk=doc_chunk)
    return self._ChunkLengthInfo(
        total_len=total_tokens,
        text_len=text_tokens,
        other_len=total_tokens - text_tokens,
    )
def _make_chunk_from_doc_items(
    self, doc_chunk: DocChunk, window_start: int, window_end: int
):
    """Build a new chunk from the doc items in ``[window_start, window_end]``.

    Metadata (headings, captions, origin) is carried over from *doc_chunk*.
    The text is rebuilt by joining the selected ``TextItem`` texts with the
    delimiter, except in the single-item case, where the original chunk
    text is reused verbatim.
    """
    doc_items = doc_chunk.meta.doc_items[window_start : window_end + 1]
    meta = DocMeta(
        doc_items=doc_items,
        headings=doc_chunk.meta.headings,
        captions=doc_chunk.meta.captions,
        origin=doc_chunk.meta.origin,
    )
    if len(doc_chunk.meta.doc_items) == 1:
        window_text = doc_chunk.text
    else:
        window_text = self.delim.join(
            item.text for item in doc_items if isinstance(item, TextItem)
        )
    return DocChunk(text=window_text, meta=meta)
106- def _merge_text (self , t1 , t2 ):
107- if t1 == "" :
108- return t2
109- elif t2 == "" :
110- return t1
111- else :
112- return f"{ t1 } { self .delim } { t2 } "
113-
def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
    """Split *doc_chunk* into chunks that fit within ``self.max_tokens``.

    A sliding window ``[window_start, window_end]`` (both inclusive) over
    ``doc_chunk.meta.doc_items`` is grown one item at a time; whenever the
    serialized candidate chunk would exceed the token budget, the items
    accumulated so far are emitted as a chunk and the window restarts.
    A single item that is itself too large is emitted anyway and is
    expected to be cut later by the plain-text splitter.
    """
    # Nothing to split: keep the chunk as-is. Without this guard, a None
    # doc_items raises TypeError at len() below, and an empty list would
    # silently drop the chunk by returning [].
    if not doc_chunk.meta.doc_items:
        return [doc_chunk]
    chunks = []
    window_start = 0
    window_end = 0  # an inclusive index
    num_items = len(doc_chunk.meta.doc_items)
    while window_end < num_items:
        new_chunk = self._make_chunk_from_doc_items(
            doc_chunk=doc_chunk,
            window_start=window_start,
            window_end=window_end,
        )
        if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
            if window_end < num_items - 1:
                # Still room left to add more to this chunk AND still at
                # least one item left.
                window_end += 1
                continue
            # All the items in the window fit into the chunk and there are
            # no other items left.
            window_end = num_items  # signals the last iteration
        elif window_start == window_end:
            # Only one item in the window and it doesn't fit into the
            # chunk. Emit it anyway; the plain-text splitter will cut it.
            window_end += 1
            window_start = window_end
        else:
            # Multiple items in the window but they don't fit into the
            # chunk. The items before the current one must have fit, so
            # emit everything but the last item and start a new window
            # INCLUDING the current window end.
            new_chunk = self._make_chunk_from_doc_items(
                doc_chunk=doc_chunk,
                window_start=window_start,
                window_end=window_end - 1,
            )
            window_start = window_end
        chunks.append(new_chunk)
    return chunks
179160
180161 def _split_using_plain_text (
181162 self ,
@@ -204,53 +185,45 @@ def _split_using_plain_text(
204185 def _merge_chunks_with_matching_metadata (self , chunks : list [DocChunk ]):
205186 output_chunks = []
206187 window_start = 0
207- window_end = 0
188+ window_end = 0 # an inclusive index
208189 num_chunks = len (chunks )
209190 while window_end < num_chunks :
210191 chunk = chunks [window_end ]
211- lengths = self ._doc_chunk_length (chunk )
212192 headings_and_captions = (chunk .meta .headings , chunk .meta .captions )
213193 ready_to_append = False
214194 if window_start == window_end :
215- # starting a new block of chunks to potentially merge
216195 current_headings_and_captions = headings_and_captions
217- window_text = chunk .text
218- window_other_length = lengths .other_len
219- window_text_length = lengths .text_len
220- window_items = chunk .meta .doc_items
221196 window_end += 1
222197 first_chunk_of_window = chunk
223- elif (
224- headings_and_captions == current_headings_and_captions
225- and window_text_length + window_other_length + lengths .text_len
226- <= self .max_tokens
227- ):
228- # there is room to include the new chunk so add it to the window and
229- # continue
230- window_text = self ._merge_text (window_text , chunk .text )
231- window_text_length += lengths .text_len
232- window_items = window_items + chunk .meta .doc_items
233- window_end += 1
234198 else :
235- ready_to_append = True
236-
199+ chks = chunks [window_start : window_end + 1 ]
200+ doc_items = [it for chk in chks for it in chk .meta .doc_items ]
201+ candidate = DocChunk (
202+ text = self .delim .join ([chk .text for chk in chks ]),
203+ meta = DocMeta (
204+ doc_items = doc_items ,
205+ headings = current_headings_and_captions [0 ],
206+ captions = current_headings_and_captions [1 ],
207+ origin = chunk .meta .origin ,
208+ ),
209+ )
210+ if (
211+ headings_and_captions == current_headings_and_captions
212+ and self ._count_chunk_tokens (doc_chunk = candidate ) <= self .max_tokens
213+ ):
214+ # there is room to include the new chunk so add it to the window and
215+ # continue
216+ window_end += 1
217+ new_chunk = candidate
218+ else :
219+ ready_to_append = True
237220 if ready_to_append or window_end == num_chunks :
238221 # no more room OR the start of new metadata. Either way, end the block
239222 # and use the current window_end as the start of a new block
240223 if window_start + 1 == window_end :
241224 # just one chunk so use it as is
242225 output_chunks .append (first_chunk_of_window )
243226 else :
244- new_meta = DocMeta (
245- doc_items = window_items ,
246- headings = current_headings_and_captions [0 ],
247- captions = current_headings_and_captions [1 ],
248- origin = chunk .meta .origin ,
249- )
250- new_chunk = DocChunk (
251- text = window_text ,
252- meta = new_meta ,
253- )
254227 output_chunks .append (new_chunk )
255228 # no need to reset window_text, etc. because that will be reset in the
256229 # next iteration in the if window_start == window_end block
0 commit comments