@@ -59,6 +59,22 @@ def num_tokens_from_string(self, string: str) -> int:
5959
6060 return len (encoding .encode (string ))
6161
def clean_chunks_and_map(self, chunks, is_table_or_figure_map):
    """Strip surrounding whitespace from every chunk and drop empty ones,
    keeping the table/figure flag list aligned with the surviving chunks.

    Chunks recognised as markdown headings are re-padded with a blank line
    on each side so they stay visually separated after stripping.

    Args:
        chunks: Sequence of chunk strings.
        is_table_or_figure_map: Parallel sequence of booleans, one per chunk.

    Returns:
        Tuple ``(cleaned_chunks, cleaned_is_table_or_figure_map)`` with
        whitespace-only chunks (and their flags) removed.
    """
    kept_chunks = []
    kept_flags = []

    for raw_chunk, table_or_figure_flag in zip(chunks, is_table_or_figure_map):
        stripped = raw_chunk.strip()
        if not stripped:
            # A whitespace-only chunk is discarded together with its flag,
            # keeping both output lists the same length.
            continue

        # Heading detection deliberately uses the raw, unstripped chunk;
        # the padding is applied to the stripped text.
        if self.is_markdown_heading(raw_chunk):
            stripped = "\n\n" + stripped + "\n\n"

        kept_chunks.append(stripped)
        kept_flags.append(table_or_figure_flag)

    return kept_chunks, kept_flags
77+
6278 async def chunk (self , text : str ) -> list [dict ]:
6379 """Attempts to chunk the text by:
6480 Splitting into sentences
@@ -86,6 +102,10 @@ async def chunk(self, text: str) -> list[dict]:
86102 grouped_sentences , is_table_or_figure_map
87103 )
88104
105+ forward_pass_chunks , new_is_table_or_figure_map = self .clean_chunks_and_map (
106+ forward_pass_chunks , new_is_table_or_figure_map
107+ )
108+
89109 logging .info (
90110 f"""Number of Forward pass chunks: {
91111 len (forward_pass_chunks )} """
@@ -129,7 +149,7 @@ def filter_empty_figures(self, text):
129149
130150 def clean_new_lines (self , text ):
131151 # Remove single newlines surrounded by < and >
132- cleaned_text = re .sub (r"(?<=>)(\n)(?=<)" , "" , text )
152+ cleaned_text = re .sub (r"(?<=>)(\n)(?=<)" , "" , text . strip () )
133153
134154 # Replace all other single newlines with space
135155 cleaned_text = re .sub (r"(?<!\n)\n(?!\n)" , " " , cleaned_text )
@@ -190,7 +210,7 @@ def split_into_sentences(self, text: str) -> list[str]:
190210 self .is_markdown_heading (part )
191211 and part .endswith ("\n \n " ) is False
192212 ):
193- part = part + "\n \n "
213+ part = " \n \n " + part + "\n \n "
194214
195215 heading_split_sentences .append (part )
196216
@@ -300,23 +320,36 @@ def retrive_current_chunk_at_n(n):
300320 else :
301321 return current_chunk [n ]
302322
303- current_chunk_tokens = self .num_tokens_from_string (" " .join (current_chunk ))
323+ def get_current_chunk_tokens (chunk_segments ):
324+ return self .num_tokens_from_string (" " .join (chunk_segments ))
325+
326+ current_chunk_tokens = get_current_chunk_tokens (current_chunk )
304327
305328 if len (current_chunk ) >= 2 and current_chunk_tokens >= self .min_chunk_tokens :
306- logging .info ("Comparing chunks" )
307- cosine_sim = self .sentence_similarity (
308- retrieve_current_chunks_from_n (- 2 ), current_sentence
309- )
329+ # Calculate the tokens if we were to split
330+ if len (current_chunk ) > 2 :
331+ would_be_new_chunk = retrieve_current_chunk_up_to_n (1 )
332+ would_be_current_chunk = [retrive_current_chunk_at_n (- 1 )]
333+ else :
334+ would_be_new_chunk = retrive_current_chunk_at_n (0 )
335+ would_be_current_chunk = [retrive_current_chunk_at_n (1 )]
336+
310337 if (
311- cosine_sim < self .similarity_threshold
312- or current_chunk_tokens >= self .max_chunk_tokens
338+ get_current_chunk_tokens (would_be_new_chunk ) >= self .min_chunk_tokens
339+ and get_current_chunk_tokens (would_be_current_chunk )
340+ >= self .min_chunk_tokens
313341 ):
314- if len (current_chunk ) > 2 :
315- new_chunk = retrieve_current_chunk_up_to_n (1 )
316- current_chunk = [retrive_current_chunk_at_n (- 1 )]
317- else :
318- new_chunk = retrive_current_chunk_at_n (0 )
319- current_chunk = [retrive_current_chunk_at_n (1 )]
342+ logging .info ("Comparing chunks" )
343+ if (
344+ current_chunk_tokens >= self .max_chunk_tokens
345+ or self .sentence_similarity (
346+ retrieve_current_chunks_from_n (- 2 ), current_sentence
347+ )
348+ < self .similarity_threshold
349+ ):
350+ return would_be_new_chunk , would_be_current_chunk
351+ else :
352+ logging .info ("Chunk too small to compare" )
320353 else :
321354 logging .info ("Chunk too small to compare" )
322355
0 commit comments