Minor performance improvements.

umarbutler · umarbutler · commit 55910cb325ca · 2023-11-07T22:25:49.000+11:00
diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py
@@ -8,8 +8,10 @@
 )
 """A tuple of semantically meaningful non-whitespace splitters that may be used to chunk texts, ordered from most desirable to least desirable."""
     
-def _split_text(text: str) -> tuple[str, list[str]]:
+def _split_text(text: str) -> tuple[str, bool, list[str]]:
     """Split text using the most semantically meaningful splitter possible."""
+    
+    splitter_is_whitespace = True
 
     # Try splitting at, in order of most desirable to least desirable:
     # - The largest sequence of newlines and/or carriage returns;
@@ -29,14 +31,15 @@ def _split_text(text: str) -> tuple[str, list[str]]:
         # Identify the most desirable semantically meaningful non-whitespace splitter present in the text.
         for splitter in NON_WHITESPACE_SEMANTIC_SPLITTERS:
             if splitter in text:
+                splitter_is_whitespace = False
                 break
         
         # If no semantically meaningful splitter is present in the text, return an empty string as the splitter and the text as a list of characters.
         else: # NOTE This code block will only be executed if the for loop completes without breaking.
-            return '', list(text)
+            return '', splitter_is_whitespace, list(text)
     
     # Return the splitter and the split text.
-    return splitter, text.split(splitter)
+    return splitter, splitter_is_whitespace, text.split(splitter)
 
 def chunk(text: str, chunk_size: int, token_counter: callable, _recursion_depth: int = 0) -> list[str]:
     """Split text into semantically meaningful chunks of a specified size as determined by the provided token counter.
@@ -50,13 +53,10 @@ def chunk(text: str, chunk_size: int, token_counter: callable, _recursion_depth:
         list[str]: A list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed."""
 
     # Split the text using the most semantically meaningful splitter possible.
-    splitter, splits = _split_text(text)
-    
-    # Flag whether the splitter is whitespace.
-    splitter_is_whitespace = not splitter.split()
+    splitter, splitter_is_whitespace, splits = _split_text(text)
     
     chunks = []
-    skips = []
+    skips = set()
     """A list of indices of splits to skip because they have already been added to a chunk."""
     
     # Iterate through the splits.
@@ -71,26 +71,23 @@ def chunk(text: str, chunk_size: int, token_counter: callable, _recursion_depth:
 
         # If the split is equal to or under the chunk size, merge it with all subsequent splits until the chunk size is reached.
         else:
-            # Initialise a list of splits to be merged into a new chunk.
-            new_chunk = [split]
+            # Initialise the new chunk.
+            new_chunk = split
             
             # Iterate through each subsequent split until the chunk size is reached.
             for j, next_split in enumerate(splits[i+1:], start=i+1):
                 # Check whether the next split can be added to the chunk without exceeding the chunk size.
-                if token_counter(splitter.join(new_chunk+[next_split])) <= chunk_size:
-                    # Add the next split to the chunk.
-                    new_chunk.append(next_split)
+                if token_counter(updated_chunk:=new_chunk+splitter+next_split) <= chunk_size:
+                    # Add the next split to the new chunk.
+                    new_chunk = updated_chunk
                     
                     # Add the index of the next split to the list of indices to skip.
-                    skips.append(j)
+                    skips.add(j)
                 
                 # If the next split cannot be added to the chunk without exceeding the chunk size, break.
                 else:
                     break
             
-            # Join the splits with the splitter.
-            new_chunk = splitter.join(new_chunk)
-            
             # Add the chunk.
             chunks.append(new_chunk)
 
@@ -104,6 +101,6 @@ def chunk(text: str, chunk_size: int, token_counter: callable, _recursion_depth:
     
     # If this is not a recursive call, remove any empty chunks.
     if not _recursion_depth:
-        chunks = [chunk for chunk in chunks if chunk]
+        chunks = list(filter(None, chunks))
     
     return chunks