
Commit ee802ed

Added support for multiprocessing.

1 parent 72b03b9 commit ee802ed

5 files changed (+48 -10 lines)

CHANGELOG.md

Lines changed: 7 additions & 0 deletions

@@ -1,6 +1,13 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [2.0.0] - 2024-06-19
+### Added
+- Added support for multiprocessing through the `processes` argument passable to chunkers constructed by `chunkerify()`.
+
+### Removed
+- No longer guaranteed that `semchunk` is pure Python.
+
 ## [1.0.1] - 2024-06-02
 ### Fixed
 - Documented the `progress` argument in the docstring for `chunkerify()` and its type hint in the README.

README.md

Lines changed: 10 additions & 2 deletions

@@ -36,6 +36,10 @@ chunker = semchunk.chunkerify('umarbutler/emubert', chunk_size) or \
 # chunks or a list of lists of chunks, respectively.
 assert chunker(text) == ['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.']
 assert chunker([text], progress = True) == [['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.']]
+
+# If you have a large number of texts to chunk and speed is a concern, you can also enable
+# multiprocessing by setting `processes` to a number greater than 1.
+assert chunker([text], processes = 2) == [['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.']]
 ```
 
 ### Chunkerify
@@ -46,7 +50,7 @@ def chunkerify(
     chunk_size: int = None,
     max_token_chars: int = None,
     memoize: bool = True,
-) -> Callable[[str | Sequence[str], bool], list[str] | list[list[str]]]:
+) -> Callable[[str | Sequence[str], int, bool], list[str] | list[list[str]]]:
 ```
 
 `chunkerify()` constructs a chunker that splits one or more texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
@@ -59,7 +63,11 @@ def chunkerify(
 
 `memoize` flags whether to memoize the token counter. It defaults to `True`.
 
-This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts. The callable can also be passed a `progress` argument which if set to `True` and multiple texts are passed, will display a progress bar.
+This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
+
+The resulting chunker function can also be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
+
+It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar.
 
 ### Chunk
 ```python
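
As an aside on the README additions above, here is a minimal, self-contained sketch of the new `processes` argument in use. The whitespace token counter and the sample texts are illustrative assumptions rather than part of this commit; any tokenizer or token counter accepted by `chunkerify()` behaves the same way.

```python
import semchunk

# A toy whitespace token counter (an assumption for illustration);
# any callable mapping a string to a token count is accepted.
def token_counter(text: str) -> int:
    return len(text.split())

# Construct a chunker that caps chunks at 4 tokens.
chunker = semchunk.chunkerify(token_counter, chunk_size = 4)

texts = ['The quick brown fox jumps over the lazy dog.'] * 1_000

if __name__ == '__main__':
    # The default (`processes = 1`) chunks serially in the main process ...
    serial = chunker(texts)

    # ... while `processes > 1` fans the texts out over worker processes,
    # with `progress = True` still rendering a progress bar.
    parallel = chunker(texts, processes = 4, progress = True)

    # Both paths run the same underlying chunking function, so the
    # outputs are identical; `processes` only changes where the work runs.
    assert serial == parallel
```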

pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "1.0.1"
+version = "2.0.0"
 authors = [
     {name="Umar Butler", email="[email protected]"},
 ]
@@ -45,6 +45,7 @@ classifiers = [
 ]
 dependencies = [
     "tqdm",
+    "mpire",
 ]
 
 [project.urls]

src/semchunk/semchunk.py

Lines changed: 25 additions & 7 deletions

@@ -9,6 +9,8 @@
 from itertools import accumulate
 from contextlib import suppress
 
+import mpire
+
 from tqdm import tqdm
 
 if TYPE_CHECKING:
@@ -168,7 +170,11 @@ def chunkerify(
         memoize (bool, optional): Whether to memoize the token counter. Defaults to `True`.
 
     Returns:
-        Callable[[str | Sequence[str], bool], list[str] | list[list[str]]]: A function that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts. The function can also be passed a `progress` argument which if set to `True` and multiple texts are passed, will display a progress bar."""
+        Callable[[str | Sequence[str], int, bool], list[str] | list[list[str]]]: A function that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
+
+        The resulting chunker function can also be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
+
+        It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar."""
 
     # If the provided tokenizer is a string, try to load it with either `tiktoken` or `transformers` or raise an error if neither is available.
     if isinstance(tokenizer_or_token_counter, str):
@@ -251,24 +257,36 @@ def faster_token_counter(text: str) -> int:
     if memoize:
         token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter))
 
+    # Construct a chunking function that passes the chunk size and token counter to `chunk()`.
+    def chunking_function(text: str) -> list[str]:
+        return chunk(text, chunk_size, token_counter, memoize = False)
+
     # Construct and return the chunker.
-    def chunker(text_or_texts: str | Sequence[str], progress: bool = False) -> list[str] | list[list[str]]:
+    def chunker(
+        text_or_texts: str | Sequence[str],
+        processes: int = 1,
+        progress: bool = False,
+    ) -> list[str] | list[list[str]]:
         """Split text or texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
 
         Args:
             text_or_texts (str | Sequence[str]): The text or texts to be chunked.
+            processes (int, optional): The number of processes to use when chunking multiple texts. Defaults to `1`, in which case chunking will occur in the main process.
             progress (bool, optional): Whether to display a progress bar when chunking multiple texts. Defaults to `False`.
 
         Returns:
             list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts."""
 
         if isinstance(text_or_texts, str):
-            return chunk(text_or_texts, chunk_size, token_counter, memoize = False)
+            return chunking_function(text_or_texts)
 
-        if progress:
-            return [chunk(text, chunk_size, token_counter, memoize = False) for text in tqdm(text_or_texts)]
+        if progress and processes == 1:
+            text_or_texts = tqdm(text_or_texts)
 
-        else:
-            return [chunk(text, chunk_size, token_counter, memoize = False) for text in text_or_texts]
+        if processes == 1:
+            return [chunking_function(text) for text in text_or_texts]
+
+        with mpire.WorkerPool(processes, use_dill = True) as pool:
+            return pool.map(chunking_function, text_or_texts, progress_bar = progress)
 
     return chunker
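
A note on the implementation above: `chunking_function` is a closure over `chunk_size` and `token_counter`, which the standard `pickle` module cannot serialize, and that is why the pool is constructed with `use_dill = True`. The following standalone sketch illustrates the same `mpire` pattern; the `make_worker` helper and sample data are illustrative assumptions, not part of this commit.

```python
import mpire

def make_worker(suffix: str):
    # A closure over `suffix`, analogous to `chunking_function` closing over
    # `chunk_size` and `token_counter` above. Plain `pickle` cannot serialize
    # closures, so the pool must be created with `use_dill = True`.
    def work(item: str) -> str:
        return item + suffix
    return work

if __name__ == '__main__':
    work = make_worker('!')

    # `use_dill = True` lets mpire ship the closure to worker processes;
    # `progress_bar = True` mirrors how the chunker forwards its `progress` flag.
    with mpire.WorkerPool(2, use_dill = True) as pool:
        results = pool.map(work, ['a', 'b', 'c'], progress_bar = True)

    assert results == ['a!', 'b!', 'c!']
```

Keeping `processes = 1` on a separate fast path also avoids pool start-up overhead for small workloads: the chunker only spins up a `WorkerPool` when more than one process is requested.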

tests/test_semchunk.py

Lines changed: 4 additions & 0 deletions

@@ -48,6 +48,10 @@ def tiktoken_token_counter(text: str) -> int:
     chunker = semchunk.chunkerify(tiktoken_token_counter, 4)
     assert chunker(['ThisIs\tATest.', 'ThisIs\tATest.']) == [['ThisIs', 'ATest.'], ['ThisIs', 'ATest.']]
 
+    # Test chunking multiple texts with multiple processes.
+    chunker = semchunk.chunkerify(tiktoken_token_counter, 4)
+    assert chunker(['ThisIs\tATest.', 'ThisIs\tATest.'], processes = 2) == [['ThisIs', 'ATest.'], ['ThisIs', 'ATest.']]
+
     # Test using a `transformers` tokenizer.
     chunker = semchunk.chunkerify(transformers_tokenizer)
     assert chunker('ThisIs\tATest.') == ['ThisIs\tATest.']
