Added an optional progress bar.

umarbutler · umarbutler · commit 525890beeac4 · 2024-06-02T21:11:59.000+10:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.0] - 2024-06-02
+### Added
+- Added a `progress` argument to the chunker returned by `chunkerify()` that, when set to `True` and multiple texts are passed, displays a progress bar.
+
 ## [0.3.2] - 2024-06-01
 ### Fixed
 - Fixed a bug where a `DivisionByZeroError` would be raised where a token counter returned zero tokens when called from `merge_splits()`, courtesy of [@jcobol](https://github.com/jcobol) ([#5](https://github.com/umarbutler/semchunk/pull/5)) ([7fd64eb](https://github.com/umarbutler/semchunk/pull/5/commits/7fd64eb8cf51f45702c59f43795be9a00c7d0d17)), fixing [#4](https://github.com/umarbutler/semchunk/issues/4).
@@ -56,6 +60,7 @@ All notable changes to `semchunk` will be documented here. This project adheres
 ### Added
 - Added the `chunk()` function, which splits text into semantically meaningful chunks of a specified size as determined by a provided token counter.
 
+[1.0.0]: https://github.com/umarbutler/semchunk/compare/v0.3.2...v1.0.0
 [0.3.2]: https://github.com/umarbutler/semchunk/compare/v0.3.1...v0.3.2
 [0.3.1]: https://github.com/umarbutler/semchunk/compare/v0.3.0...v0.3.1
 [0.3.0]: https://github.com/umarbutler/semchunk/compare/v0.2.4...v0.3.0
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # semchunk
-<a href="https://pypi.org/project/semchunk/" alt="PyPI Version"><img src="https://img.shields.io/pypi/v/semchunk"></a> <a href="https://github.com/umarbutler/semchunk/actions/workflows/ci.yml" alt="Build Status"><img src="https://img.shields.io/github/actions/workflow/status/umarbutler/semchunk/ci.yml?branch=main"></a> <a href="https://app.codecov.io/gh/umarbutler/semchunk" alt="Code Coverage"><img src="https://img.shields.io/codecov/c/github/umarbutler/semchunk"></a> <!-- <a href="https://pypistats.org/packages/semchunk" alt="Downloads"><img src="https://img.shields.io/pypi/dm/semchunk"></a> -->
+<a href="https://pypi.org/project/semchunk/" alt="PyPI Version"><img src="https://img.shields.io/pypi/v/semchunk"></a> <a href="https://github.com/umarbutler/semchunk/actions/workflows/ci.yml" alt="Build Status"><img src="https://img.shields.io/github/actions/workflow/status/umarbutler/semchunk/ci.yml?branch=main"></a> <a href="https://app.codecov.io/gh/umarbutler/semchunk" alt="Code Coverage"><img src="https://img.shields.io/codecov/c/github/umarbutler/semchunk"></a> <a href="https://pypistats.org/packages/semchunk" alt="Downloads"><img src="https://img.shields.io/pypi/dm/semchunk"></a>
 
 `semchunk` is a fast and lightweight pure Python library for splitting text into semantically meaningful chunks.
 
@@ -35,7 +35,7 @@ chunker = semchunk.chunkerify('umarbutler/emubert', chunk_size) or \
 # The resulting `chunker` can take and chunk a single text or a list of texts, returning a list of
 # chunks or a list of lists of chunks, respectively.
 assert chunker(text) == ['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.']
-assert chunker([text]) == [['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.']]
+assert chunker([text], progress = True) == [['The quick', 'brown', 'fox', 'jumps', 'over the', 'lazy', 'dog.']]
 ```
 
 ### Chunkerify
@@ -59,7 +59,7 @@ def chunkerify(
 
 `memoize` flags whether to memoize the token counter. It defaults to `True`.
 
-This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
+This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts. The callable can also be passed a `progress` argument which, if set to `True` when multiple texts are passed, will display a progress bar.
 
 ### Chunk
 ```python
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "0.3.2"
+version = "1.0.0"
 authors = [
   {name="Umar Butler", email="umar@umar.au"},
 ]
@@ -45,6 +45,7 @@ classifiers = [
     "Typing :: Typed"
 ]
 dependencies = [
+    "tqdm",
 ]
 
 [project.urls]
diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py
@@ -9,6 +9,8 @@
 from itertools import accumulate
 from contextlib import suppress
 
+from tqdm import tqdm
+
 if TYPE_CHECKING:
     import tiktoken
     import tokenizers
@@ -242,18 +244,23 @@ def faster_token_counter(text: str) -> int:
         token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter))
     
     # Construct and return the chunker.
-    def chunker(text_or_texts: str | Sequence[str]) -> list[str] | list[list[str]]:
+    def chunker(text_or_texts: str | Sequence[str], progress: bool = False) -> list[str] | list[list[str]]:
         """Split text or texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
         
         Args:
             text_or_texts (str | Sequence[str]): The text or texts to be chunked.
+            progress (bool, optional): Whether to display a progress bar when chunking multiple texts. Defaults to `False`.
         
         Returns:
             list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts."""
                 
         if isinstance(text_or_texts, str):
             return chunk(text_or_texts, chunk_size, token_counter, memoize = False)
         
-        return [chunk(text, chunk_size, token_counter, memoize = False) for text in text_or_texts]
+        if progress:
+            return [chunk(text, chunk_size, token_counter, memoize = False) for text in tqdm(text_or_texts)]
+        
+        else:
+            return [chunk(text, chunk_size, token_counter, memoize = False) for text in text_or_texts]
     
     return chunker
diff --git a/tests/test_semchunk.py b/tests/test_semchunk.py
@@ -82,4 +82,7 @@ def tiktoken_token_counter(text: str) -> int:
     except ValueError:
         worked = True
     
-    assert worked
+    assert worked
+    
+    # Try enabling a progress bar.
+    chunker(['ThisIs\tATest.', 'ThisIs\tATest.'], progress = True)

Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@ build-backend = "hatchling.build"`
`4`	`4`
`5`	`5`	`[project]`
`6`	`6`	`name = "semchunk"`
`7`		`-version = "0.3.2"`
	`7`	`+version = "1.0.0"`
`8`	`8`	`authors = [`
`9`	`9`	`{name="Umar Butler", email="umar@umar.au"},`
`10`	`10`	`]`
`@@ -45,6 +45,7 @@ classifiers = [`
`45`	`45`	`"Typing :: Typed"`
`46`	`46`	`]`
`47`	`47`	`dependencies = [`
	`48`	`+ "tqdm",`
`48`	`49`	`]`
`49`	`50`
`50`	`51`	`[project.urls]`