Skip to content

Commit 16dfb4d

Browse files
committed
Fixing #7.
1 parent 63dd211 commit 16dfb4d

File tree

5 files changed

+59
-42
lines changed

5 files changed

+59
-42
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
## Changelog 🔄
22
All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
33

4+
## [2.2.0] - 2024-07-12
5+
### Changed
6+
- Switched from having `chunkerify()` output a function to having it return an instance of the new `Chunker` class. This should not alter functionality in any way but will allow for the preservation of type hints, fixing [#7](https://github.com/umarbutler/semchunk/pull/7).
7+
48
## [2.1.0] - 2024-06-20
59
### Fixed
610
- Ceased memoizing `chunk()` (but not token counters) due to the fact that cached outputs of memoized functions are shallow rather than deep copies of original outputs, meaning that if one were to chunk a text and then chunk that same text again and then modify one of the chunks outputted by the first call, the chunks outputted by the second call would also be modified. This behaviour is not expected and therefore undesirable. The memoization of token counters is not impacted as they output immutable objects, namely, integers.

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,14 @@ def chunkerify(
6666

6767
`memoize` flags whether to memoize the token counter. It defaults to `True`.
6868

69-
This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
69+
This function returns a chunker that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
7070

71-
The resulting chunker function can also be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
71+
The resulting chunker can be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
7272

7373
It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar.
7474

75+
Technically, the chunker will be an instance of the `semchunk.Chunker` class to assist with type hinting, though this should have no impact on how it can be used.
76+
7577
### Chunk
7678
```python
7779
def chunk(

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "semchunk"
7-
version = "2.1.0"
7+
version = "2.2.0"
88
authors = [
99
{name="Umar Butler", email="[email protected]"},
1010
]

src/semchunk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""A fast and lightweight Python library for splitting text into semantically meaningful chunks."""
22

3-
from .semchunk import chunk, chunkerify
3+
from .semchunk import chunk, Chunker, chunkerify

src/semchunk/semchunk.py

Lines changed: 49 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import tokenizers
1919
import transformers
2020

21+
2122
_memoized_token_counters = {}
2223
"""A map of token counters to their memoized versions."""
2324

@@ -29,6 +30,7 @@
2930
)
3031
"""A tuple of semantically meaningful non-whitespace splitters that may be used to chunk texts, ordered from most desirable to least desirable."""
3132

33+
3234
def _split_text(text: str) -> tuple[str, bool, list[str]]:
3335
"""Split text using the most semantically meaningful splitter possible."""
3436

@@ -151,13 +153,51 @@ def chunk(
151153

152154
return chunks
153155

156+
157+
class Chunker:
158+
def __init__(self, chunk_size: int, token_counter: Callable[[str], int]) -> None:
159+
self.chunk_size = chunk_size
160+
self.token_counter = token_counter
161+
162+
def chunk(self, text: str) -> list[str]:
163+
"""Chunk a text."""
164+
165+
return chunk(text, self.chunk_size, self.token_counter, memoize = False)
166+
167+
def __call__(
168+
self,
169+
text_or_texts: str | Sequence[str],
170+
processes: int = 1,
171+
progress: bool = False,
172+
) -> list[str] | list[list[str]]:
173+
"""Split text or texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
174+
175+
Args:
176+
text_or_texts (str | Sequence[str]): The text or texts to be chunked.
177+
processes (int, optional): The number of processes to use when chunking multiple texts. Defaults to `1` in which case chunking will occur in the main process.
178+
progress (bool, optional): Whether to display a progress bar when chunking multiple texts. Defaults to `False`.
179+
180+
Returns:
181+
list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts."""
182+
if isinstance(text_or_texts, str):
183+
return self.chunk(text_or_texts)
184+
185+
if progress and processes == 1:
186+
text_or_texts = tqdm(text_or_texts)
187+
188+
if processes == 1:
189+
return [self.chunk(text) for text in text_or_texts]
190+
191+
with mpire.WorkerPool(processes, use_dill = True) as pool:
192+
return pool.map(self.chunk, text_or_texts, progress_bar = progress)
193+
154194
def chunkerify(
155195
tokenizer_or_token_counter: str | tiktoken.Encoding | transformers.PreTrainedTokenizer \
156196
| tokenizers.Tokenizer | Callable[[str], int],
157-
chunk_size: int = None,
158-
max_token_chars: int = None,
197+
chunk_size: int | None = None,
198+
max_token_chars: int | None = None,
159199
memoize: bool = True,
160-
): # NOTE The output of `chunkerify()` is not type hinted because it causes `vscode` to overwrite the signature and docstring of the outputted chunker with the type hint.
200+
) -> Chunker:
161201
"""Construct a chunker that splits one or more texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
162202
163203
Args:
@@ -167,11 +207,13 @@ def chunkerify(
167207
memoize (bool, optional): Whether to memoize the token counter. Defaults to `True`.
168208
169209
Returns:
170-
Callable[[str | Sequence[str], bool, bool], list[str] | list[list[str]]]: A function that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
210+
Chunker: A chunker that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
211+
212+
The resulting chunker can be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
171213
172-
The resulting chunker function can also be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
214+
It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar.
173215
174-
It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar."""
216+
Technically, the chunker will be an instance of the `semchunk.Chunker` class to assist with type hinting, though this should have no impact on how it can be used."""
175217

176218
# If the provided tokenizer is a string, try to load it with either `tiktoken` or `transformers` or raise an error if neither is available.
177219
if isinstance(tokenizer_or_token_counter, str):
@@ -254,36 +296,5 @@ def faster_token_counter(text: str) -> int:
254296
if memoize:
255297
token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter))
256298

257-
# Construct a chunking function that passes the chunk size and token counter to `chunk()`.
258-
def chunking_function(text: str) -> list[str]:
259-
return chunk(text, chunk_size, token_counter, memoize = False)
260-
261299
# Construct and return the chunker.
262-
def chunker(
263-
text_or_texts: str | Sequence[str],
264-
processes: int = 1,
265-
progress: bool = False,
266-
) -> list[str] | list[list[str]]:
267-
"""Split text or texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
268-
269-
Args:
270-
text_or_texts (str | Sequence[str]): The text or texts to be chunked.
271-
272-
Returns:
273-
list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
274-
processes (int, optional): The number of processes to use when chunking multiple texts. Defaults to `1` in which case chunking will occur in the main process.
275-
progress (bool, optional): Whether to display a progress bar when chunking multiple texts. Defaults to `False`."""
276-
277-
if isinstance(text_or_texts, str):
278-
return chunking_function(text_or_texts)
279-
280-
if progress and processes == 1:
281-
text_or_texts = tqdm(text_or_texts)
282-
283-
if processes == 1:
284-
return [chunking_function(text) for text in text_or_texts]
285-
286-
with mpire.WorkerPool(processes, use_dill = True) as pool:
287-
return pool.map(chunking_function, text_or_texts, progress_bar = progress)
288-
289-
return chunker
300+
return Chunker(chunk_size, token_counter)

0 commit comments

Comments
 (0)