
Commit 5bd05cd

Switched to American spelling.
1 parent 3e61371 commit 5bd05cd

File tree

2 files changed: +10 -10 lines changed


README.md

Lines changed: 2 additions & 2 deletions
@@ -57,7 +57,7 @@ def chunkerify(
 
 `max_token_chars` is the maximum number of characters a token may contain. It is used to significantly speed up the token counting of long inputs. It defaults to `None`, in which case it will either not be used or will, if possible, be set to the number of characters in the longest token in the tokenizer's vocabulary as determined by the `token_byte_values` or `get_vocab` methods.
 
-`memoize` flags whether to memoise the token counter. It defaults to `True`.
+`memoize` flags whether to memoize the token counter. It defaults to `True`.
 
 This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
 
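For context, a minimal sketch of how the callable returned by `chunkerify` might be used, based on the behavior the README describes; the tokenizer name and sample texts here are illustrative assumptions, not taken from the commit:

    import semchunk

    # Build a chunker from a tokenizer name (illustrative); `chunk_size`
    # caps the number of tokens per chunk.
    chunker = semchunk.chunkerify('cl100k_base', chunk_size=128)

    # A single text yields a list of chunks...
    chunks = chunker('The quick brown fox jumps over the lazy dog.')

    # ...while a sequence of texts yields a list of lists of chunks, one
    # inner list per input text.
    chunks_per_text = chunker(['First document.', 'Second document.'])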

@@ -79,7 +79,7 @@ def chunk(
 
 `token_counter` is a callable that takes a string and returns the number of tokens in it.
 
-`memoize` flags whether to memoise the token counter. It defaults to `True`.
+`memoize` flags whether to memoize the token counter. It defaults to `True`.
 
 This function returns a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed.
 
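A minimal sketch of calling `chunk` directly with a custom token counter, matching the signature shown in the source diff below; the whitespace-word counter is a deliberately naive stand-in for a real tokenizer:

    import semchunk

    # Naive token counter for illustration: one token per
    # whitespace-delimited word.
    def word_counter(text: str) -> int:
        return len(text.split())

    chunks = semchunk.chunk(
        'The quick brown fox jumps over the lazy dog.',
        chunk_size=4,
        token_counter=word_counter,
    )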

src/semchunk/semchunk.py

Lines changed: 8 additions & 8 deletions
@@ -14,8 +14,8 @@
 import tokenizers
 import transformers
 
-_memoised_token_counters = {}
-"""A map of token counters to their memoised versions."""
+_memoized_token_counters = {}
+"""A map of token counters to their memoized versions."""
 
 _NON_WHITESPACE_SEMANTIC_SPLITTERS = (
     '.', '?', '!', '*', # Sentence terminators.
@@ -89,14 +89,14 @@ def chunk(text: str, chunk_size: int, token_counter: Callable[[str], int], memoi
         text (str): The text to be chunked.
         chunk_size (int): The maximum number of tokens a chunk may contain.
         token_counter (Callable[[str], int]): A callable that takes a string and returns the number of tokens in it.
-        memoize (bool, optional): Whether to memoise the token counter. Defaults to `True`.
+        memoize (bool, optional): Whether to memoize the token counter. Defaults to `True`.
 
     Returns:
         list[str]: A list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed."""
 
-    # If this is not a recursive call and memoization is enabled, overwrite the `token_counter` with a memoised version of itself.
+    # If this is not a recursive call and memoization is enabled, overwrite the `token_counter` with a memoized version of itself.
     if not _recursion_depth and memoize:
-        token_counter = _memoised_token_counters.setdefault(token_counter, cache(token_counter))
+        token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter))
 
     # Split the text using the most semantically meaningful splitter possible.
     splitter, splitter_is_whitespace, splits = _split_text(text)
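The `setdefault` pattern in the hunk above is worth unpacking: a module-level dict keyed by the original counter guarantees each counter is wrapped by `functools.cache` only once, so the wrapper's cache persists across calls to `chunk`. A standalone sketch of the same idiom (the names here are illustrative, not the library's):

    from functools import cache

    _wrappers = {}

    def memoized(counter):
        # `setdefault` inserts `cache(counter)` only on first sight of
        # `counter`; later lookups return the existing cached wrapper,
        # preserving its accumulated cache.
        return _wrappers.setdefault(counter, cache(counter))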
@@ -139,7 +139,7 @@ def chunk(text: str, chunk_size: int, token_counter: Callable[[str], int], memoi
 
     return chunks
 
-# Memoise the `chunk` function, preserving its signature and docstring.
+# Memoize the `chunk` function, preserving its signature and docstring.
 chunk = wraps(chunk)(cache(chunk))
 
 def chunkerify(
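The `wraps(chunk)(cache(chunk))` line in the hunk above composes two `functools` helpers by hand rather than with decorator syntax. A self-contained sketch of the same idiom on a hypothetical function:

    from functools import cache, wraps

    def square(n: int) -> int:
        """Return n squared."""
        return n * n

    # `cache(square)` builds a memoized wrapper; `wraps(square)` then
    # copies the original's name, docstring, and other metadata onto
    # that wrapper before it is rebound to the original name.
    square = wraps(square)(cache(square))

    assert square.__doc__ == 'Return n squared.'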
@@ -155,7 +155,7 @@ def chunkerify(
         tokenizer_or_token_counter (str | tiktoken.Encoding | transformers.PreTrainedTokenizer | tokenizers.Tokenizer | Callable[[str], int]): Either: the name of a `tiktoken` or `transformers` tokenizer (with priority given to the former); a tokenizer that possesses an `encode` attribute (eg, a `tiktoken`, `transformers` or `tokenizers` tokenizer); or a token counter that returns the number of tokens in an input.
         chunk_size (int, optional): The maximum number of tokens a chunk may contain. Defaults to `None`, in which case it will be set to the same value as the tokenizer's `model_max_length` attribute (reduced by the number of tokens returned by attempting to tokenize an empty string) if possible, otherwise a `ValueError` will be raised.
         max_token_chars (int, optional): The maximum number of characters a token may contain. Used to significantly speed up the token counting of long inputs. Defaults to `None`, in which case it will either not be used or will, if possible, be set to the number of characters in the longest token in the tokenizer's vocabulary as determined by the `token_byte_values` or `get_vocab` methods.
-        memoize (bool, optional): Whether to memoise the token counter. Defaults to `True`.
+        memoize (bool, optional): Whether to memoize the token counter. Defaults to `True`.
 
     Returns:
         Callable[[str | Sequence[str]], list[str] | list[list[str]]]: A function that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts."""
@@ -239,7 +239,7 @@ def faster_token_counter(text: str) -> int:
 
     # Memoize the token counter if necessary.
     if memoize:
-        token_counter = _memoised_token_counters.setdefault(token_counter, cache(token_counter))
+        token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter))
 
     # Construct and return the chunker.
     def chunker(text_or_texts: str | Sequence[str]) -> list[str] | list[list[str]]:
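The hunk header above references `faster_token_counter`, the helper behind `max_token_chars`. The commit does not show its body, but one way such a speedup can work is via a lower bound: if no token exceeds `max_token_chars` characters, a text of length L contains at least L / `max_token_chars` tokens, so very long texts can be short-circuited without tokenizing them in full. A sketch of that idea only, an assumption about the approach rather than the library's actual code:

    def make_faster_token_counter(token_counter, max_token_chars: int, chunk_size: int):
        def faster_token_counter(text: str) -> int:
            # Every token spans at most `max_token_chars` characters, so
            # this floor division is a valid lower bound on the count.
            lower_bound = len(text) // max_token_chars
            if lower_bound > chunk_size:
                # Already too large to fit in one chunk; the exact count
                # cannot matter, so skip tokenizing the whole text.
                return lower_bound
            return token_counter(text)
        return faster_token_counter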
