Commit f8a6482

Merge branch 'pr/3' into dev
2 parents: 7e8f62d + 1e3ddb9

4 files changed: +47 -24 lines

.gitignore

Lines changed: 1 addition & 2 deletions

@@ -15,11 +15,10 @@
 !.gitignore
 
 # Finally, exclude anything in the above inclusions that we don't want.
-# Exclude common Python files and folders.
 *.pyc
 *.pyo
 *.ipynb
 __pycache__/
 .pytest_cache/
 tests/profiler.py
-tests/test_bench.py
+tests/test_bench.py

CHANGELOG.md

Lines changed: 4 additions & 0 deletions

@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased] - 2024-XX-XX
+### Changed
+- Improved chunking performance with larger chunk sizes by switching from linear to binary search for the identification of optimal chunk boundaries.
+
 ## [0.2.3] - 2024-03-11
 ### Fixed
 - Ensured that memoization does not overwrite `chunk()`'s function signature.
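The "linear to binary search" change above is the heart of this commit. As a rough illustration of why it matters (a toy sketch, not semchunk's code; the real `merge_splits` below also uses cumulative character lengths to pick its probe points), compare how many token-counter calls each strategy needs to find the largest prefix of splits that fits a chunk:

```python
# Find the largest prefix of `splits` that fits within `chunk_size` tokens,
# counting how many times the token counter is invoked along the way.
splits = ['lorem'] * 1024
chunk_size = 100
calls = 0

def count_tokens(text: str) -> int:
    """A stand-in token counter: one token per whitespace-delimited word."""
    global calls
    calls += 1
    return len(text.split())

# Linear merging (the old approach): one counter call per split admitted.
prefix = 0
while prefix < len(splits) and count_tokens(' '.join(splits[:prefix + 1])) <= chunk_size:
    prefix += 1
print(prefix, calls)  # 100 101

# Binary search over prefix lengths (the idea behind the new approach).
calls = 0
low, high = 0, len(splits)
while low < high:
    mid = (low + high + 1) // 2
    if count_tokens(' '.join(splits[:mid])) <= chunk_size:
        low = mid
    else:
        high = mid - 1
print(low, calls)  # 100 10
```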

README.md

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 
 `semchunk` is a fast and lightweight pure Python library for splitting text into semantically meaningful chunks.
 
-Owing to its complex yet highly efficient chunking algorithm, `semchunk` is both more semantically accurate than [`langchain.text_splitter.RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter) (see [How It Works 🔍](https://github.com/umarbutler/semchunk#how-it-works-)) and is also over 70% faster than [`semantic-text-splitter`](https://pypi.org/project/semantic-text-splitter/) (see the [Benchmarks 📊](https://github.com/umarbutler/semchunk#benchmarks-)).
+Owing to its complex yet highly efficient chunking algorithm, `semchunk` is both more semantically accurate than [`langchain.text_splitter.RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter) (see [How It Works 🔍](https://github.com/umarbutler/semchunk#how-it-works-)) and is also over 80% faster than [`semantic-text-splitter`](https://pypi.org/project/semantic-text-splitter/) (see the [Benchmarks 📊](https://github.com/umarbutler/semchunk#benchmarks-)).
 
 ## Installation 📦
 `semchunk` may be installed with `pip`:

src/semchunk/semchunk.py

Lines changed: 41 additions & 21 deletions

@@ -1,5 +1,10 @@
 import re
+
+from bisect import bisect_left
+from typing import Callable
 from functools import cache, wraps
+from itertools import accumulate
+
 
 _memoised_token_counters = {}
 """A map of token counters to their memoised versions."""
@@ -45,7 +50,33 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]:
     # Return the splitter and the split text.
     return splitter, splitter_is_whitespace, text.split(splitter)
 
-def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]:
+
+def merge_splits(splits: list[str], chunk_size: int, splitter: str, token_counter: Callable) -> tuple[int, str]:
+    """Merge splits until a chunk size is reached, returning the index of the last split included in the merged chunk along with the merged chunk itself."""
+    
+    average = 0.2
+    low = 0
+    high = len(splits) + 1
+    cumulative_lengths = tuple(accumulate(map(len, splits), initial=0))
+    cumulative_lengths += (cumulative_lengths[-1],)
+    
+    while low < high:
+        i = bisect_left(cumulative_lengths[low : high + 1], chunk_size * average)
+        midpoint = min(i + low, high - 1)
+        
+        tokens = token_counter(splitter.join(splits[:midpoint]))
+        
+        average = cumulative_lengths[midpoint] / tokens if cumulative_lengths[midpoint] else average
+        
+        if tokens > chunk_size:
+            high = midpoint
+        else:
+            low = midpoint + 1
+    
+    return low - 1, splitter.join(splits[:low - 1])
+
+
+def chunk(text: str, chunk_size: int, token_counter: Callable, memoize: bool = True, _recursion_depth: int = 0) -> list[str]:
     """Split text into semantically meaningful chunks of a specified size as determined by the provided token counter.
 
     Args:
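A usage sketch of the new helper with a toy whitespace token counter (`merge_splits` is an internal module-level function in `src/semchunk/semchunk.py` rather than documented API, so the import path here is an assumption):

```python
from semchunk.semchunk import merge_splits  # internal helper; import path assumed

def word_counter(text: str) -> int:
    """A toy token counter: one token per whitespace-delimited word."""
    return len(text.split())

splits = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

# Merge as many leading splits as fit within a 4-token budget.
i, merged = merge_splits(splits, chunk_size=4, splitter=' ', token_counter=word_counter)
print(i, repr(merged))  # 4 'The quick brown fox'
```

Note that the returned index is one past the last merged split (here `splits[:4]` were merged), which is what lets the caller below compute the exact range of split indices to skip.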
@@ -76,35 +107,23 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]:
 
         # If the split is over the chunk size, recursively chunk it.
         if token_counter(split) > chunk_size:
-            chunks.extend(chunk(split, chunk_size, token_counter=token_counter, memoize=memoize, _recursion_depth=_recursion_depth+1))
+            chunks.extend(chunk(split, chunk_size, token_counter = token_counter, memoize = memoize, _recursion_depth = _recursion_depth + 1))
 
-        # If the split is equal to or under the chunk size, merge it with all subsequent splits until the chunk size is reached.
+        # If the split is equal to or under the chunk size, add it and any subsequent splits to a new chunk until the chunk size is reached.
         else:
-            # Initalise the new chunk.
-            new_chunk = split
+            # Merge the split with subsequent splits until the chunk size is reached.
+            final_split_in_chunk_i, new_chunk = merge_splits(splits[i:], chunk_size, splitter, token_counter)
 
-            # Iterate through each subsequent split until the chunk size is reached.
-            for j, next_split in enumerate(splits[i+1:], start=i+1):
-                # Check whether the next split can be added to the chunk without exceeding the chunk size.
-                if token_counter(updated_chunk:=new_chunk+splitter+next_split) <= chunk_size:
-                    # Add the next split to the new chunk.
-                    new_chunk = updated_chunk
-
-                    # Add the index of the next split to the list of indices to skip.
-                    skips.add(j)
-
-                # If the next split cannot be added to the chunk without exceeding the chunk size, break.
-                else:
-                    break
+            # Mark any splits included in the new chunk for exclusion from future chunks.
+            skips.update(range(i + 1, i + final_split_in_chunk_i))
 
             # Add the chunk.
             chunks.append(new_chunk)
 
         # If the splitter is not whitespace and the split is not the last split, add the splitter to the end of the last chunk if doing so would not cause it to exceed the chunk size otherwise add the splitter as a new chunk.
-        if not splitter_is_whitespace and not (i == len(splits) - 1 or all(j in skips for j in range(i+1, len(splits)))):
-            if token_counter(last_chunk_with_splitter:=chunks[-1]+splitter) <= chunk_size:
+        if not splitter_is_whitespace and not (i == len(splits) - 1 or all(j in skips for j in range(i + 1, len(splits)))):
+            if token_counter(last_chunk_with_splitter := chunks[-1] + splitter) <= chunk_size:
                 chunks[-1] = last_chunk_with_splitter
-
             else:
                 chunks.append(splitter)
 
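End-to-end behaviour is unchanged by this refactor. A usage sketch with the documented `chunk()` signature and a toy token counter (real callers would typically pass a tokenizer-backed counter; the toy counter here is an assumption for illustration):

```python
import semchunk

def word_counter(text: str) -> int:
    """A toy token counter: one token per whitespace-delimited word."""
    return len(text.split())

text = 'The quick brown fox jumps over the lazy dog.'
chunks = semchunk.chunk(text, chunk_size=4, token_counter=word_counter)

# Whatever boundaries are chosen, every chunk respects the token budget.
assert all(word_counter(c) <= 4 for c in chunks)
```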
@@ -114,4 +133,5 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]:
 
     return chunks
 
+
 chunk = wraps(chunk)(cache(chunk))
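The final context line memoises `chunk()` itself. As a minimal sketch of this wraps-around-cache pattern in plain stdlib terms (generic Python, not semchunk-specific code), `functools.wraps` copies the wrapped function's metadata onto the cached wrapper; this is what the 0.2.3 changelog entry about not overwriting `chunk()`'s signature refers to:

```python
from functools import cache, wraps

def square(x: int) -> int:
    """Return x squared."""
    return x * x

# Memoise while keeping the original's name and docstring on the wrapper.
square = wraps(square)(cache(square))

assert square(3) == 9                        # computed once, cached thereafter
assert square.__name__ == 'square'
assert square.__doc__ == 'Return x squared.'
```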
