
Commit b04fc66

Improve text splitter for non-English documents (#1326)
* Add tests to verify that tokens in each section can never be above 500 tokens
* Improve the test so we parametrize it for each PDF instead of failing on the first one.
* Add Arabic book
* Formatting fixes
* Verbose assertion messages with the file name
* Implement a recursive splitter when sections are too large.
* 5 percent overlap with recursive splitter
* Resolve some of the PR feedback
* Make the tests use the text splitter's max tokens per section instead of hardcoded values
* Find a better split position in the central third of the text
* Correct the boundary check and fix the position to come after the full stops
* Remove some silly line breaks
* Update import
* Reformatting
* Make the overlap percent and section size defaults module-level constants
* Reformatted PDFs using the online PDF parser for better accessibility
* Add an RTL test
* Add a Korean test file about a mouse that goes to the big city. Add a test for table overlapping.
* Add a snapshot of the content sections from the test data PDFs
* Fix my formatting
* Sort the keys first

---------

Co-authored-by: Pamela Fox <[email protected]>
1 parent afbe70c commit b04fc66

File tree

10 files changed: +834 −10 lines changed

requirements-dev.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -9,6 +9,7 @@ coverage
 playwright
 pytest-cov
 pytest-playwright
+pytest-snapshot
 pre-commit
 locust
 pip-tools
```
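The new pytest-snapshot dependency backs the "snapshot of the content sections" tests mentioned in the commit message. A minimal sketch of how such a snapshot test could look, assuming a `Page(page_num, offset, text)` constructor from the adjacent page module; the test name, import paths, and sample text are illustrative, not the repo's actual test code:

```python
import json

from prepdocslib.page import Page
from prepdocslib.textsplitter import SentenceTextSplitter


def test_split_pages_matches_snapshot(snapshot):
    splitter = SentenceTextSplitter(has_image_embeddings=False)
    pages = [Page(page_num=0, offset=0, text="A sentence about splitting. " * 100)]
    sections = [sp.text for sp in splitter.split_pages(pages)]
    # pytest-snapshot's `snapshot` fixture compares the value against a stored
    # file; regenerate the stored snapshot with: pytest --snapshot-update
    snapshot.assert_match(json.dumps(sections, ensure_ascii=False, indent=2), "sections.json")
```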

scripts/prepdocslib/textsplitter.py

Lines changed: 108 additions & 8 deletions
```diff
@@ -1,6 +1,8 @@
 from abc import ABC
 from typing import Generator, List
 
+import tiktoken
+
 from .page import Page, SplitPage
 
 
```
```diff
@@ -16,20 +18,118 @@ def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]:
         yield  # pragma: no cover - this is necessary for mypy to type check
 
 
+ENCODING_MODEL = "text-embedding-ada-002"
+
+STANDARD_WORD_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
+
+# See W3C document https://www.w3.org/TR/jlreq/#cl-01
+CJK_WORD_BREAKS = [
+    "、",
+    ",",
+    ";",
+    ":",
+    "(",
+    ")",
+    "【",
+    "】",
+    "「",
+    "」",
+    "『",
+    "』",
+    "〔",
+    "〕",
+    "〈",
+    "〉",
+    "《",
+    "》",
+    "〖",
+    "〗",
+    "〘",
+    "〙",
+    "〚",
+    "〛",
+    "〝",
+    "〞",
+    "〟",
+    "〰",
+    "–",
+    "—",
+    "‘",
+    "’",
+    "‚",
+    "‛",
+    "“",
+    "”",
+    "„",
+    "‟",
+    "‹",
+    "›",
+]
+
+STANDARD_SENTENCE_ENDINGS = [".", "!", "?"]
+
+# See CL05 and CL06, based on JIS X 4051:2004
+# https://www.w3.org/TR/jlreq/#cl-04
+CJK_SENTENCE_ENDINGS = ["。", "!", "?", "‼", "⁇", "⁈", "⁉"]
+
+# NB: text-embedding-3-XX is the same BPE as text-embedding-ada-002
+bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
+
+DEFAULT_OVERLAP_PERCENT = 10  # See semantic search article for 10% overlap performance
+DEFAULT_SECTION_LENGTH = 1000  # Roughly 400-500 tokens for English
+
+
 class SentenceTextSplitter(TextSplitter):
     """
     Class that splits pages into smaller chunks. This is required because embedding models may not be able to analyze an entire page at once
     """
 
-    def __init__(self, has_image_embeddings: bool, verbose: bool = False):
-        self.sentence_endings = [".", "!", "?"]
-        self.word_breaks = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
-        self.max_section_length = 1000
+    def __init__(self, has_image_embeddings: bool, verbose: bool = False, max_tokens_per_section: int = 500):
+        self.sentence_endings = STANDARD_SENTENCE_ENDINGS + CJK_SENTENCE_ENDINGS
+        self.word_breaks = STANDARD_WORD_BREAKS + CJK_WORD_BREAKS
+        self.max_section_length = DEFAULT_SECTION_LENGTH
         self.sentence_search_limit = 100
-        self.section_overlap = 100
+        self.max_tokens_per_section = max_tokens_per_section
+        # Overlap is DEFAULT_OVERLAP_PERCENT% of the section length (100 chars by default)
+        self.section_overlap = self.max_section_length * DEFAULT_OVERLAP_PERCENT // 100
         self.verbose = verbose
         self.has_image_embeddings = has_image_embeddings
 
+    def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]:
+        """
+        Recursively splits page by maximum number of tokens to better handle languages with higher token/word ratios.
+        """
+        tokens = bpe.encode(text)
+        if len(tokens) <= self.max_tokens_per_section:
+            # Section is already within max tokens, return
+            yield SplitPage(page_num=page_num, text=text)
+        else:
+            # Start from the center and try to find the closest sentence ending by spiralling outward.
+            # If we reach the outer thirds, just split in half with a roughly 5% overlap.
+            start = int(len(text) // 2)
+            pos = 0
+            boundary = int(len(text) // 3)
+            split_position = -1
+            while start - pos > boundary:
+                if text[start - pos] in self.sentence_endings:
+                    split_position = start - pos
+                    break
+                elif text[start + pos] in self.sentence_endings:
+                    split_position = start + pos
+                    break
+                else:
+                    pos += 1
+
+            if split_position > 0:
+                # Split just after the sentence ending, so it stays with the first half
+                first_half = text[: split_position + 1]
+                second_half = text[split_position + 1 :]
+            else:
+                # No sentence ending found in the central third: split the page in half
+                # and recurse, extending each half past the midpoint so the halves
+                # overlap by about half of DEFAULT_OVERLAP_PERCENT
+                first_half = text[: int(len(text) // (2.0 - (DEFAULT_OVERLAP_PERCENT / 100)))]
+                second_half = text[int(len(text) // (2.0 + (DEFAULT_OVERLAP_PERCENT / 100))) :]
+            yield from self.split_page_by_max_tokens(page_num, first_half)
+            yield from self.split_page_by_max_tokens(page_num, second_half)
+
     def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]:
         # Chunking is disabled when using GPT4V. To be updated in the future.
         if self.has_image_embeddings:
```
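The token-based cap matters because max_section_length is measured in characters, and the rough "1000 characters ≈ 400-500 tokens" rule only holds for English; CJK scripts pack far more tokens into the same character count. A quick, illustrative check with the same encoding the module loads (the sample strings are arbitrary and the counts approximate):

```python
import tiktoken

bpe = tiktoken.encoding_for_model("text-embedding-ada-002")

english = "The quick brown fox jumps over the lazy dog."
japanese = "素早い茶色の狐がのろまな犬を飛び越える。"

# English averages roughly one token per short word (~2 tokens per 10 chars).
# CJK text often encodes to one or more tokens per character, so a
# 1000-character section can blow well past a 500-token budget.
print(len(english), len(bpe.encode(english)))
print(len(japanese), len(bpe.encode(japanese)))
```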
```diff
@@ -49,7 +149,7 @@ def find_page(offset):
 
         length = len(all_text)
         if length <= self.max_section_length:
-            yield SplitPage(page_num=find_page(0), text=all_text)
+            yield from self.split_page_by_max_tokens(page_num=find_page(0), text=all_text)
             return
 
         start = 0
```
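With every section in split_pages now routed through split_page_by_max_tokens, the invariant the new tests assert (per the commit message) is that no emitted section exceeds the token cap. A hedged sketch of that check, using an arbitrary repeated CJK sentence to force recursive splitting; the test name and import path are illustrative:

```python
import tiktoken

from prepdocslib.textsplitter import SentenceTextSplitter

bpe = tiktoken.encoding_for_model("text-embedding-ada-002")


def test_no_section_exceeds_max_tokens():
    splitter = SentenceTextSplitter(has_image_embeddings=False, max_tokens_per_section=500)
    long_text = "これは長い文章を分割するためのテストです。" * 400
    # Every recursively split piece must stay within the token budget.
    for split_page in splitter.split_page_by_max_tokens(page_num=0, text=long_text):
        assert len(bpe.encode(split_page.text)) <= splitter.max_tokens_per_section
```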
```diff
@@ -91,7 +191,7 @@ def find_page(offset):
                 start += 1
 
             section_text = all_text[start:end]
-            yield SplitPage(page_num=find_page(start), text=section_text)
+            yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text)
 
             last_table_start = section_text.rfind("<table")
             if last_table_start > 2 * self.sentence_search_limit and last_table_start > section_text.rfind("</table"):
```
```diff
@@ -107,7 +207,7 @@ def find_page(offset):
                 start = end - self.section_overlap
 
         if start + self.section_overlap < end:
-            yield SplitPage(page_num=find_page(start), text=all_text[start:end])
+            yield from self.split_page_by_max_tokens(page_num=find_page(start), text=all_text[start:end])
 
 
 class SimpleTextSplitter(TextSplitter):
```
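For context, a minimal end-to-end sketch of driving the splitter, again assuming a `Page(page_num, offset, text)` constructor as suggested by the import at the top of the diff; paths and sample pages are illustrative:

```python
from prepdocslib.page import Page
from prepdocslib.textsplitter import SentenceTextSplitter

splitter = SentenceTextSplitter(has_image_embeddings=False, max_tokens_per_section=500)

pages = [
    Page(page_num=0, offset=0, text="First page. " * 150),
    Page(page_num=1, offset=1800, text="Second page. " * 150),
]

# Sections longer than max_section_length are windowed with section_overlap,
# then each window is recursively re-split until it fits the token cap.
for split_page in splitter.split_pages(pages):
    print(split_page.page_num, len(split_page.text))
```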
