Skip to content

Commit a979e91

Browse files
Copilot and pamelafox committed
Fix tiktoken import issue causing all tests to fail
Co-authored-by: pamelafox <[email protected]>
1 parent 84708cf commit a979e91

File tree

1 file changed

+10
-2
lines changed

1 file changed

+10
-2
lines changed

app/backend/prepdocslib/textsplitter.py

Lines changed: 10 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -76,7 +76,15 @@ def split_pages(self, pages: list[Page]) -> Generator[SplitPage, None, None]:
7676
CJK_SENTENCE_ENDINGS = ["。", "!", "?", "‼", "⁇", "⁈", "⁉"]
7777

7878
# NB: text-embedding-3-XX is the same BPE as text-embedding-ada-002
79-
bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
79+
_bpe = None
80+
81+
82+
def get_encoding():
83+
"""Get the tiktoken encoding, loading it lazily when first needed."""
84+
global _bpe
85+
if _bpe is None:
86+
_bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
87+
return _bpe
8088

8189
DEFAULT_OVERLAP_PERCENT = 10 # See semantic search article for 10% overlap performance
8290
DEFAULT_SECTION_LENGTH = 1000 # Roughly 400-500 tokens for English
@@ -99,7 +107,7 @@ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitP
99107
"""
100108
Recursively splits page by maximum number of tokens to better handle languages with higher token/word ratios.
101109
"""
102-
tokens = bpe.encode(text)
110+
tokens = get_encoding().encode(text)
103111
if len(tokens) <= self.max_tokens_per_section:
104112
# Section is already within max tokens, return
105113
yield SplitPage(page_num=page_num, text=text)

0 commit comments

Comments
 (0)