Skip to content

Commit a979e91

Browse files
Copilot and pamelafox committed
Fix tiktoken import issue causing all tests to fail
Co-authored-by: pamelafox <[email protected]>
1 parent 84708cf commit a979e91

File tree

1 file changed

+10
-2
lines changed

1 file changed

+10
-2
lines changed

app/backend/prepdocslib/textsplitter.py

Lines changed: 10 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -76,7 +76,15 @@ def split_pages(self, pages: list[Page]) -> Generator[SplitPage, None, None]:
7676
CJK_SENTENCE_ENDINGS = ["。", "!", "?", "‼", "⁇", "⁈", "⁉"]
7777

7878
# NB: text-embedding-3-XX is the same BPE as text-embedding-ada-002
79-
bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
79+
_bpe = None
80+
81+
82+
def get_encoding():
83+
"""Get the tiktoken encoding, loading it lazily when first needed."""
84+
global _bpe
85+
if _bpe is None:
86+
_bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
87+
return _bpe
8088

8189
DEFAULT_OVERLAP_PERCENT = 10 # See semantic search article for 10% overlap performance
8290
DEFAULT_SECTION_LENGTH = 1000 # Roughly 400-500 tokens for English
@@ -99,7 +107,7 @@ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitP
99107
"""
100108
Recursively splits page by maximum number of tokens to better handle languages with higher token/word ratios.
101109
"""
102-
tokens = bpe.encode(text)
110+
tokens = get_encoding().encode(text)
103111
if len(tokens) <= self.max_tokens_per_section:
104112
# Section is already within max tokens, return
105113
yield SplitPage(page_num=page_num, text=text)

0 commit comments

Comments
 (0)