diff --git a/.gitignore b/.gitignore index 185ad0f3ef..9b721eb2f9 100644 --- a/.gitignore +++ b/.gitignore @@ -111,6 +111,7 @@ celerybeat.pid # Environments .env .venv +.venv_* .evalenv env/ venv/ diff --git a/app/backend/prepdocslib/textsplitter.py b/app/backend/prepdocslib/textsplitter.py index 1beedf86a1..ec6b6af669 100644 --- a/app/backend/prepdocslib/textsplitter.py +++ b/app/backend/prepdocslib/textsplitter.py @@ -76,7 +76,15 @@ def split_pages(self, pages: list[Page]) -> Generator[SplitPage, None, None]: CJK_SENTENCE_ENDINGS = ["。", "!", "?", "‼", "⁇", "⁈", "⁉"] # NB: text-embedding-3-XX is the same BPE as text-embedding-ada-002 -bpe = tiktoken.encoding_for_model(ENCODING_MODEL) +_bpe = None + + +def get_encoding(): + """Get the tiktoken encoding, loading it lazily when first needed.""" + global _bpe + if _bpe is None: + _bpe = tiktoken.encoding_for_model(ENCODING_MODEL) + return _bpe DEFAULT_OVERLAP_PERCENT = 10 # See semantic search article for 10% overlap performance DEFAULT_SECTION_LENGTH = 1000 # Roughly 400-500 tokens for English @@ -99,7 +107,7 @@ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitP """ Recursively splits page by maximum number of tokens to better handle languages with higher token/word ratios. """ - tokens = bpe.encode(text) + tokens = get_encoding().encode(text) if len(tokens) <= self.max_tokens_per_section: # Section is already within max tokens, return yield SplitPage(page_num=page_num, text=text) diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index 4029e3338f..fe75fc9413 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements.in -o requirements.txt --python-version 3.9 +# uv pip compile requirements.in -o requirements_updated.txt --python-version 3.9 aiofiles==24.1.0 # via # prompty @@ -14,11 +14,11 @@ aiosignal==1.4.0 # via aiohttp annotated-types==0.7.0 # via pydantic -anyio==4.4.0 +anyio==4.9.0 # via # httpx # openai -asgiref==3.8.1 +asgiref==3.9.1 # via opentelemetry-instrumentation-asgi async-timeout==5.0.1 # via aiohttp @@ -26,13 +26,14 @@ attrs==25.3.0 # via aiohttp azure-ai-documentintelligence==1.0.0b4 # via -r requirements.in -azure-cognitiveservices-speech==1.40.0 +azure-cognitiveservices-speech==1.45.0 # via -r requirements.in azure-common==1.1.28 # via azure-search-documents -azure-core==1.30.2 +azure-core==1.35.0 # via # azure-ai-documentintelligence + # azure-cognitiveservices-speech # azure-core-tracing-opentelemetry # azure-cosmos # azure-identity @@ -43,56 +44,57 @@ azure-core==1.30.2 # azure-storage-file-datalake # microsoft-kiota-authentication-azure # msrest -azure-core-tracing-opentelemetry==1.0.0b11 +azure-core-tracing-opentelemetry==1.0.0b12 # via azure-monitor-opentelemetry azure-cosmos==4.9.0 # via -r requirements.in -azure-identity==1.17.1 +azure-identity==1.23.1 # via # -r requirements.in + # azure-monitor-opentelemetry-exporter # msgraph-sdk -azure-monitor-opentelemetry==1.6.1 +azure-monitor-opentelemetry==1.6.12 # via -r requirements.in -azure-monitor-opentelemetry-exporter==1.0.0b32 +azure-monitor-opentelemetry-exporter==1.0.0b40 # via azure-monitor-opentelemetry azure-search-documents==11.6.0b12 # via -r requirements.in -azure-storage-blob==12.22.0 +azure-storage-blob==12.26.0 # via # -r requirements.in # azure-storage-file-datalake -azure-storage-file-datalake==12.16.0 +azure-storage-file-datalake==12.21.0 # via -r requirements.in -beautifulsoup4==4.12.3 +beautifulsoup4==4.13.4 # via -r requirements.in -blinker==1.8.2 +blinker==1.9.0 # via # flask # quart -certifi==2024.7.4 +certifi==2025.7.14 # via # httpcore # httpx # msrest # requests -cffi==1.17.0 +cffi==1.17.1 # via cryptography -charset-normalizer==3.3.2 +charset-normalizer==3.4.2 # via requests -click==8.1.7 +click==8.1.8 # via # flask # prompty # quart # uvicorn -cryptography==44.0.1 +cryptography==45.0.5 # via # -r requirements.in # azure-identity # azure-storage-blob # msal # pyjwt -deprecated==1.2.14 +deprecated==1.2.18 # via # opentelemetry-api # opentelemetry-semantic-conventions @@ -105,9 +107,9 @@ exceptiongroup==1.3.0 # taskgroup fixedint==0.1.6 # via azure-monitor-opentelemetry-exporter -flask==3.0.3 +flask==3.1.1 # via quart -frozenlist==1.4.1 +frozenlist==1.7.0 # via # aiohttp # aiosignal @@ -117,22 +119,22 @@ h11==0.16.0 # hypercorn # uvicorn # wsproto -h2==4.1.0 +h2==4.2.0 # via # httpx # hypercorn -hpack==4.0.0 +hpack==4.1.0 # via h2 httpcore==1.0.9 # via httpx -httpx[http2]==0.27.0 +httpx==0.28.1 # via # microsoft-kiota-http # msgraph-core # openai hypercorn==0.17.3 # via quart -hyperframe==6.0.1 +hyperframe==6.1.0 # via h2 idna==3.10 # via @@ -140,12 +142,12 @@ idna==3.10 # httpx # requests # yarl -importlib-metadata==8.0.0 +importlib-metadata==8.6.1 # via # flask # opentelemetry-api # quart -isodate==0.6.1 +isodate==0.7.2 # via # azure-ai-documentintelligence # azure-search-documents @@ -161,18 +163,19 @@ jinja2==3.1.6 # flask # prompty # quart -jiter==0.8.2 +jiter==0.10.0 # via openai markdown-it-py==3.0.0 # via rich -markupsafe==2.1.5 +markupsafe==3.0.2 # via + # flask # jinja2 # quart # werkzeug mdurl==0.1.2 # via markdown-it-py -microsoft-kiota-abstractions==1.9.3 +microsoft-kiota-abstractions==1.9.5 # via # microsoft-kiota-authentication-azure # microsoft-kiota-http @@ -181,38 +184,38 @@ microsoft-kiota-abstractions==1.9.3 # microsoft-kiota-serialization-multipart # microsoft-kiota-serialization-text # msgraph-core -microsoft-kiota-authentication-azure==1.9.3 +microsoft-kiota-authentication-azure==1.9.5 # via msgraph-core -microsoft-kiota-http==1.9.3 +microsoft-kiota-http==1.9.5 # via msgraph-core -microsoft-kiota-serialization-form==1.9.3 +microsoft-kiota-serialization-form==1.9.5 # via msgraph-sdk -microsoft-kiota-serialization-json==1.9.3 +microsoft-kiota-serialization-json==1.9.5 # via msgraph-sdk -microsoft-kiota-serialization-multipart==1.9.3 +microsoft-kiota-serialization-multipart==1.9.5 # via msgraph-sdk -microsoft-kiota-serialization-text==1.9.3 +microsoft-kiota-serialization-text==1.9.5 # via msgraph-sdk -msal==1.30.0 +msal==1.33.0 # via # -r requirements.in # azure-identity # msal-extensions msal-extensions==1.3.1 # via azure-identity -msgraph-core==1.3.3 +msgraph-core==1.3.5 # via msgraph-sdk -msgraph-sdk==1.26.0 +msgraph-sdk==1.39.0 # via -r requirements.in msrest==0.7.1 # via azure-monitor-opentelemetry-exporter -multidict==6.0.5 +multidict==6.6.3 # via # aiohttp # yarl -oauthlib==3.2.2 +oauthlib==3.3.1 # via requests-oauthlib -openai==1.63.0 +openai==1.97.1 # via -r requirements.in opentelemetry-api==1.31.1 # via @@ -268,7 +271,7 @@ opentelemetry-instrumentation-flask==0.52b1 # via azure-monitor-opentelemetry opentelemetry-instrumentation-httpx==0.52b1 # via -r requirements.in -opentelemetry-instrumentation-openai==0.39.0 +opentelemetry-instrumentation-openai==0.43.1 # via -r requirements.in opentelemetry-instrumentation-psycopg2==0.52b1 # via azure-monitor-opentelemetry @@ -308,7 +311,7 @@ opentelemetry-semantic-conventions==0.52b1 # opentelemetry-instrumentation-urllib3 # opentelemetry-instrumentation-wsgi # opentelemetry-sdk -opentelemetry-semantic-conventions-ai==0.4.3 +opentelemetry-semantic-conventions-ai==0.4.11 # via opentelemetry-instrumentation-openai opentelemetry-util-http==0.52b1 # via @@ -322,37 +325,37 @@ opentelemetry-util-http==0.52b1 # opentelemetry-instrumentation-urllib # opentelemetry-instrumentation-urllib3 # opentelemetry-instrumentation-wsgi -packaging==24.1 +packaging==25.0 # via # opentelemetry-instrumentation # opentelemetry-instrumentation-flask -pillow==10.4.0 +pillow==11.3.0 # via -r requirements.in priority==2.0.0 # via hypercorn prompty==0.1.50 # via -r requirements.in -propcache==0.2.0 +propcache==0.3.2 # via # aiohttp # yarl -psutil==5.9.8 +psutil==7.0.0 # via azure-monitor-opentelemetry-exporter pycparser==2.22 # via cffi -pydantic==2.8.2 +pydantic==2.11.7 # via openai -pydantic-core==2.20.1 +pydantic-core==2.33.2 # via pydantic -pygments==2.18.0 +pygments==2.19.2 # via rich -pyjwt[crypto]==2.10.1 +pyjwt==2.10.1 # via # -r requirements.in # msal -pymupdf==1.26.0 +pymupdf==1.26.3 # via -r requirements.in -pypdf==4.3.1 +pypdf==5.9.0 # via -r requirements.in python-dotenv==1.1.1 # via @@ -364,7 +367,7 @@ quart==0.20.0 # via # -r requirements.in # quart-cors -quart-cors==0.7.0 +quart-cors==0.8.0 # via -r requirements.in regex==2024.11.6 # via tiktoken @@ -377,16 +380,13 @@ requests==2.32.4 # tiktoken requests-oauthlib==2.0.0 # via msrest -rich==13.9.4 +rich==14.1.0 # via -r requirements.in -six==1.16.0 - # via - # azure-core - # isodate +six==1.17.0 + # via azure-core sniffio==1.3.1 # via # anyio - # httpx # openai soupsieve==2.7 # via beautifulsoup4 @@ -394,23 +394,23 @@ std-uritemplate==2.0.5 # via microsoft-kiota-abstractions taskgroup==0.2.2 # via hypercorn -tenacity==9.0.0 +tenacity==9.1.2 # via -r requirements.in -tiktoken==0.8.0 +tiktoken==0.9.0 # via # -r requirements.in # opentelemetry-instrumentation-openai tomli==2.2.1 # via hypercorn -tqdm==4.66.5 +tqdm==4.67.1 # via openai -types-beautifulsoup4==4.12.0.20240511 +types-beautifulsoup4==4.12.0.20250516 # via -r requirements.in -types-html5lib==1.1.11.20241018 +types-html5lib==1.1.11.20250708 # via types-beautifulsoup4 types-pillow==10.2.0.20240822 # via -r requirements.in -typing-extensions==4.13.2 +typing-extensions==4.14.1 # via # -r requirements.in # aiosignal @@ -423,8 +423,10 @@ typing-extensions==4.13.2 # azure-search-documents # azure-storage-blob # azure-storage-file-datalake + # beautifulsoup4 # exceptiongroup # hypercorn + # multidict # openai # opentelemetry-sdk # pydantic @@ -432,18 +434,20 @@ typing-extensions==4.13.2 # pypdf # quart # quart-cors - # rich # taskgroup + # typing-inspection # uvicorn +typing-inspection==0.4.1 + # via pydantic urllib3==2.5.0 # via requests -uvicorn==0.30.6 +uvicorn==0.35.0 # via -r requirements.in -werkzeug==3.0.6 +werkzeug==3.1.3 # via # flask # quart -wrapt==1.16.0 +wrapt==1.17.2 # via # deprecated # opentelemetry-instrumentation @@ -453,7 +457,7 @@ wrapt==1.16.0 # opentelemetry-instrumentation-urllib3 wsproto==1.2.0 # via hypercorn -yarl==1.17.2 +yarl==1.20.1 # via aiohttp -zipp==3.21.0 +zipp==3.23.0 # via importlib-metadata