Fix all mypy issues

pamelafox · pamelafox · commit b1e622500c42 · 2025-06-30T23:28:51.000-07:00
diff --git a/app/backend/app.py b/app/backend/app.py
@@ -94,6 +94,7 @@
 from decorators import authenticated, authenticated_path
 from error import error_dict, error_response
 from prepdocs import (
+    OpenAIHost,
     clean_key_if_exists,
     setup_embeddings_service,
     setup_file_processors,
@@ -583,7 +584,7 @@ async def setup_clients():
         )
         text_embeddings_service = setup_embeddings_service(
             azure_credential=azure_credential,
-            openai_host=OPENAI_HOST,
+            openai_host=OpenAIHost(OPENAI_HOST),
             emb_model_name=OPENAI_EMB_MODEL,
             emb_model_dimensions=OPENAI_EMB_DIMENSIONS,
             azure_openai_service=AZURE_OPENAI_SERVICE,
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
@@ -188,17 +188,19 @@ def setup_embeddings_service(
 
 def setup_openai_client(
     openai_host: OpenAIHost,
+    azure_credential: AsyncTokenCredential,
     azure_openai_api_key: Union[str, None] = None,
     azure_openai_api_version: Union[str, None] = None,
     azure_openai_service: Union[str, None] = None,
     azure_openai_custom_url: Union[str, None] = None,
-    azure_credential: AsyncTokenCredential = None,
     openai_api_key: Union[str, None] = None,
     openai_organization: Union[str, None] = None,
 ):
     if openai_host not in OpenAIHost:
         raise ValueError(f"Invalid OPENAI_HOST value: {openai_host}. Must be one of {[h.value for h in OpenAIHost]}.")
 
+    openai_client: AsyncOpenAI
+
     if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]:
         if openai_host == OpenAIHost.AZURE_CUSTOM:
             logger.info("OPENAI_HOST is azure_custom, setting up Azure OpenAI custom client")
@@ -501,11 +503,11 @@ async def main(strategy: Strategy, setup_index: bool = True):
     )
     openai_client = setup_openai_client(
         openai_host=openai_host,
+        azure_credential=azd_credential,
         azure_openai_api_version=azure_openai_api_version,
         azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"),
         azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"),
         azure_openai_api_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"),
-        azure_credential=azd_credential,
         openai_api_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")),
         openai_organization=os.getenv("OPENAI_ORGANIZATION"),
     )
diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py
@@ -30,6 +30,8 @@ async def parse_file(
     pages = [page async for page in processor.parser.parse(content=file.content)]
     for page in pages:
         for image in page.images:
+            if not blob_manager or not image_embeddings_client:
+                raise ValueError("BlobManager and ImageEmbeddingsClient must be provided to parse images in the file.")
             if image.url is None:
                 image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename, image.page_num)
             if image_embeddings_client:
diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
@@ -298,7 +298,9 @@ def crop_image_from_pdf_page(
         """
         # Scale the bounding box to 72 DPI
         bbox_dpi = 72
-        bbox_pixels = tuple(x * bbox_dpi for x in bbox_inches)  # Convert to tuple
+        # Scale each coordinate, unpacking into four names so the resulting tuple has exactly four elements
+        x0, y0, x1, y1 = (x * bbox_dpi for x in bbox_inches)
+        bbox_pixels = (x0, y0, x1, y1)
         rect = pymupdf.Rect(bbox_pixels)
         # Assume that the PDF has 300 DPI,
         # and use the matrix to convert between the 2 DPIs
diff --git a/todo.txt b/todo.txt
@@ -3,8 +3,9 @@ TODO:
 * Fix/add unit tests - check coverage
 * mypy
 * Test with integrated vectorization
+* Test with user upload feature
 * Update all TODOs in the code/docs
-
+* Should I truncate the image_urls to "data:image/png;base64,asdsa..." for the JSON display?
 
 Decide:
 * In conftest, should I make a new env for vision? Currently I mashed it into the existing env, but it might be cleaner to have a separate one, as now I have to pass llm_inputs explicitly in the tests to turn off image responses.