examples: fix mypy errors across example entry points; add helper script and docs (#1091)

samojavo · samojavo · commit 7309315725e9 · 2025-11-03T21:38:31.000+01:00
diff --git a/dev/README.md b/dev/README.md
@@ -35,3 +35,26 @@ python dev/generate_cli_docs.py
 - `cocoindex` package must be importable (the CLI module)
 
 This ensures that CLI documentation is always kept in sync with the actual command-line interface.
+
+## Type-checking Examples
+
+We provide a helper script to run mypy on each example entry point individually with minimal assumptions about optional dependencies.
+
+### `mypy_check_examples.ps1`
+
+Runs mypy for every `main.py` (and `colpali_main.py`) under the `examples/` folder using these rules:
+
+- Only ignore missing imports (no broad suppressions)
+- Suppress errors reported inside imported modules (including CocoIndex internals) by setting `--follow-imports=silent`
+- Make CocoIndex sources discoverable via `MYPYPATH=python`
+
+Usage (Windows PowerShell):
+
+```powershell
+powershell -NoProfile -ExecutionPolicy Bypass -File dev/mypy_check_examples.ps1
+```
+
+Notes:
+
+- Ensure `mypy` is installed in the interpreter the script uses: the repo-local `.venv` (e.g. `pip install mypy` inside it) or, if `.venv\Scripts\python.exe` does not exist, the `python` on your `PATH`.
+- The script will report any failing example files and exit non-zero on failures.
diff --git a/dev/mypy_check_examples.ps1 b/dev/mypy_check_examples.ps1
@@ -0,0 +1,34 @@
+$ErrorActionPreference = 'Stop'  # make cmdlet errors terminating; mypy exit codes are checked explicitly below
+
+# Resolve the interpreter: prefer the repo-local venv, else fall back to `python` on PATH
+$repoRoot = Split-Path -Parent $PSScriptRoot
+$python = Join-Path $repoRoot '.venv\Scripts\python.exe'
+if (-not (Test-Path $python)) {
+  $python = 'python'
+}
+
+# Ensure mypy can resolve local cocoindex package sources (<repo>/python)
+$env:MYPYPATH = Join-Path $repoRoot 'python'
+
+# Collect example entry files (main.py / colpali_main.py), sorted for stable output
+$examples = Join-Path $repoRoot 'examples'
+$files = Get-ChildItem -Path $examples -Recurse -File |
+  Where-Object { $_.Name -in @('main.py','colpali_main.py') } |
+  Sort-Object FullName
+
+$failed = @()  # full paths of entry files for which mypy exited non-zero
+foreach ($f in $files) {
+  Write-Host (">>> Checking " + $f.FullName)
+  & $python -m mypy --ignore-missing-imports --follow-imports=silent $f.FullName
+  if ($LASTEXITCODE -ne 0) {
+    $failed += $f.FullName
+  }
+}
+
+if ($failed.Count -gt 0) {
+  Write-Host "`nFailures:"  # PowerShell's newline escape is `n; "\n" would be printed literally
+  $failed | ForEach-Object { Write-Host $_ }
+  exit 1
+} else {
+  Write-Host "`nAll example entry files passed mypy."
+}
diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py
@@ -4,6 +4,7 @@
 
 import cocoindex
 from markdown_it import MarkdownIt
+from typing import cast
 
 _markdown_it = MarkdownIt("gfm-like")
 
@@ -96,7 +97,7 @@ def mutate(
 
 @cocoindex.op.function()
 def markdown_to_html(text: str) -> str:
-    return _markdown_it.render(text)
+    return cast(str, _markdown_it.render(text))
 
 
 @cocoindex.flow_def(name="CustomOutputFiles")
diff --git a/examples/face_recognition/main.py b/examples/face_recognition/main.py
@@ -7,6 +7,7 @@
 import face_recognition
 import numpy as np
 from PIL import Image
+from typing import cast
 
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
 QDRANT_COLLECTION = "face_embeddings"
@@ -85,7 +86,7 @@ def extract_face_embedding(
         np.array(img),
         known_face_locations=[(0, img.width - 1, img.height - 1, 0)],
     )[0]
-    return embedding
+    return cast(cocoindex.Vector[cocoindex.Float32], embedding)
 
 
 @cocoindex.flow_def(name="FaceRecognition")
diff --git a/examples/fastapi_server_docker/main.py b/examples/fastapi_server_docker/main.py
@@ -6,6 +6,7 @@
 from psycopg_pool import ConnectionPool
 from contextlib import asynccontextmanager
 import os
+from typing import Any, AsyncIterator
 
 
 @cocoindex.transform_flow()
@@ -26,7 +27,7 @@ def text_to_embedding(
 @cocoindex.flow_def(name="MarkdownEmbeddingFastApiExample")
 def markdown_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds markdown files into a vector database.
     """
@@ -65,7 +66,7 @@ def markdown_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the text_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         markdown_embedding_flow, "doc_embeddings"
@@ -89,7 +90,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
 
 
 @asynccontextmanager
-def lifespan(app: FastAPI):
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
@@ -103,16 +104,19 @@ def lifespan(app: FastAPI):
 fastapi_app = FastAPI(lifespan=lifespan)
 
 
-@fastapi_app.get("/search")
 def search_endpoint(
     request: Request,
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-):
+) -> dict[str, Any]:
     pool = request.app.state.pool
     results = search(pool, q, limit)
     return {"results": results}
 
 
+# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable
+fastapi_app.get("/search")(search_endpoint)
+
+
 if __name__ == "__main__":
     uvicorn.run(fastapi_app, host="0.0.0.0", port=8080)
diff --git a/examples/gdrive_text_embedding/main.py b/examples/gdrive_text_embedding/main.py
@@ -3,6 +3,7 @@
 import cocoindex
 import datetime
 import os
+from typing import Any
 
 
 @cocoindex.transform_flow()
@@ -23,7 +24,7 @@ def text_to_embedding(
 @cocoindex.flow_def(name="GoogleDriveTextEmbedding")
 def gdrive_text_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds text into a vector database.
     """
@@ -71,7 +72,7 @@ def gdrive_text_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the gdrive_text_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         gdrive_text_embedding_flow, "doc_embeddings"
@@ -94,7 +95,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
             ]
 
 
-def _main():
+def _main() -> None:
     # Initialize the database connection pool.
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
     # Run queries in a loop to demonstrate the query capabilities.
diff --git a/examples/image_search/colpali_main.py b/examples/image_search/colpali_main.py
@@ -1,7 +1,7 @@
 import datetime
 import os
 from contextlib import asynccontextmanager
-from typing import Any
+from typing import Any, AsyncIterator
 
 import cocoindex
 from dotenv import load_dotenv
@@ -71,7 +71,7 @@ def image_object_embedding_flow(
 
 
 @asynccontextmanager
-async def lifespan(app: FastAPI) -> None:
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     image_object_embedding_flow.setup(report_to_stdout=True)
@@ -100,11 +100,10 @@ async def lifespan(app: FastAPI) -> None:
 
 
 # --- Search API ---
-@app.get("/search")
 def search(
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-) -> Any:
+) -> dict[str, Any]:
     # Get the multi-vector embedding for the query
     query_embedding = text_to_colpali_embedding.eval(q)
     print(
@@ -132,3 +131,7 @@ def search(
             for result in search_results.points
         ]
     }
+
+
+# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable
+app.get("/search")(search)
diff --git a/examples/image_search/main.py b/examples/image_search/main.py
@@ -3,7 +3,7 @@
 import io
 import os
 from contextlib import asynccontextmanager
-from typing import Any, Literal
+from typing import Any, Literal, Final, TypeAlias, cast, AsyncIterator
 
 import cocoindex
 import torch
@@ -19,7 +19,8 @@
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
 QDRANT_COLLECTION = "ImageSearch"
 CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
-CLIP_MODEL_DIMENSION = 768
+CLIP_MODEL_DIMENSION: Final[int] = 768
+CLIPVector: TypeAlias = cocoindex.Vector[cocoindex.Float32, Literal[768]]
 
 
 @functools.cache
@@ -37,13 +38,13 @@ def embed_query(text: str) -> list[float]:
     inputs = processor(text=[text], return_tensors="pt", padding=True)
     with torch.no_grad():
         features = model.get_text_features(**inputs)
-    return features[0].tolist()
+    return cast(list[float], features[0].tolist())
 
 
 @cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
 def embed_image(
     img_bytes: bytes,
-) -> cocoindex.Vector[cocoindex.Float32, Literal[CLIP_MODEL_DIMENSION]]:
+) -> CLIPVector:
     """
     Convert image to embedding using CLIP model.
     """
@@ -52,7 +53,7 @@ def embed_image(
     inputs = processor(images=image, return_tensors="pt")
     with torch.no_grad():
         features = model.get_image_features(**inputs)
-    return features[0].tolist()
+    return cast(CLIPVector, features[0].tolist())
 
 
 # CocoIndex flow: Ingest images, extract captions, embed, export to Qdrant
@@ -112,7 +113,7 @@ def image_object_embedding_flow(
 
 
 @asynccontextmanager
-async def lifespan(app: FastAPI) -> None:
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     image_object_embedding_flow.setup(report_to_stdout=True)
@@ -141,11 +142,10 @@ async def lifespan(app: FastAPI) -> None:
 
 
 # --- Search API ---
-@app.get("/search")
 def search(
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-) -> Any:
+) -> dict[str, Any]:
     # Get the embedding for the query
     query_embedding = embed_query(q)
 
@@ -169,3 +169,7 @@ def search(
             for result in search_results
         ]
     }
+
+
+# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable
+app.get("/search")(search)
diff --git a/examples/manuals_llm_extraction/main.py b/examples/manuals_llm_extraction/main.py
@@ -5,6 +5,7 @@
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
 from marker.config.parser import ConfigParser
+from typing import cast
 
 import cocoindex
 
@@ -20,7 +21,7 @@ class PdfToMarkdownExecutor:
     spec: PdfToMarkdown
     _converter: PdfConverter
 
-    def prepare(self):
+    def prepare(self) -> None:
         config_parser = ConfigParser({})
         self._converter = PdfConverter(
             create_model_dict(), config=config_parser.generate_config_dict()
@@ -30,8 +31,8 @@ def __call__(self, content: bytes) -> str:
         with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
             temp_file.write(content)
             temp_file.flush()
-            text, _, _ = text_from_rendered(self._converter(temp_file.name))
-            return text
+            text_any, _, _ = text_from_rendered(self._converter(temp_file.name))
+            return cast(str, text_any)
 
 
 @dataclasses.dataclass
@@ -90,7 +91,7 @@ def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
 @cocoindex.flow_def(name="ManualExtraction")
 def manual_extraction_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that extracts manual information from a Markdown.
     """
diff --git a/examples/paper_metadata/main.py b/examples/paper_metadata/main.py
@@ -9,6 +9,7 @@
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
 from functools import cache
+from typing import cast
 from pypdf import PdfReader, PdfWriter
 
 
@@ -66,8 +67,9 @@ def pdf_to_markdown(content: bytes) -> str:
     with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
         temp_file.write(content)
         temp_file.flush()
-        text, _, _ = text_from_rendered(get_marker_converter()(temp_file.name))
-        return text
+        # Convert inside the `with`: delete=True removes the temp file as soon as the block exits.
+        text_any, _, _ = text_from_rendered(get_marker_converter()(temp_file.name))
+        return cast(str, text_any)
 
 
 @cocoindex.flow_def(name="PaperMetadata")
diff --git a/examples/patient_intake_extraction/main.py b/examples/patient_intake_extraction/main.py
@@ -5,6 +5,7 @@
 
 from markitdown import MarkItDown
 from openai import OpenAI
+from typing import cast
 
 import cocoindex
 
@@ -97,7 +98,7 @@ class ToMarkdownExecutor:
     spec: ToMarkdown
     _converter: MarkItDown
 
-    def prepare(self):
+    def prepare(self) -> None:
         client = OpenAI()
         self._converter = MarkItDown(llm_client=client, llm_model="gpt-4o")
 
@@ -106,14 +107,15 @@ def __call__(self, content: bytes, filename: str) -> str:
         with tempfile.NamedTemporaryFile(delete=True, suffix=suffix) as temp_file:
             temp_file.write(content)
             temp_file.flush()
-            text = self._converter.convert(temp_file.name).text_content
+            text_any = self._converter.convert(temp_file.name).text_content
+            text: str = cast(str, text_any)
             return text
 
 
 @cocoindex.flow_def(name="PatientIntakeExtraction")
 def patient_intake_extraction_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define a flow that extracts patient information from intake forms.
     """
diff --git a/examples/pdf_elements_embedding/main.py b/examples/pdf_elements_embedding/main.py
diff --git a/examples/pdf_embedding/main.py b/examples/pdf_embedding/main.py
diff --git a/examples/postgres_source/main.py b/examples/postgres_source/main.py