Skip to content

Commit 7309315

Browse files
committed
examples: fix mypy errors across example entry points; add helper script and docs (#1091)
1 parent 0296c97 commit 7309315

File tree

14 files changed

+120
-43
lines changed

14 files changed

+120
-43
lines changed

dev/README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,26 @@ python dev/generate_cli_docs.py
3535
- `cocoindex` package must be importable (the CLI module)
3636

3737
This ensures that CLI documentation is always kept in sync with the actual command-line interface.
38+
39+
## Type-checking Examples
40+
41+
We provide a helper script to run mypy on each example entry point individually with minimal assumptions about optional dependencies.
42+
43+
### `mypy_check_examples.ps1`
44+
45+
Runs mypy for every `main.py` (and `colpali_main.py`) under the `examples/` folder using these rules:
46+
47+
- Ignore only missing imports (`--ignore-missing-imports`); no broader suppressions are applied
48+
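- Suppress errors reported inside imported modules (including CocoIndex internals) by setting `--follow-imports=silent`; imports are still followed, so the examples are checked against the real CocoIndex types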
49+
- Make CocoIndex sources discoverable via `MYPYPATH=python`
50+
51+
Usage (Windows PowerShell):
52+
53+
```powershell
54+
powershell -NoProfile -ExecutionPolicy Bypass -File dev/mypy_check_examples.ps1
55+
```
56+
57+
Notes:
58+
59+
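- The script uses `.venv\Scripts\python.exe` under the repository root when it exists and otherwise falls back to `python` on your `PATH`; make sure `mypy` is installed for whichever interpreter is used (e.g. a `.venv` with `pip install mypy`).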
60+
- The script will report any failing example files and exit non-zero on failures.

dev/mypy_check_examples.ps1

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
$ErrorActionPreference = 'Stop'
2+
3+
# Resolve python in local venv
4+
$repoRoot = Split-Path -Parent $PSScriptRoot
5+
$python = Join-Path $repoRoot '.venv\Scripts\python.exe'
6+
if (-not (Test-Path $python)) {
7+
$python = 'python'
8+
}
9+
10+
# Ensure mypy can resolve local cocoindex package sources
11+
$env:MYPYPATH = Join-Path $repoRoot 'python'
12+
13+
# Collect example entry files
14+
$examples = Join-Path $repoRoot 'examples'
15+
$files = Get-ChildItem -Path $examples -Recurse -File |
16+
Where-Object { $_.Name -in @('main.py','colpali_main.py') } |
17+
Sort-Object FullName
18+
19+
$failed = @()
20+
foreach ($f in $files) {
21+
Write-Host (">>> Checking " + $f.FullName)
22+
& $python -m mypy --ignore-missing-imports --follow-imports=silent $f.FullName
23+
if ($LASTEXITCODE -ne 0) {
24+
$failed += $f.FullName
25+
}
26+
}
27+
28+
if ($failed.Count -gt 0) {
29+
# PowerShell escapes with a backtick, not a backslash: "\n" would be printed literally.
Write-Host "`nFailures:"
30+
$failed | ForEach-Object { Write-Host $_ }
31+
exit 1
32+
} else {
33+
# PowerShell escapes with a backtick, not a backslash: "\n" would be printed literally.
Write-Host "`nAll example entry files passed mypy."
34+
}

examples/custom_output_files/main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import cocoindex
66
from markdown_it import MarkdownIt
7+
from typing import cast
78

89
_markdown_it = MarkdownIt("gfm-like")
910

@@ -96,7 +97,7 @@ def mutate(
9697

9798
@cocoindex.op.function()
9899
def markdown_to_html(text: str) -> str:
99-
return _markdown_it.render(text)
100+
return cast(str, _markdown_it.render(text))
100101

101102

102103
@cocoindex.flow_def(name="CustomOutputFiles")

examples/face_recognition/main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import face_recognition
88
import numpy as np
99
from PIL import Image
10+
from typing import cast
1011

1112
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
1213
QDRANT_COLLECTION = "face_embeddings"
@@ -85,7 +86,7 @@ def extract_face_embedding(
8586
np.array(img),
8687
known_face_locations=[(0, img.width - 1, img.height - 1, 0)],
8788
)[0]
88-
return embedding
89+
return cast(cocoindex.Vector[cocoindex.Float32], embedding)
8990

9091

9192
@cocoindex.flow_def(name="FaceRecognition")

examples/fastapi_server_docker/main.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from psycopg_pool import ConnectionPool
77
from contextlib import asynccontextmanager
88
import os
9+
from typing import Any, AsyncIterator
910

1011

1112
@cocoindex.transform_flow()
@@ -26,7 +27,7 @@ def text_to_embedding(
2627
@cocoindex.flow_def(name="MarkdownEmbeddingFastApiExample")
2728
def markdown_embedding_flow(
2829
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
29-
):
30+
) -> None:
3031
"""
3132
Define an example flow that embeds markdown files into a vector database.
3233
"""
@@ -65,7 +66,7 @@ def markdown_embedding_flow(
6566
)
6667

6768

68-
def search(pool: ConnectionPool, query: str, top_k: int = 5):
69+
def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
6970
# Get the table name, for the export target in the text_embedding_flow above.
7071
table_name = cocoindex.utils.get_target_default_name(
7172
markdown_embedding_flow, "doc_embeddings"
@@ -89,7 +90,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
8990

9091

9192
@asynccontextmanager
92-
def lifespan(app: FastAPI):
93+
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
9394
load_dotenv()
9495
cocoindex.init()
9596
pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
@@ -103,16 +104,19 @@ def lifespan(app: FastAPI):
103104
fastapi_app = FastAPI(lifespan=lifespan)
104105

105106

106-
@fastapi_app.get("/search")
107107
def search_endpoint(
108108
request: Request,
109109
q: str = Query(..., description="Search query"),
110110
limit: int = Query(5, description="Number of results"),
111-
):
111+
) -> dict[str, Any]:
112112
pool = request.app.state.pool
113113
results = search(pool, q, limit)
114114
return {"results": results}
115115

116116

117+
# Register the route by calling the decorator explicitly instead of using `@` syntax, so mypy does not report `untyped-decorator` when FastAPI's type information is unavailable.
118+
fastapi_app.get("/search")(search_endpoint)
119+
120+
117121
if __name__ == "__main__":
118122
uvicorn.run(fastapi_app, host="0.0.0.0", port=8080)

examples/gdrive_text_embedding/main.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import cocoindex
44
import datetime
55
import os
6+
from typing import Any
67

78

89
@cocoindex.transform_flow()
@@ -23,7 +24,7 @@ def text_to_embedding(
2324
@cocoindex.flow_def(name="GoogleDriveTextEmbedding")
2425
def gdrive_text_embedding_flow(
2526
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
26-
):
27+
) -> None:
2728
"""
2829
Define an example flow that embeds text into a vector database.
2930
"""
@@ -71,7 +72,7 @@ def gdrive_text_embedding_flow(
7172
)
7273

7374

74-
def search(pool: ConnectionPool, query: str, top_k: int = 5):
75+
def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
7576
# Get the table name, for the export target in the gdrive_text_embedding_flow above.
7677
table_name = cocoindex.utils.get_target_default_name(
7778
gdrive_text_embedding_flow, "doc_embeddings"
@@ -94,7 +95,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
9495
]
9596

9697

97-
def _main():
98+
def _main() -> None:
9899
# Initialize the database connection pool.
99100
pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
100101
# Run queries in a loop to demonstrate the query capabilities.

examples/image_search/colpali_main.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import datetime
22
import os
33
from contextlib import asynccontextmanager
4-
from typing import Any
4+
from typing import Any, AsyncIterator
55

66
import cocoindex
77
from dotenv import load_dotenv
@@ -71,7 +71,7 @@ def image_object_embedding_flow(
7171

7272

7373
@asynccontextmanager
74-
async def lifespan(app: FastAPI) -> None:
74+
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
7575
load_dotenv()
7676
cocoindex.init()
7777
image_object_embedding_flow.setup(report_to_stdout=True)
@@ -100,11 +100,10 @@ async def lifespan(app: FastAPI) -> None:
100100

101101

102102
# --- Search API ---
103-
@app.get("/search")
104103
def search(
105104
q: str = Query(..., description="Search query"),
106105
limit: int = Query(5, description="Number of results"),
107-
) -> Any:
106+
) -> dict[str, Any]:
108107
# Get the multi-vector embedding for the query
109108
query_embedding = text_to_colpali_embedding.eval(q)
110109
print(
@@ -132,3 +131,7 @@ def search(
132131
for result in search_results.points
133132
]
134133
}
134+
135+
136+
# Register the route by calling the decorator explicitly instead of using `@` syntax, so mypy does not report `untyped-decorator` when FastAPI's type information is unavailable.
137+
app.get("/search")(search)

examples/image_search/main.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import io
44
import os
55
from contextlib import asynccontextmanager
6-
from typing import Any, Literal
6+
from typing import Any, Literal, Final, TypeAlias, cast, AsyncIterator
77

88
import cocoindex
99
import torch
@@ -19,7 +19,8 @@
1919
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
2020
QDRANT_COLLECTION = "ImageSearch"
2121
CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
22-
CLIP_MODEL_DIMENSION = 768
22+
CLIP_MODEL_DIMENSION: Final[int] = 768
23+
CLIPVector: TypeAlias = cocoindex.Vector[cocoindex.Float32, Literal[768]]
2324

2425

2526
@functools.cache
@@ -37,13 +38,13 @@ def embed_query(text: str) -> list[float]:
3738
inputs = processor(text=[text], return_tensors="pt", padding=True)
3839
with torch.no_grad():
3940
features = model.get_text_features(**inputs)
40-
return features[0].tolist()
41+
return cast(list[float], features[0].tolist())
4142

4243

4344
@cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
4445
def embed_image(
4546
img_bytes: bytes,
46-
) -> cocoindex.Vector[cocoindex.Float32, Literal[CLIP_MODEL_DIMENSION]]:
47+
) -> CLIPVector:
4748
"""
4849
Convert image to embedding using CLIP model.
4950
"""
@@ -52,7 +53,7 @@ def embed_image(
5253
inputs = processor(images=image, return_tensors="pt")
5354
with torch.no_grad():
5455
features = model.get_image_features(**inputs)
55-
return features[0].tolist()
56+
return cast(CLIPVector, features[0].tolist())
5657

5758

5859
# CocoIndex flow: Ingest images, extract captions, embed, export to Qdrant
@@ -112,7 +113,7 @@ def image_object_embedding_flow(
112113

113114

114115
@asynccontextmanager
115-
async def lifespan(app: FastAPI) -> None:
116+
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
116117
load_dotenv()
117118
cocoindex.init()
118119
image_object_embedding_flow.setup(report_to_stdout=True)
@@ -141,11 +142,10 @@ async def lifespan(app: FastAPI) -> None:
141142

142143

143144
# --- Search API ---
144-
@app.get("/search")
145145
def search(
146146
q: str = Query(..., description="Search query"),
147147
limit: int = Query(5, description="Number of results"),
148-
) -> Any:
148+
) -> dict[str, Any]:
149149
# Get the embedding for the query
150150
query_embedding = embed_query(q)
151151

@@ -169,3 +169,7 @@ def search(
169169
for result in search_results
170170
]
171171
}
172+
173+
174+
# Register the route by calling the decorator explicitly instead of using `@` syntax, so mypy does not report `untyped-decorator` when FastAPI's type information is unavailable.
175+
app.get("/search")(search)

examples/manuals_llm_extraction/main.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from marker.models import create_model_dict
66
from marker.output import text_from_rendered
77
from marker.config.parser import ConfigParser
8+
from typing import cast
89

910
import cocoindex
1011

@@ -20,7 +21,7 @@ class PdfToMarkdownExecutor:
2021
spec: PdfToMarkdown
2122
_converter: PdfConverter
2223

23-
def prepare(self):
24+
def prepare(self) -> None:
2425
config_parser = ConfigParser({})
2526
self._converter = PdfConverter(
2627
create_model_dict(), config=config_parser.generate_config_dict()
@@ -30,8 +31,8 @@ def __call__(self, content: bytes) -> str:
3031
with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
3132
temp_file.write(content)
3233
temp_file.flush()
33-
text, _, _ = text_from_rendered(self._converter(temp_file.name))
34-
return text
34+
text_any, _, _ = text_from_rendered(self._converter(temp_file.name))
35+
return cast(str, text_any)
3536

3637

3738
@dataclasses.dataclass
@@ -90,7 +91,7 @@ def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
9091
@cocoindex.flow_def(name="ManualExtraction")
9192
def manual_extraction_flow(
9293
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
93-
):
94+
) -> None:
9495
"""
9596
Define an example flow that extracts manual information from a Markdown.
9697
"""

examples/paper_metadata/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from marker.models import create_model_dict
1010
from marker.output import text_from_rendered
1111
from functools import cache
12+
from typing import cast
1213
from pypdf import PdfReader, PdfWriter
1314

1415

@@ -66,8 +67,8 @@ def pdf_to_markdown(content: bytes) -> str:
6667
with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
6768
temp_file.write(content)
6869
temp_file.flush()
69-
text, _, _ = text_from_rendered(get_marker_converter()(temp_file.name))
70-
return text
70+
text_any, _, _ = text_from_rendered(get_marker_converter()(temp_file.name))
71+
return cast(str, text_any)
7172

7273

7374
@cocoindex.flow_def(name="PaperMetadata")

0 commit comments

Comments
 (0)