Skip to content

Commit f04f442

Browse files
authored
example: add multi_format_indexing example (#837)
1 parent 49fb933 commit f04f442

File tree

11 files changed

+222
-0
lines changed

11 files changed

+222
-0
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Postgres database address for cocoindex
2+
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Build visual document index from PDFs and images with ColPali
2+
[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
3+
4+
5+
In this example, we build a visual document indexing flow that uses ColPali to embed PDFs and images, and then query the index with natural language.
6+
7+
We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
8+
9+
## Steps
10+
### Indexing Flow
11+
12+
1. We ingest a list of PDF files and image files from the `source_files` directory.
13+
2. For each file:
14+
- **PDF files**: convert each page to a high-resolution image (300 DPI)
15+
- **Image files**: use the image directly
16+
- Generate visual embeddings for each page/image using ColPali model
17+
3. We save the embeddings and metadata in the Qdrant vector database.
18+
19+
### Query
20+
We will match against user-provided natural language text using ColPali's text-to-visual embedding capability, enabling semantic search across visual document content.
21+
22+
23+
24+
## Prerequisite
25+
[Install Qdrant](https://qdrant.tech/documentation/guides/installation/) if you don't have one running locally.
26+
27+
You can start Qdrant with Docker:
28+
```bash
29+
docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant
30+
```
31+
32+
## Run
33+
34+
Install dependencies:
35+
36+
```bash
37+
pip install -e .
38+
```
39+
40+
Setup:
41+
42+
```bash
43+
cocoindex setup main.py
44+
```
45+
46+
Update index:
47+
48+
```bash
49+
cocoindex update main.py
50+
```
51+
52+
Run:
53+
54+
```bash
55+
python main.py
56+
```
57+
58+
## About ColPali
59+
This example uses [ColPali](https://github.com/illuin-tech/colpali), a state-of-the-art vision-language model that enables:
60+
- Direct visual understanding of document layouts, tables, and figures
61+
- Natural language queries against visual document content
62+
- No need for OCR or text extraction - works directly with document images
63+
64+
## CocoInsight
65+
I used CocoInsight (free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
66+
67+
```
68+
cocoindex server -ci main.py
69+
```
70+
71+
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import cocoindex
2+
import os
3+
import mimetypes
4+
5+
from dotenv import load_dotenv
6+
from dataclasses import dataclass
7+
from pdf2image import convert_from_bytes
8+
from io import BytesIO
9+
10+
from qdrant_client import QdrantClient
11+
12+
# gRPC endpoint of the Qdrant instance; shared by the flow's export connection
# and by the query client in `_main`.
QDRANT_GRPC_URL = "http://localhost:6334"
# Qdrant collection that the flow exports to and that the query loop searches.
QDRANT_COLLECTION = "MultiFormatIndexings"
# ColPali checkpoint, overridable through the COLPALI_MODEL environment variable.
# NOTE(review): this is evaluated at import time, i.e. before `load_dotenv()`
# runs in the `__main__` guard, so a COLPALI_MODEL value that exists only in
# `.env` is not picked up when running `python main.py` directly.
COLPALI_MODEL_NAME = os.getenv("COLPALI_MODEL", "vidore/colpali-v1.2")
15+
16+
17+
@dataclass
18+
class Page:
19+
page_number: int | None
20+
image: bytes
21+
22+
23+
@cocoindex.op.function()
def file_to_pages(filename: str, content: bytes) -> list[Page]:
    """
    Split a source file into the page images that should be embedded.

    The file type is guessed from `filename` (not from `content`):

    - PDF: every page is rendered at 300 DPI and encoded as PNG; pages are
      numbered starting from 1.
    - Any `image/*` MIME type: a single page holding `content` unchanged,
      with `page_number=None`.
    - Anything else (including an unrecognized extension): an empty list.
    """
    mime_type, _ = mimetypes.guess_type(filename)

    # Unknown extension: nothing to index.
    if not mime_type:
        return []
    if mime_type.startswith("image/"):
        return [Page(page_number=None, image=content)]
    if mime_type != "application/pdf":
        return []

    def encode_png(rendered_page) -> bytes:
        # Serialize one rendered page image into in-memory PNG bytes.
        with BytesIO() as png_buffer:
            rendered_page.save(png_buffer, format="PNG")
            return png_buffer.getvalue()

    rendered_pages = convert_from_bytes(content, dpi=300)
    return [
        Page(page_number=number, image=encode_png(rendered_page))
        for number, rendered_page in enumerate(rendered_pages, start=1)
    ]
44+
45+
46+
# Register the Qdrant gRPC connection under the key "qdrant_connection"; the
# returned reference is what the flow's Qdrant export target is given.
qdrant_connection = cocoindex.add_auth_entry(
    "qdrant_connection",
    cocoindex.targets.QdrantConnection(grpc_url=QDRANT_GRPC_URL),
)
50+
51+
52+
@cocoindex.flow_def(name="MultiFormatIndexing")
def multi_format_indexing_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
    """
    Index every PDF page and image under `source_files` as a ColPali embedding in Qdrant.

    Each file is split into pages by `file_to_pages`, each page image is
    embedded, and one row per page (generated UUID id, filename, page number,
    embedding) is exported to the `QDRANT_COLLECTION` collection.
    """
    # Files are loaded with binary=True because `file_to_pages` takes `bytes`.
    local_files = cocoindex.sources.LocalFile(path="source_files", binary=True)
    data_scope["documents"] = flow_builder.add_source(local_files)

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as document:
        document["pages"] = flow_builder.transform(
            file_to_pages,
            filename=document["filename"],
            content=document["content"],
        )
        with document["pages"].row() as page:
            embed_image = cocoindex.functions.ColPaliEmbedImage(
                model=COLPALI_MODEL_NAME
            )
            page["embedding"] = page["image"].transform(embed_image)
            collector.collect(
                id=cocoindex.GeneratedField.UUID,
                filename=document["filename"],
                page=page["page_number"],
                embedding=page["embedding"],
            )

    qdrant_target = cocoindex.targets.Qdrant(
        connection=qdrant_connection,
        collection_name=QDRANT_COLLECTION,
    )
    collector.export(
        "multi_format_indexings",
        qdrant_target,
        primary_key_fields=["id"],
    )
88+
89+
90+
@cocoindex.transform_flow()
def query_to_colpali_embedding(
    text: cocoindex.DataSlice[str],
) -> cocoindex.DataSlice[list[list[float]]]:
    """
    Embed query text with the ColPali model configured in `COLPALI_MODEL_NAME`.

    The result is a multi-vector embedding (a list of float vectors), per the
    declared return type.
    """
    embed_query = cocoindex.functions.ColPaliEmbedQuery(model=COLPALI_MODEL_NAME)
    return text.transform(embed_query)
97+
98+
99+
def _main() -> None:
    """
    Run an interactive search loop against the Qdrant collection.

    Reads queries from stdin until an empty line (or end of input), embeds
    each query with `query_to_colpali_embedding`, and prints the top 5 matches
    as `[score] filename Page:N`. The page suffix is omitted for results that
    carry no page number, which is how image files are indexed.
    """
    # Initialize Qdrant client
    client = QdrantClient(url=QDRANT_GRPC_URL, prefer_grpc=True)

    try:
        # Run queries in a loop to demonstrate the query capabilities.
        while True:
            try:
                query = input("Enter search query (or Enter to quit): ")
            except EOFError:
                # stdin was closed (Ctrl-D or exhausted pipe): quit the same
                # way an empty line does instead of dying with a traceback.
                print()
                break
            if not query:
                break

            # Get the embedding for the query
            query_embedding = query_to_colpali_embedding.eval(query)

            search_results = client.query_points(
                collection_name=QDRANT_COLLECTION,
                query=query_embedding,  # Multi-vector format: list[list[float]]
                using="embedding",  # Specify the vector field name
                limit=5,
                with_payload=True,
            )
            print("\nSearch results:")
            for result in search_results.points:
                payload = result.payload
                if payload is None:
                    continue
                # `.get` rather than indexing: image files are indexed with
                # page=None, and a missing key must not abort the listing.
                page_number = payload.get("page")
                page_number_str = (
                    f"Page:{page_number}" if page_number is not None else ""
                )
                print(f"[{result.score:.3f}] {payload['filename']} {page_number_str}")
                print("---")
            print()
    finally:
        # Release the client's connection on every exit path.
        client.close()
130+
131+
132+
if __name__ == "__main__":
    # Load `.env` first so that settings such as COCOINDEX_DATABASE_URL are in
    # the environment by the time cocoindex initializes.
    load_dotenv()
    cocoindex.init()
    _main()
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
[project]
2+
name = "pdf-embedding"
3+
version = "0.1.0"
4+
description = "Simple example for cocoindex: build a visual embedding index from local PDF and image files with ColPali."
5+
requires-python = ">=3.11"
6+
dependencies = [
7+
"cocoindex[colpali]>=0.1.75",
8+
"python-dotenv>=1.0.1",
9+
"pdf2image>=1.17.0",
10+
"qdrant-client>=1.15.0",
11+
]
12+
13+
[tool.setuptools]
14+
packages = []
2.11 MB
Binary file not shown.
757 KB
Binary file not shown.
403 KB
Loading
986 KB
Loading
40.8 KB
Loading
321 KB
Loading

0 commit comments

Comments
 (0)