cocoindex-io
diff --git a/‎examples/multi_format_indexing/.env‎
Lines changed: 2 additions & 0 deletions b/‎examples/multi_format_indexing/.env‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/multi_format_indexing/README.md‎
Lines changed: 71 additions & 0 deletions b/‎examples/multi_format_indexing/README.md‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎examples/multi_format_indexing/main.py‎
Lines changed: 135 additions & 0 deletions b/‎examples/multi_format_indexing/main.py‎
Lines changed: 135 additions & 0 deletions
diff --git a/‎examples/multi_format_indexing/pyproject.toml‎
Lines changed: 14 additions & 0 deletions b/‎examples/multi_format_indexing/pyproject.toml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎examples/multi_format_indexing/source_files/1706.03762v7.pdf‎
2.11 MB b/‎examples/multi_format_indexing/source_files/1706.03762v7.pdf‎
2.11 MB
diff --git a/‎examples/multi_format_indexing/source_files/1810.04805v2.pdf‎
757 KB b/‎examples/multi_format_indexing/source_files/1810.04805v2.pdf‎
757 KB
diff --git a/‎examples/multi_format_indexing/source_files/cat1.jpeg‎
403 KB b/‎examples/multi_format_indexing/source_files/cat1.jpeg‎
403 KB
diff --git a/‎examples/multi_format_indexing/source_files/dog1.jpeg‎
986 KB b/‎examples/multi_format_indexing/source_files/dog1.jpeg‎
986 KB
diff --git a/‎examples/multi_format_indexing/source_files/elephant1.jpg‎
40.8 KB b/‎examples/multi_format_indexing/source_files/elephant1.jpg‎
40.8 KB
diff --git a/‎examples/multi_format_indexing/source_files/giraffe.jpg‎
321 KB b/‎examples/multi_format_indexing/source_files/giraffe.jpg‎
321 KB
@@ -0,0 +1,2 @@
+# Postgres database address for cocoindex
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
@@ -0,0 +1,71 @@
+# Build visual document index from PDFs and images with ColPali
+[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
+
+
+In this example, we build a visual document indexing flow that uses ColPali to embed PDFs and images, and then query the index with natural language.
+
+We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
+
+## Steps
+### Indexing Flow
+
+1. We ingest a list of PDF files and image files from the `source_files` directory.
+2. For each file:
+   - **PDF files**: convert each page to a high-resolution image (300 DPI)
+   - **Image files**: use the image directly
+   - Generate visual embeddings for each page/image using the ColPali model
+3. We save the embeddings and metadata in a Qdrant vector database.
+
+### Query
+We will match against user-provided natural language text using ColPali's text-to-visual embedding capability, enabling semantic search across visual document content.
+
+
+
+## Prerequisite
+[Install Qdrant](https://qdrant.tech/documentation/guides/installation/) if you don't have one running locally.
+
+You can start Qdrant with Docker:
+```bash
+docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant
+```
+
+## Run
+
+Install dependencies:
+
+```bash
+pip install -e .
+```
+
+Setup:
+
+```bash
+cocoindex setup main.py
+```
+
+Update index:
+
+```bash
+cocoindex update main.py
+```
+
+Run:
+
+```bash
+python main.py
+```
+
+## About ColPali
+This example uses [ColPali](https://github.com/illuin-tech/colpali), a state-of-the-art vision-language model that enables:
+- Direct visual understanding of document layouts, tables, and figures
+- Natural language queries against visual document content
+- No need for OCR or text extraction - works directly with document images
+
+## CocoInsight
+I used CocoInsight (free beta for now) to troubleshoot the index generation and understand the data lineage of the pipeline. It connects to your local CocoIndex server with zero pipeline data retention. Run the following command to start CocoInsight:
+
+```
+cocoindex server -ci main.py
+```
+
+Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
@@ -0,0 +1,135 @@
+import cocoindex
+import os
+import mimetypes
+
+from dotenv import load_dotenv
+from dataclasses import dataclass
+from pdf2image import convert_from_bytes
+from io import BytesIO
+
+from qdrant_client import QdrantClient
+
+# Endpoint of the local Qdrant instance; used both for the cocoindex export
+# connection and for the query-time QdrantClient in _main().
+QDRANT_GRPC_URL = "http://localhost:6334"
+# Qdrant collection that the flow exports to and that the query loop searches.
+QDRANT_COLLECTION = "MultiFormatIndexings"
+# ColPali model identifier; overridable via the COLPALI_MODEL environment
+# variable, falling back to "vidore/colpali-v1.2".
+# NOTE(review): this is read at import time, whereas load_dotenv() only runs
+# under `__main__` — a COLPALI_MODEL set only in .env may not take effect when
+# running `python main.py`; confirm.
+COLPALI_MODEL_NAME = os.getenv("COLPALI_MODEL", "vidore/colpali-v1.2")
+
+
+@dataclass
+class Page:
+    """A single image to embed: one rendered PDF page or one standalone image file."""
+    # 1-based page index for pages rendered from a PDF; None for image files.
+    page_number: int | None
+    # Encoded image bytes: PNG for rendered PDF pages, the original file
+    # content unchanged for image files.
+    image: bytes
+
+
+@cocoindex.op.function()
+def file_to_pages(filename: str, content: bytes) -> list[Page]:
+    """
+    Convert a source file into the list of page images to embed.
+
+    The file type is guessed from ``filename`` via ``mimetypes``:
+
+    - PDF: every page is rendered at 300 DPI, encoded as PNG, and numbered
+      starting from 1.
+    - Image (any ``image/*`` MIME type): a single Page carrying ``content``
+      unchanged, with ``page_number`` set to None.
+    - Anything else, or an unrecognised filename: an empty list, so the file
+      contributes no pages.
+    """
+    # Guess the MIME type based on the filename
+    mime_type, _ = mimetypes.guess_type(filename)
+
+    if mime_type == "application/pdf":
+        images = convert_from_bytes(content, dpi=300)
+        pages = []
+        for i, image in enumerate(images):
+            # Serialise each rendered page to PNG bytes in memory.
+            with BytesIO() as buffer:
+                image.save(buffer, format="PNG")
+                pages.append(Page(page_number=i + 1, image=buffer.getvalue()))
+        return pages
+    elif mime_type and mime_type.startswith("image/"):
+        # Image files are passed through as-is; no page number applies.
+        return [Page(page_number=None, image=content)]
+    else:
+        # Unsupported or unknown type: nothing to index.
+        return []
+
+
+# Register the Qdrant connection under the key "qdrant_connection"; the
+# returned reference is passed to the Qdrant export target in the flow below.
+qdrant_connection = cocoindex.add_auth_entry(
+    "qdrant_connection",
+    cocoindex.targets.QdrantConnection(grpc_url=QDRANT_GRPC_URL),
+)
+
+
+@cocoindex.flow_def(name="MultiFormatIndexing")
+def multi_format_indexing_flow(
+    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
+) -> None:
+    """
+    Define the indexing flow: read files from ``source_files``, split each
+    into page images, embed every page image with ColPali, and export the
+    embeddings to the Qdrant collection named by ``QDRANT_COLLECTION``.
+    """
+    # Source: files under the local "source_files" directory, read in binary
+    # mode so that `content` can be handed to file_to_pages as bytes.
+    data_scope["documents"] = flow_builder.add_source(
+        cocoindex.sources.LocalFile(path="source_files", binary=True)
+    )
+
+    # Collector that accumulates one row per embedded page/image.
+    output_embeddings = data_scope.add_collector()
+
+    with data_scope["documents"].row() as doc:
+        # Expand each document into its pages (PDF pages or a single image).
+        doc["pages"] = flow_builder.transform(
+            file_to_pages, filename=doc["filename"], content=doc["content"]
+        )
+        with doc["pages"].row() as page:
+            # Embed the page image with the configured ColPali model.
+            page["embedding"] = page["image"].transform(
+                cocoindex.functions.ColPaliEmbedImage(model=COLPALI_MODEL_NAME)
+            )
+            # `page` is None for standalone images (see file_to_pages).
+            output_embeddings.collect(
+                id=cocoindex.GeneratedField.UUID,
+                filename=doc["filename"],
+                page=page["page_number"],
+                embedding=page["embedding"],
+            )
+
+    # Export collected rows to Qdrant, keyed by the generated UUID.
+    output_embeddings.export(
+        "multi_format_indexings",
+        cocoindex.targets.Qdrant(
+            connection=qdrant_connection,
+            collection_name=QDRANT_COLLECTION,
+        ),
+        primary_key_fields=["id"],
+    )
+
+
+@cocoindex.transform_flow()
+def query_to_colpali_embedding(
+    text: cocoindex.DataSlice[str],
+) -> cocoindex.DataSlice[list[list[float]]]:
+    """
+    Embed a text query with ColPali, using the same model name as the
+    indexing flow. Per the return annotation the result is a multi-vector
+    (a list of float vectors) rather than a single vector.
+    """
+    return text.transform(
+        cocoindex.functions.ColPaliEmbedQuery(model=COLPALI_MODEL_NAME)
+    )
+
+
+def _main() -> None:
+    """
+    Run an interactive search loop: read a query from stdin, embed it with
+    ColPali, search the Qdrant collection, and print the top 5 hits with
+    score, filename and (when present) page number. An empty query exits.
+    """
+    # Initialize Qdrant client
+    client = QdrantClient(url=QDRANT_GRPC_URL, prefer_grpc=True)
+
+    # Run queries in a loop to demonstrate the query capabilities.
+    while True:
+        query = input("Enter search query (or Enter to quit): ")
+        if query == "":
+            break
+
+        # Get the embedding for the query
+        query_embedding = query_to_colpali_embedding.eval(query)
+
+        search_results = client.query_points(
+            collection_name=QDRANT_COLLECTION,
+            query=query_embedding,  # Multi-vector format: list[list[float]]
+            using="embedding",  # Specify the vector field name
+            limit=5,
+            with_payload=True,
+        )
+        print("\nSearch results:")
+        for result in search_results.points:
+            score = result.score
+            payload = result.payload
+            # Skip hits that came back without a payload; nothing to display.
+            if payload is None:
+                continue
+            # assumes every payload carries a "page" key (None for images) —
+            # TODO confirm; a missing key would raise KeyError here.
+            page_number = payload["page"]
+            page_number_str = f"Page:{page_number}" if page_number is not None else ""
+            print(f"[{score:.3f}] {payload['filename']} {page_number_str}")
+            print("---")
+        print()
+
+
+# Script entry point: load variables from .env first, then initialise
+# cocoindex, then start the interactive query loop.
+# NOTE(review): presumably load_dotenv() must precede cocoindex.init() so that
+# COCOINDEX_DATABASE_URL from .env is visible to it — confirm.
+if __name__ == "__main__":
+    load_dotenv()
+    cocoindex.init()
+    _main()
@@ -0,0 +1,14 @@
+[project]
+name = "pdf-embedding"
+version = "0.1.0"
+description = "Simple example for cocoindex: build a visual embedding index from local PDF and image files with ColPali."
+requires-python = ">=3.11"
+dependencies = [
+    "cocoindex[colpali]>=0.1.75",
+    "python-dotenv>=1.0.1",
+    "pdf2image>=1.17.0",
+    "qdrant-client>=1.15.0",
+]
+
+[tool.setuptools]
+packages = []
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Postgres database address for cocoindex`
	`2`	`+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex`