6 | 6 | from typing import Any, Literal |
7 | 7 |
8 | 8 | import cocoindex |
9 | | -import torch |
| 9 | +import numpy as np |
10 | 10 | from dotenv import load_dotenv |
11 | | -from fastapi import FastAPI, Query |
| 11 | +from fastapi import FastAPI, Query, HTTPException |
12 | 12 | from fastapi.middleware.cors import CORSMiddleware |
13 | 13 | from fastapi.staticfiles import StaticFiles |
14 | 14 | from PIL import Image |
15 | 15 | from qdrant_client import QdrantClient |
16 | | -from transformers import CLIPModel, CLIPProcessor |
| 16 | +from colpali_engine.models import ColPali, ColPaliProcessor |
| 17 | + |
| 18 | + |
| 19 | +# --- Config --- |
| 20 | + |
| 21 | +# Default: talk to Qdrant over gRPC (port 6334) |
| 22 | +QDRANT_URL = os.getenv("QDRANT_URL", "localhost:6334") |
| 23 | +PREFER_GRPC = os.getenv("QDRANT_PREFER_GRPC", "true").lower() == "true" |
| 24 | + |
| 25 | +# Alternative: talk to Qdrant over HTTP (port 6333) |
| 26 | +# QDRANT_URL = os.getenv("QDRANT_URL", "localhost:6333") |
| 27 | +# PREFER_GRPC = os.getenv("QDRANT_PREFER_GRPC", "false").lower() == "true" |
17 | 28 |
18 | 29 | OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/") |
19 | | -QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/") |
20 | | -QDRANT_COLLECTION = "ImageSearch" |
21 | | -CLIP_MODEL_NAME = "openai/clip-vit-large-patch14" |
22 | | -CLIP_MODEL_DIMENSION = 768 |
| 30 | +QDRANT_COLLECTION = "ImageSearchColpali" |
| 31 | +COLPALI_MODEL_NAME = os.getenv("COLPALI_MODEL", "vidore/colpali-v1.2") |
| 32 | +COLPALI_MODEL_DIMENSION = 1031  # Length of the pooled vector: one value per ColPali image-token position |
| 33 | + |
| 34 | +# --- ColPali model cache and embedding functions --- |
| 35 | +_colpali_model_cache = {} |
| 36 | + |
| 37 | + |
| 38 | +def get_colpali_model(model: str = COLPALI_MODEL_NAME): |
| 39 | + global _colpali_model_cache |
| 40 | + if model not in _colpali_model_cache: |
| 41 | + print(f"Loading ColPali model: {model}") |
| 42 | + _colpali_model_cache[model] = { |
| 43 | + "model": ColPali.from_pretrained(model), |
| 44 | + "processor": ColPaliProcessor.from_pretrained(model), |
| 45 | + } |
| 46 | + return _colpali_model_cache[model]["model"], _colpali_model_cache[model][ |
| 47 | + "processor" |
| 48 | + ] |
| 49 | + |
23 | 50 |
| 51 | +def colpali_embed_image( |
| 52 | + img_bytes: bytes, model: str = COLPALI_MODEL_NAME |
| 53 | +) -> list[float]: |
| 54 | + from PIL import Image |
| 55 | + import torch |
| 56 | + import io |
24 | 57 |
25 | | -@functools.cache |
26 | | -def get_clip_model() -> tuple[CLIPModel, CLIPProcessor]: |
27 | | - model = CLIPModel.from_pretrained(CLIP_MODEL_NAME) |
28 | | - processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME) |
29 | | - return model, processor |
| 58 | + colpali_model, processor = get_colpali_model(model) |
| 59 | + pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB") |
| 60 | + inputs = processor.process_images([pil_image]) |
| 61 | + with torch.no_grad(): |
| 62 | + embeddings = colpali_model(**inputs) |
| 63 | + pooled_embedding = embeddings.mean(dim=-1) |
| 64 | + result = pooled_embedding[0].cpu().numpy() # [1031] |
| 65 | + return result.tolist() |
| 66 | + |
| 67 | + |
| 68 | +def colpali_embed_query(query: str, model: str = COLPALI_MODEL_NAME) -> list[float]: |
| 69 | + import torch |
| 70 | + import numpy as np |
| 71 | + |
| 72 | + colpali_model, processor = get_colpali_model(model) |
| 73 | + inputs = processor.process_queries([query]) |
| 74 | + with torch.no_grad(): |
| 75 | + embeddings = colpali_model(**inputs) |
| 76 | + pooled_embedding = embeddings.mean(dim=-1) |
| 77 | + query_tokens = pooled_embedding[0].cpu().numpy() # [15] |
| 78 | + target_length = COLPALI_MODEL_DIMENSION |
| 79 | + result = np.zeros(target_length, dtype=np.float32) |
| 80 | + result[: min(len(query_tokens), target_length)] = query_tokens[:target_length] |
| 81 | + return result.tolist() |
| 82 | + |
| 83 | + |
| 84 | +# --- End ColPali embedding functions --- |
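A note on the pooling scheme above: ColPali natively emits one 128-dimensional vector per token (a multi-vector, late-interaction representation), and `embeddings.mean(dim=-1)` collapses that 128-dim axis, leaving a single scalar per token position. An image yields about 1031 positions, while a short text query yields only a handful, which is why `colpali_embed_query` zero-pads to `COLPALI_MODEL_DIMENSION`. This trades ColPali's late-interaction scoring for one fixed-size vector per image that fits a single-vector Qdrant schema. A minimal sanity-check sketch, assuming the module above is saved as `main.py`, the model weights can be downloaded, and the image path is a placeholder:

```python
# Sketch: confirm both embedding helpers return vectors of the collection's size.
# "img/example.jpg" is a placeholder; the model runs on CPU here and may be slow.
from main import colpali_embed_image, colpali_embed_query, COLPALI_MODEL_DIMENSION

with open("img/example.jpg", "rb") as f:
    image_vec = colpali_embed_image(f.read())
query_vec = colpali_embed_query("a dog playing in the park")

assert len(image_vec) == COLPALI_MODEL_DIMENSION
assert len(query_vec) == COLPALI_MODEL_DIMENSION  # shorter queries are zero-padded
```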
30 | 85 |
31 | 86 |
32 | 87 | def embed_query(text: str) -> list[float]: |
33 | 88 | """ |
34 | | - Embed the caption using CLIP model. |
| 89 | +    Embed text (e.g. the search query) using the ColPali model. |
35 | 90 | """ |
36 | | - model, processor = get_clip_model() |
37 | | - inputs = processor(text=[text], return_tensors="pt", padding=True) |
38 | | - with torch.no_grad(): |
39 | | - features = model.get_text_features(**inputs) |
40 | | - return features[0].tolist() |
| 91 | + return colpali_embed_query(text, model=COLPALI_MODEL_NAME) |
41 | 92 |
42 | 93 |
43 | 94 | @cocoindex.op.function(cache=True, behavior_version=1, gpu=True) |
44 | 95 | def embed_image( |
45 | 96 | img_bytes: bytes, |
46 | | -) -> cocoindex.Vector[cocoindex.Float32, Literal[CLIP_MODEL_DIMENSION]]: |
| 97 | +) -> cocoindex.Vector[cocoindex.Float32, Literal[COLPALI_MODEL_DIMENSION]]: |
47 | 98 | """ |
48 | | - Convert image to embedding using CLIP model. |
| 99 | +    Convert an image to an embedding using the ColPali model. |
49 | 100 | """ |
50 | | - model, processor = get_clip_model() |
51 | | - image = Image.open(io.BytesIO(img_bytes)).convert("RGB") |
52 | | - inputs = processor(images=image, return_tensors="pt") |
53 | | - with torch.no_grad(): |
54 | | - features = model.get_image_features(**inputs) |
55 | | - return features[0].tolist() |
| 101 | + return colpali_embed_image(img_bytes, model=COLPALI_MODEL_NAME) |
56 | 102 |
57 | 103 |
58 | | -# CocoIndex flow: Ingest images, extract captions, embed, export to Qdrant |
59 | | -@cocoindex.flow_def(name="ImageObjectEmbedding") |
| 104 | +@cocoindex.flow_def(name="ImageObjectEmbeddingColpali") |
60 | 105 | def image_object_embedding_flow( |
61 | 106 | flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope |
62 | 107 | ) -> None: |
63 | 108 | data_scope["images"] = flow_builder.add_source( |
64 | 109 | cocoindex.sources.LocalFile( |
65 | 110 | path="img", included_patterns=["*.jpg", "*.jpeg", "*.png"], binary=True |
66 | 111 | ), |
67 | | - refresh_interval=datetime.timedelta( |
68 | | - minutes=1 |
69 | | - ), # Poll for changes every 1 minute |
| 112 | + refresh_interval=datetime.timedelta(minutes=1), |
70 | 113 | ) |
71 | 114 | img_embeddings = data_scope.add_collector() |
72 | 115 | with data_scope["images"].row() as img: |
@@ -117,7 +160,7 @@ async def lifespan(app: FastAPI) -> None: |
117 | 160 | cocoindex.init() |
118 | 161 | image_object_embedding_flow.setup(report_to_stdout=True) |
119 | 162 |
120 | | - app.state.qdrant_client = QdrantClient(url=QDRANT_URL, prefer_grpc=True) |
| 163 | + app.state.qdrant_client = QdrantClient(url=QDRANT_URL, prefer_grpc=PREFER_GRPC) |
121 | 164 |
122 | 165 | # Start updater |
123 | 166 | app.state.live_updater = cocoindex.FlowLiveUpdater(image_object_embedding_flow) |
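The client above talks gRPC (port 6334) or HTTP (port 6333) depending on the `QDRANT_URL` / `QDRANT_PREFER_GRPC` settings from the config block, and `setup()` is what normally provisions the `ImageSearchColpali` collection. For checking the target outside the app, here is a hedged sketch; the vector layout and distance metric are assumptions, since the flow's export settings are not shown in this diff:

```python
# Sketch: check connectivity and inspect (or pre-create) the collection the flow writes to.
# Normally image_object_embedding_flow.setup() provisions it; the vector config below is an assumption.
import os
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

client = QdrantClient(
    url=os.getenv("QDRANT_URL", "localhost:6334"),
    prefer_grpc=os.getenv("QDRANT_PREFER_GRPC", "true").lower() == "true",
)
if not client.collection_exists("ImageSearchColpali"):
    client.create_collection(
        collection_name="ImageSearchColpali",
        vectors_config=VectorParams(size=1031, distance=Distance.COSINE),
    )
print(client.get_collection("ImageSearchColpali").status)
```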
@@ -162,9 +205,7 @@ def search( |
162 | 205 | { |
163 | 206 | "filename": result.payload["filename"], |
164 | 207 | "score": result.score, |
165 | | - "caption": result.payload.get( |
166 | | - "caption" |
167 | | - ), # Include caption if available |
| 208 | + "caption": result.payload.get("caption"), |
168 | 209 | } |
169 | 210 | for result in search_results |
170 | 211 | ] |
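With the app running (for example via `uvicorn main:app`, assuming the module is `main.py` and the FastAPI instance is named `app`), the search route can be exercised directly. The path, parameter names, and response shape below are assumptions, since the route decorator and response wrapper fall outside this hunk:

```python
# Sketch: query the search endpoint and print the hits.
# "/search", "q", "limit" and the bare-list response shape are assumptions (not shown in this diff).
import requests

resp = requests.get(
    "http://localhost:8000/search",
    params={"q": "a dog playing in the park", "limit": 5},
)
resp.raise_for_status()
for hit in resp.json():  # each hit carries filename, score and (optionally) caption
    print(f"{hit['score']:.3f}  {hit['filename']}  caption={hit.get('caption')}")
```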