 from dotenv import load_dotenv
+
 import cocoindex
 import datetime
+import functools
+import io
 import os
-import requests
-import base64
+import torch
+
+from typing import Literal
 from fastapi import FastAPI, Query
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from qdrant_client import QdrantClient
 
-OLLAMA_URL = "http://localhost:11434/api/generate"
-OLLAMA_MODEL = "gemma3"
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+
 QDRANT_GRPC_URL = os.getenv("QDRANT_GRPC_URL", "http://localhost:6334/")
+CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
+
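+# Load the CLIP model and processor once; functools.cache reuses them on subsequent calls.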
+@functools.cache
+def get_clip_model() -> tuple[CLIPModel, CLIPProcessor]:
+    model = CLIPModel.from_pretrained(CLIP_MODEL_NAME)
+    processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
+    return model, processor
 
-# 1. Extract caption from image using Ollama vision model
-@cocoindex.op.function(cache=True, behavior_version=1)
-def get_image_caption(img_bytes: bytes) -> str:
+
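+# Query-side helper: embed search text with the same CLIP model used for images,
+# so queries and indexed images share one embedding space.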
+def embed_query(text: str) -> list[float]:
     """
-    Use Ollama's gemma3 model to extract a detailed caption from an image.
-    Returns a full-sentence natural language description of the image.
+    Embed the query text using the CLIP model.
     """
-    img_b64 = base64.b64encode(img_bytes).decode("utf-8")
-    prompt = (
-        "Describe this image in one detailed, natural language sentence. "
-        "Always explicitly name every visible animal species, object, and the main scene. "
-        "Be specific about the type, color, and any distinguishing features. "
-        "Avoid generic words like 'animal' or 'creature'—always use the most precise name (e.g., 'elephant', 'cat', 'lion', 'zebra'). "
-        "If an animal is present, mention its species and what it is doing. "
-        "For example: 'A large grey elephant standing in a grassy savanna, with trees in the background.'"
-    )
-    payload = {
-        "model": OLLAMA_MODEL,
-        "prompt": prompt,
-        "images": [img_b64],
-        "stream": False,
-    }
-    resp = requests.post(OLLAMA_URL, json=payload)
-    resp.raise_for_status()
-    result = resp.json()
-    text = result.get("response", "")
-    text = text.strip().replace("\n", "").rstrip(".")
-    return text
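+    # Tokenize the text and run it through CLIP's text encoder; no gradients are needed at inference time.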
+    model, processor = get_clip_model()
+    inputs = processor(text=[text], return_tensors="pt", padding=True)
+    with torch.no_grad():
+        features = model.get_text_features(**inputs)
+    return features[0].tolist()
 
 
-# 2. Embed the caption string
-@cocoindex.transform_flow()
-def caption_to_embedding(caption: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
+@cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
+def embed_image(img_bytes: bytes) -> cocoindex.Vector[cocoindex.Float32, Literal[768]]:
     """
-    Embed the caption using a CLIP model.
-    This is shared logic between indexing and querying.
+    Convert image to embedding using CLIP model.
     """
-    return caption.transform(
-        cocoindex.functions.SentenceTransformerEmbed(
-            model="clip-ViT-L-14",
-        )
-    )
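+    # Decode the raw bytes into an RGB image and encode it with CLIP's vision encoder.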
+    model, processor = get_clip_model()
+    image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+    inputs = processor(images=image, return_tensors="pt")
+    with torch.no_grad():
+        features = model.get_image_features(**inputs)
+    return features[0].tolist()
+
 
-# 3. CocoIndex flow: Ingest images, extract captions, embed, export to Qdrant
+# CocoIndex flow: Ingest images, embed them with CLIP, export to Qdrant
 @cocoindex.flow_def(name="ImageObjectEmbedding")
 def image_object_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
     data_scope["images"] = flow_builder.add_source(
@@ -65,12 +60,10 @@ def image_object_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope:
     )
     img_embeddings = data_scope.add_collector()
     with data_scope["images"].row() as img:
-        img["caption"] = img["content"].transform(get_image_caption)
-        img["embedding"] = caption_to_embedding(img["caption"])
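+        # Embed each image directly with the CLIP image encoder (no intermediate caption step).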
+        img["embedding"] = img["content"].transform(embed_image)
         img_embeddings.collect(
             id=cocoindex.GeneratedField.UUID,
             filename=img["filename"],
-            caption=img["caption"],
             embedding=img["embedding"],
         )
     img_embeddings.export(
@@ -111,7 +104,7 @@ def startup_event():
 @app.get("/search")
 def search(q: str = Query(..., description="Search query"), limit: int = Query(5, description="Number of results")):
     # Get the embedding for the query
-    query_embedding = caption_to_embedding.eval(q)
+    query_embedding = embed_query(q)
 
     # Search in Qdrant
     search_results = app.state.qdrant_client.search(