feat: make image captioning support with Ollama integration optional

lemorage · lemorage · commit b11d6b46da3e · 2025-07-05T14:35:22.000+02:00
diff --git a/examples/image_search/README.md b/examples/image_search/README.md
@@ -13,6 +13,7 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
 - CLIP ViT-L/14 - Embeddings Model for images and query
 - Qdrant for Vector Storage
 - FastApi for backend
+- Ollama (Optional) for generating image captions using `gemma3`.
 
 ## Setup
 - Make sure Postgres and Qdrant are running
@@ -21,7 +22,15 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
   export COCOINDEX_DATABASE_URL="postgres://cocoindex:cocoindex@localhost/cocoindex"
   ```
 
-## Run
+## (Optional) Run Ollama
+
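+- This enables automatic image captioning. If Ollama or the `gemma3` model is not available, the example still runs and simply skips captions.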
+```
+ollama pull gemma3
+ollama serve
+```
+
+## Run the App
 - Install dependencies:
   ```
   pip install -e .
diff --git a/examples/image_search/main.py b/examples/image_search/main.py
@@ -6,6 +6,7 @@
 from typing import Any, Literal
 
 import cocoindex
+import requests
 import torch
 from dotenv import load_dotenv
 from fastapi import FastAPI, Query
@@ -17,10 +18,21 @@
 
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
 QDRANT_COLLECTION = "ImageSearch"
+OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/")
+OLLAMA_MODEL = "gemma3"
 CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
 CLIP_MODEL_DIMENSION = 768
 
 
+def ollama_has_model(model) -> bool:
+    """Return True if the Ollama server at OLLAMA_URL lists ``model``.
+
+    Sends ``GET <OLLAMA_URL>/api/tags`` with a 1-second timeout and looks for
+    ``model`` among the returned model names. The probe is best-effort: any
+    failure (server unreachable, HTTP error status, unexpected response body,
+    non-string ``model``) is reported as False instead of raising.
+
+    Args:
+        model: Model name, with or without a tag, e.g. ``"gemma3"`` or
+            ``"gemma3:latest"``.
+    """
+    try:
+        # Ollama reports names with an explicit tag ("gemma3:latest"); an
+        # untagged name refers to the ":latest" tag, so accept both spellings.
+        wanted = {model} if ":" in model else {model, f"{model}:latest"}
+        # OLLAMA_URL may end with "/" (its default does); avoid "//api/tags".
+        tags_url = f"{OLLAMA_URL.rstrip('/')}/api/tags"
+        r = requests.get(tags_url, timeout=1)
+        r.raise_for_status()
+        return any(m.get("name") in wanted for m in r.json().get("models", []))
+    except Exception:
+        return False
+
+
 @functools.cache
 def get_clip_model() -> tuple[CLIPModel, CLIPProcessor]:
     model = CLIPModel.from_pretrained(CLIP_MODEL_NAME)
@@ -69,37 +81,49 @@ def image_object_embedding_flow(
     )
     img_embeddings = data_scope.add_collector()
     with data_scope["images"].row() as img:
-        img["caption"] = flow_builder.transform(
-            cocoindex.functions.ExtractByLlm(
-                llm_spec=cocoindex.LlmSpec(
-                    api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"
-                ),
-                # Replace by this spec below, to use OpenAI API model instead of gemini
-                #   llm_spec=cocoindex.LlmSpec(
-                #       api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
-                # Replace by this spec below, to use Ollama API model
-                #   llm_spec=cocoindex.llm.LlmSpec(
-                #       api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.1"),
-                # Replace by this spec below, to use Anthropic API model
-                #   llm_spec=cocoindex.LlmSpec(
-                #       api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
-                instruction=(
-                    "Describe the image in one detailed sentence. "
-                    "Name all visible animal species, objects, and the main scene. "
-                    "Be specific about type, color, and notable features. "
-                    "Mention what each animal is doing."
+        has_gemma3 = ollama_has_model(OLLAMA_MODEL)
+        if has_gemma3:
+            img["caption"] = flow_builder.transform(
+                cocoindex.functions.ExtractByLlm(
+                    llm_spec=cocoindex.llm.LlmSpec(
+                        api_type=cocoindex.LlmApiType.OLLAMA, model=OLLAMA_MODEL
+                    ),
+                    # Replace by this spec below, to use OpenAI API model instead of ollama
+                    #   llm_spec=cocoindex.LlmSpec(
+                    #       api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
+                    # Replace by this spec below, to use Gemini API model
+                    #   llm_spec=cocoindex.LlmSpec(
+                    #       api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"),
+                    # Replace by this spec below, to use Anthropic API model
+                    #   llm_spec=cocoindex.LlmSpec(
+                    #       api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
+                    instruction=(
+                        "Describe the image in one detailed sentence. "
+                        "Name all visible animal species, objects, and the main scene. "
+                        "Be specific about type, color, and notable features. "
+                        "Mention what each animal is doing."
+                    ),
+                    output_type=str,
                 ),
-                output_type=str,
-            ),
-            image=img["content"],
-        )
+                image=img["content"],
+            )
         img["embedding"] = img["content"].transform(embed_image)
-        img_embeddings.collect(
-            id=cocoindex.GeneratedField.UUID,
-            filename=img["filename"],
-            caption=img["caption"],
-            embedding=img["embedding"],
-        )
+
+        collect_fields = {
+            "id": cocoindex.GeneratedField.UUID,
+            "filename": img["filename"],
+            "embedding": img["embedding"],
+        }
+
+        if has_gemma3:
+            print(
+                f"Ollama model '{OLLAMA_MODEL}' is available — captions will be extracted."
+            )
+            collect_fields["caption"] = img["caption"]
+        else:
+            print(f"Ollama model '{OLLAMA_MODEL}' not found — skipping captioning.")
+
+        img_embeddings.collect(**collect_fields)
 
     img_embeddings.export(
         "img_embeddings",
@@ -151,11 +175,18 @@ def search(
         collection_name=QDRANT_COLLECTION,
         query_vector=("embedding", query_embedding),
         limit=limit,
+        with_payload=True,
     )
 
     return {
         "results": [
-            {"filename": result.payload["filename"], "score": result.score}
+            {
+                "filename": result.payload["filename"],
+                "score": result.score,
+                "caption": result.payload.get(
+                    "caption"
+                ),  # Include caption if available
+            }
             for result in search_results
         ]
     }
diff --git a/examples/image_search/pyproject.toml b/examples/image_search/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "transformers>=4.29.0",
     "qdrant-client>=1.14.2",
     "uvicorn>=0.34.3",
+    "requests>=2.32.4",
 ]
 
 [tool.setuptools]
diff --git a/src/ops/functions/extract_by_llm.rs b/src/ops/functions/extract_by_llm.rs
@@ -34,7 +34,7 @@ fn get_system_prompt(instructions: &Option<String>, extra_instructions: Option<S
 Your task is to follow the provided instructions to generate or extract information and output valid JSON matching the specified schema. \
 Base your response solely on the content of the input. \
 For generative tasks, respond accurately and relevantly based on what is provided. \
-Unless explicitly instructed otherwise, output only the JSON—do not include explanations, descriptions, or formatting outside the JSON."
+Unless explicitly instructed otherwise, output only the JSON. DO NOT include explanations, descriptions, or formatting outside the JSON."
             .to_string();
 
     if let Some(custom_instructions) = instructions {
@@ -130,17 +130,20 @@ impl SimpleFunctionFactoryBase for Factory {
         args_resolver: &mut OpArgsResolver<'a>,
         _context: &FlowInstanceContext,
     ) -> Result<(Args, EnrichedValueType)> {
-        Ok((
-            Args {
-                text: args_resolver
-                    .next_optional_arg("text")?
-                    .expect_type(&ValueType::Basic(BasicValueType::Str))?,
-                image: args_resolver
-                    .next_optional_arg("image")?
-                    .expect_type(&ValueType::Basic(BasicValueType::Bytes))?,
-            },
-            spec.output_type.clone(),
-        ))
+        let args = Args {
+            text: args_resolver
+                .next_optional_arg("text")?
+                .expect_type(&ValueType::Basic(BasicValueType::Str))?,
+            image: args_resolver
+                .next_optional_arg("image")?
+                .expect_type(&ValueType::Basic(BasicValueType::Bytes))?,
+        };
+
+        if args.text.is_none() && args.image.is_none() {
+            api_bail!("At least one of 'text' or 'image' must be provided");
+        }
+
+        Ok((args, spec.output_type.clone()))
     }
 
     async fn build_executor(

Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,7 @@ dependencies = [`
`11`	`11`	`"transformers>=4.29.0",`
`12`	`12`	`"qdrant-client>=1.14.2",`
`13`	`13`	`"uvicorn>=0.34.3",`
	`14`	`+ "requests>=2.32.4",`
`14`	`15`	`]`
`15`	`16`
`16`	`17`	`[tool.setuptools]`