Skip to content

Commit b11d6b4

Browse files
committed
feat: make image captioning support with Ollama integration optional
1 parent 9c63daf commit b11d6b4

File tree

4 files changed

+87
-43
lines changed

4 files changed

+87
-43
lines changed

examples/image_search/README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
1313
- CLIP ViT-L/14 - Embeddings Model for images and query
1414
- Qdrant for Vector Storage
1515
- FastAPI for backend
16+
- Ollama (Optional) for generating image captions using `gemma3`.
1617

1718
## Setup
1819
- Make sure Postgres and Qdrant are running
@@ -21,7 +22,15 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
2122
export COCOINDEX_DATABASE_URL="postgres://cocoindex:cocoindex@localhost/cocoindex"
2223
```
2324

24-
## Run
25+
## (Optional) Run Ollama
26+
27+
- This enables automatic image captioning
28+
```
29+
ollama pull gemma3
30+
ollama serve
31+
```
32+
33+
## Run the App
2534
- Install dependencies:
2635
```
2736
pip install -e .

examples/image_search/main.py

Lines changed: 61 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Any, Literal
77

88
import cocoindex
9+
import requests
910
import torch
1011
from dotenv import load_dotenv
1112
from fastapi import FastAPI, Query
@@ -17,10 +18,21 @@
1718

1819
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
1920
QDRANT_COLLECTION = "ImageSearch"
21+
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/")
22+
OLLAMA_MODEL = "gemma3"
2023
CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
2124
CLIP_MODEL_DIMENSION = 768
2225

2326

27+
def ollama_has_model(model) -> bool:
    """Report whether the Ollama server at OLLAMA_URL lists `model` as installed.

    This is a best-effort probe: any failure (server unreachable, timeout,
    non-2xx status, unexpected response body) is reported as False rather
    than raised.

    Args:
        model: Model name to look for, with or without an explicit tag
            (for example "gemma3" or "gemma3:latest").

    Returns:
        True if one of the listed models matches `model`, False otherwise.
    """
    # The default OLLAMA_URL ends with "/"; strip it so the request path is
    # "/api/tags" rather than "//api/tags".
    base_url = OLLAMA_URL.rstrip("/")
    # Ollama reports installed models as "name:tag"; a bare name refers to
    # the "latest" tag, so accept both spellings.
    candidates = {model}
    if ":" not in model:
        candidates.add(f"{model}:latest")
    try:
        # Short timeout so start-up is not delayed when Ollama is not running.
        r = requests.get(f"{base_url}/api/tags", timeout=1)
        r.raise_for_status()
        return any(m.get("name") in candidates for m in r.json().get("models", []))
    except Exception:
        # Deliberately broad: a failed probe only means captioning is skipped.
        return False
34+
35+
2436
@functools.cache
2537
def get_clip_model() -> tuple[CLIPModel, CLIPProcessor]:
2638
model = CLIPModel.from_pretrained(CLIP_MODEL_NAME)
@@ -69,37 +81,49 @@ def image_object_embedding_flow(
6981
)
7082
img_embeddings = data_scope.add_collector()
7183
with data_scope["images"].row() as img:
72-
img["caption"] = flow_builder.transform(
73-
cocoindex.functions.ExtractByLlm(
74-
llm_spec=cocoindex.LlmSpec(
75-
api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"
76-
),
77-
# Replace by this spec below, to use OpenAI API model instead of gemini
78-
# llm_spec=cocoindex.LlmSpec(
79-
# api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
80-
# Replace by this spec below, to use Ollama API model
81-
# llm_spec=cocoindex.llm.LlmSpec(
82-
# api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.1"),
83-
# Replace by this spec below, to use Anthropic API model
84-
# llm_spec=cocoindex.LlmSpec(
85-
# api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
86-
instruction=(
87-
"Describe the image in one detailed sentence. "
88-
"Name all visible animal species, objects, and the main scene. "
89-
"Be specific about type, color, and notable features. "
90-
"Mention what each animal is doing."
84+
has_gemma3 = ollama_has_model(OLLAMA_MODEL)
85+
if has_gemma3:
86+
img["caption"] = flow_builder.transform(
87+
cocoindex.functions.ExtractByLlm(
88+
llm_spec=cocoindex.llm.LlmSpec(
89+
api_type=cocoindex.LlmApiType.OLLAMA, model=OLLAMA_MODEL
90+
),
91+
# Replace by this spec below, to use OpenAI API model instead of ollama
92+
# llm_spec=cocoindex.LlmSpec(
93+
# api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
94+
# Replace by this spec below, to use Gemini API model
95+
# llm_spec=cocoindex.LlmSpec(
96+
# api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"),
97+
# Replace by this spec below, to use Anthropic API model
98+
# llm_spec=cocoindex.LlmSpec(
99+
# api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
100+
instruction=(
101+
"Describe the image in one detailed sentence. "
102+
"Name all visible animal species, objects, and the main scene. "
103+
"Be specific about type, color, and notable features. "
104+
"Mention what each animal is doing."
105+
),
106+
output_type=str,
91107
),
92-
output_type=str,
93-
),
94-
image=img["content"],
95-
)
108+
image=img["content"],
109+
)
96110
img["embedding"] = img["content"].transform(embed_image)
97-
img_embeddings.collect(
98-
id=cocoindex.GeneratedField.UUID,
99-
filename=img["filename"],
100-
caption=img["caption"],
101-
embedding=img["embedding"],
102-
)
111+
112+
collect_fields = {
113+
"id": cocoindex.GeneratedField.UUID,
114+
"filename": img["filename"],
115+
"embedding": img["embedding"],
116+
}
117+
118+
if has_gemma3:
119+
print(
120+
f"Ollama model '{OLLAMA_MODEL}' is available — captions will be extracted."
121+
)
122+
collect_fields["caption"] = img["caption"]
123+
else:
124+
print(f"Ollama model '{OLLAMA_MODEL}' not found — skipping captioning.")
125+
126+
img_embeddings.collect(**collect_fields)
103127

104128
img_embeddings.export(
105129
"img_embeddings",
@@ -151,11 +175,18 @@ def search(
151175
collection_name=QDRANT_COLLECTION,
152176
query_vector=("embedding", query_embedding),
153177
limit=limit,
178+
with_payload=True,
154179
)
155180

156181
return {
157182
"results": [
158-
{"filename": result.payload["filename"], "score": result.score}
183+
{
184+
"filename": result.payload["filename"],
185+
"score": result.score,
186+
"caption": result.payload.get(
187+
"caption"
188+
), # Include caption if available
189+
}
159190
for result in search_results
160191
]
161192
}

examples/image_search/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dependencies = [
1111
"transformers>=4.29.0",
1212
"qdrant-client>=1.14.2",
1313
"uvicorn>=0.34.3",
14+
"requests>=2.32.4",
1415
]
1516

1617
[tool.setuptools]

src/ops/functions/extract_by_llm.rs

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ fn get_system_prompt(instructions: &Option<String>, extra_instructions: Option<S
3434
Your task is to follow the provided instructions to generate or extract information and output valid JSON matching the specified schema. \
3535
Base your response solely on the content of the input. \
3636
For generative tasks, respond accurately and relevantly based on what is provided. \
37-
Unless explicitly instructed otherwise, output only the JSON—do not include explanations, descriptions, or formatting outside the JSON."
37+
Unless explicitly instructed otherwise, output only the JSON. DO NOT include explanations, descriptions, or formatting outside the JSON."
3838
.to_string();
3939

4040
if let Some(custom_instructions) = instructions {
@@ -130,17 +130,20 @@ impl SimpleFunctionFactoryBase for Factory {
130130
args_resolver: &mut OpArgsResolver<'a>,
131131
_context: &FlowInstanceContext,
132132
) -> Result<(Args, EnrichedValueType)> {
133-
Ok((
134-
Args {
135-
text: args_resolver
136-
.next_optional_arg("text")?
137-
.expect_type(&ValueType::Basic(BasicValueType::Str))?,
138-
image: args_resolver
139-
.next_optional_arg("image")?
140-
.expect_type(&ValueType::Basic(BasicValueType::Bytes))?,
141-
},
142-
spec.output_type.clone(),
143-
))
133+
let args = Args {
134+
text: args_resolver
135+
.next_optional_arg("text")?
136+
.expect_type(&ValueType::Basic(BasicValueType::Str))?,
137+
image: args_resolver
138+
.next_optional_arg("image")?
139+
.expect_type(&ValueType::Basic(BasicValueType::Bytes))?,
140+
};
141+
142+
if args.text.is_none() && args.image.is_none() {
143+
api_bail!("At least one of 'text' or 'image' must be provided");
144+
}
145+
146+
Ok((args, spec.output_type.clone()))
144147
}
145148

146149
async fn build_executor(

0 commit comments

Comments
 (0)