|
6 | 6 | from typing import Any, Literal |
7 | 7 |
|
8 | 8 | import cocoindex |
| 9 | +import requests |
9 | 10 | import torch |
10 | 11 | from dotenv import load_dotenv |
11 | 12 | from fastapi import FastAPI, Query |
|
17 | 18 |
|
18 | 19 | QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/") |
19 | 20 | QDRANT_COLLECTION = "ImageSearch" |
| 21 | +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/") |
| 22 | +OLLAMA_MODEL = "gemma3" |
20 | 23 | CLIP_MODEL_NAME = "openai/clip-vit-large-patch14" |
21 | 24 | CLIP_MODEL_DIMENSION = 768 |
22 | 25 |
|
23 | 26 |
|
def ollama_has_model(model: str) -> bool:
    """Return True if a local Ollama server reports *model* as installed.

    Best-effort probe against the Ollama ``/api/tags`` endpoint: any network,
    HTTP, or parse failure is treated as "model not available" so the caller
    can degrade gracefully when Ollama is not running.

    Args:
        model: Bare model name, e.g. ``"gemma3"``.

    Returns:
        True if the model (with or without a ``:tag`` suffix such as
        ``"gemma3:latest"``) appears in the server's tag list.
    """
    try:
        # OLLAMA_URL may or may not carry a trailing slash; normalize so we
        # don't request "...//api/tags".
        resp = requests.get(f"{OLLAMA_URL.rstrip('/')}/api/tags", timeout=1)
        resp.raise_for_status()
        # Ollama lists installed models with a version tag (e.g.
        # "gemma3:latest"), so also match on the name before the colon —
        # a strict equality check would miss "gemma3" vs "gemma3:latest".
        return any(
            m.get("name") == model or str(m.get("name", "")).partition(":")[0] == model
            for m in resp.json().get("models", [])
        )
    except Exception:
        # Deliberate best-effort: server down, timeout, bad JSON, etc. all
        # mean "can't use Ollama", not an error the caller should handle.
        return False
| 34 | + |
| 35 | + |
24 | 36 | @functools.cache |
25 | 37 | def get_clip_model() -> tuple[CLIPModel, CLIPProcessor]: |
26 | 38 | model = CLIPModel.from_pretrained(CLIP_MODEL_NAME) |
@@ -69,37 +81,49 @@ def image_object_embedding_flow( |
69 | 81 | ) |
70 | 82 | img_embeddings = data_scope.add_collector() |
71 | 83 | with data_scope["images"].row() as img: |
72 | | - img["caption"] = flow_builder.transform( |
73 | | - cocoindex.functions.ExtractByLlm( |
74 | | - llm_spec=cocoindex.LlmSpec( |
75 | | - api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash" |
76 | | - ), |
77 | | - # Replace by this spec below, to use OpenAI API model instead of gemini |
78 | | - # llm_spec=cocoindex.LlmSpec( |
79 | | - # api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"), |
80 | | - # Replace by this spec below, to use Ollama API model |
81 | | - # llm_spec=cocoindex.llm.LlmSpec( |
82 | | - # api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.1"), |
83 | | - # Replace by this spec below, to use Anthropic API model |
84 | | - # llm_spec=cocoindex.LlmSpec( |
85 | | - # api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"), |
86 | | - instruction=( |
87 | | - "Describe the image in one detailed sentence. " |
88 | | - "Name all visible animal species, objects, and the main scene. " |
89 | | - "Be specific about type, color, and notable features. " |
90 | | - "Mention what each animal is doing." |
| 84 | + has_gemma3 = ollama_has_model(OLLAMA_MODEL) |
| 85 | + if has_gemma3: |
| 86 | + img["caption"] = flow_builder.transform( |
| 87 | + cocoindex.functions.ExtractByLlm( |
| 88 | + llm_spec=cocoindex.llm.LlmSpec( |
| 89 | + api_type=cocoindex.LlmApiType.OLLAMA, model=OLLAMA_MODEL |
| 90 | + ), |
| 91 | + # Replace by this spec below, to use OpenAI API model instead of ollama |
| 92 | + # llm_spec=cocoindex.LlmSpec( |
| 93 | + # api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"), |
| 94 | + # Replace by this spec below, to use Gemini API model |
| 95 | + # llm_spec=cocoindex.LlmSpec( |
| 96 | + # api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"), |
| 97 | + # Replace by this spec below, to use Anthropic API model |
| 98 | + # llm_spec=cocoindex.LlmSpec( |
| 99 | + # api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"), |
| 100 | + instruction=( |
| 101 | + "Describe the image in one detailed sentence. " |
| 102 | + "Name all visible animal species, objects, and the main scene. " |
| 103 | + "Be specific about type, color, and notable features. " |
| 104 | + "Mention what each animal is doing." |
| 105 | + ), |
| 106 | + output_type=str, |
91 | 107 | ), |
92 | | - output_type=str, |
93 | | - ), |
94 | | - image=img["content"], |
95 | | - ) |
| 108 | + image=img["content"], |
| 109 | + ) |
96 | 110 | img["embedding"] = img["content"].transform(embed_image) |
97 | | - img_embeddings.collect( |
98 | | - id=cocoindex.GeneratedField.UUID, |
99 | | - filename=img["filename"], |
100 | | - caption=img["caption"], |
101 | | - embedding=img["embedding"], |
102 | | - ) |
| 111 | + |
| 112 | + collect_fields = { |
| 113 | + "id": cocoindex.GeneratedField.UUID, |
| 114 | + "filename": img["filename"], |
| 115 | + "embedding": img["embedding"], |
| 116 | + } |
| 117 | + |
| 118 | + if has_gemma3: |
| 119 | + print( |
| 120 | + f"Ollama model '{OLLAMA_MODEL}' is available — captions will be extracted." |
| 121 | + ) |
| 122 | + collect_fields["caption"] = img["caption"] |
| 123 | + else: |
| 124 | + print(f"Ollama model '{OLLAMA_MODEL}' not found — skipping captioning.") |
| 125 | + |
| 126 | + img_embeddings.collect(**collect_fields) |
103 | 127 |
|
104 | 128 | img_embeddings.export( |
105 | 129 | "img_embeddings", |
@@ -151,11 +175,18 @@ def search( |
151 | 175 | collection_name=QDRANT_COLLECTION, |
152 | 176 | query_vector=("embedding", query_embedding), |
153 | 177 | limit=limit, |
| 178 | + with_payload=True, |
154 | 179 | ) |
155 | 180 |
|
156 | 181 | return { |
157 | 182 | "results": [ |
158 | | - {"filename": result.payload["filename"], "score": result.score} |
| 183 | + { |
| 184 | + "filename": result.payload["filename"], |
| 185 | + "score": result.score, |
| 186 | + "caption": result.payload.get( |
| 187 | + "caption" |
| 188 | + ), # Include caption if available |
| 189 | + } |
159 | 190 | for result in search_results |
160 | 191 | ] |
161 | 192 | } |
0 commit comments