
Commit 42cb84c

support vision input for Ollama
1 parent 120d772 commit 42cb84c

File tree

1 file changed (+181, -9)


scripts/ollama_operator.py

Lines changed: 181 additions & 9 deletions
@@ -1,11 +1,13 @@
 import logging
+from base64 import b64encode
+from logging import basicConfig
 
 import typer
 from dotenv import load_dotenv
 from langchain_core.messages import HumanMessage
 from pydantic import BaseModel, Field
 
-from template_langgraph.llms.ollamas import OllamaWrapper
+from template_langgraph.llms.ollamas import OllamaWrapper, Settings
 from template_langgraph.loggers import get_logger
 
 
@@ -29,6 +31,17 @@ class Profile(BaseModel):
 logger = get_logger(__name__)
 
 
+def load_image_to_base64(image_path: str) -> str:
+    with open(image_path, "rb") as image_file:
+        return b64encode(image_file.read()).decode("utf-8")
+
+
+def set_verbose_logging(verbose: bool):
+    if verbose:
+        logger.setLevel(logging.DEBUG)
+        basicConfig(level=logging.DEBUG)
+
+
 @app.command()
 def chat(
     query: str = typer.Option(
@@ -37,6 +50,12 @@ def chat(
         "-q",
         help="Query to run against the Ollama model",
     ),
+    model: str = typer.Option(
+        "gemma3:270m",
+        "--model",
+        "-m",
+        help="Model to use for structured output",
+    ),
     verbose: bool = typer.Option(
         False,
         "--verbose",
@@ -50,12 +69,14 @@ def chat(
         help="Enable streaming output",
     ),
 ):
-    # Set up logging
-    if verbose:
-        logger.setLevel(logging.DEBUG)
+    set_verbose_logging(verbose)
 
     logger.info("Running...")
-    chat_model = OllamaWrapper().chat_model
+    chat_model = OllamaWrapper(
+        settings=Settings(
+            ollama_model_chat=model,
+        ),
+    ).chat_model
 
     if stream:
         response = ""
@@ -94,19 +115,27 @@ def structured_output(
         "-q",
         help="Query to run against the Ollama model",
     ),
+    model: str = typer.Option(
+        "gemma3:270m",
+        "--model",
+        "-m",
+        help="Model to use for structured output",
+    ),
     verbose: bool = typer.Option(
         False,
         "--verbose",
         "-v",
         help="Enable verbose output",
     ),
 ):
-    # Set up logging
-    if verbose:
-        logger.setLevel(logging.DEBUG)
+    set_verbose_logging(verbose)
 
     logger.info("Running...")
-    chat_model = OllamaWrapper().chat_model
+    chat_model = OllamaWrapper(
+        settings=Settings(
+            ollama_model_chat=model,
+        ),
+    ).chat_model
     profile = chat_model.with_structured_output(
         schema=Profile,
     ).invoke(
@@ -117,6 +146,149 @@ def structured_output(
     logger.info(f"Output: {profile.model_dump_json(indent=2, exclude_none=True)}")
 
 
+@app.command()
+def image(
+    query: str = typer.Option(
+        "Please analyze the following image and answer the question",
+        "--query",
+        "-q",
+        help="Query to run with the image",
+    ),
+    file_path: str = typer.Option(
+        "./docs/images/streamlit.png",
+        "--file",
+        "-f",
+        help="Path to the image file to analyze",
+    ),
+    model: str = typer.Option(
+        "gemma3:4b-it-q4_K_M",
+        "--model",
+        "-m",
+        help="Model to use for image analysis",
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Enable verbose output",
+    ),
+):
+    set_verbose_logging(verbose)
+
+    base64_image = load_image_to_base64(file_path)
+    messages = {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": query,
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+            },
+        ],
+    }
+
+    logger.info("Running...")
+    chat_model = OllamaWrapper(
+        settings=Settings(
+            ollama_model_chat=model,
+        )
+    ).chat_model
+    response = chat_model.invoke(
+        input=[
+            messages,
+        ],
+    )
+    logger.debug(
+        response.model_dump_json(
+            indent=2,
+            exclude_none=True,
+        )
+    )
+    logger.info(f"Output: {response.content}")
+
+
+@app.command()
+def ocr(
+    query: str = typer.Option(
+        "Please extract all available details from the receipt image, including merchant/store name, transaction date (YYYY-MM-DD), total amount, and a fully itemized list (name, quantity, unit price, subtotal for each item).", # noqa
+        "--query",
+        "-q",
+        help="Query for OCR and comprehensive structured extraction from the receipt image",
+    ),
+    file_path: str = typer.Option(
+        "./docs/images/streamlit.png",
+        "--file",
+        "-f",
+        help="Path to the receipt image file for analysis",
+    ),
+    model: str = typer.Option(
+        "gemma3:4b-it-q4_K_M",
+        "--model",
+        "-m",
+        help="Model to use for OCR and structured information extraction",
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Enable verbose output",
+    ),
+):
+    set_verbose_logging(verbose)
+    from pydantic import BaseModel, Field
+
+    class Item(BaseModel):
+        item_name: str = Field(..., description="Exact name of the purchased item")
+        quantity: int = Field(..., description="Number of units purchased")
+        unit_price: float = Field(..., description="Unit price per item")
+        total_price: float = Field(..., description="Subtotal for this item")
+
+    class ReceiptInfo(BaseModel):
+        merchant_name: str = Field(..., description="Full name of the merchant/store")
+        transaction_date: str = Field(..., description="Transaction date in ISO format YYYY-MM-DD")
+        total_amount: float = Field(..., description="Total amount paid, including tax")
+        items: list[Item] = Field(
+            ...,
+            description="Detailed list of all purchased items with name, quantity, unit price, and subtotal",
+        )
+
+    base64_image = load_image_to_base64(file_path)
+    messages = {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": query,
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+            },
+        ],
+    }
+
+    logger.info("Running OCR and extracting detailed structured receipt information...")
+    chat_model = OllamaWrapper(
+        settings=Settings(
+            ollama_model_chat=model,
+        )
+    ).chat_model
+    response = chat_model.with_structured_output(ReceiptInfo).invoke(
+        input=[
+            messages,
+        ],
+    )
+    logger.info(
+        response.model_dump_json(
+            indent=2,
+            exclude_none=True,
+        )
+    )
+
+
 if __name__ == "__main__":
     load_dotenv(
         override=True,
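
Note on the pattern: both new commands send the image as a base64-encoded data URL inside an image_url content part of a LangChain message, and ocr additionally constrains the reply with with_structured_output. Below is a minimal, self-contained sketch of the same two call paths. It is an illustration, not the repo's code: it assumes the langchain-ollama package and a locally pulled vision-capable model, substitutes ChatOllama directly for OllamaWrapper(settings=Settings(...)).chat_model, and trims ReceiptInfo to two fields.

from base64 import b64encode

from langchain_ollama import ChatOllama  # assumed stand-in for the repo's wrapper
from pydantic import BaseModel, Field


def load_image_to_base64(image_path: str) -> str:
    # Same helper as in the diff: read the raw bytes and base64-encode them.
    with open(image_path, "rb") as image_file:
        return b64encode(image_file.read()).decode("utf-8")


class ReceiptInfo(BaseModel):
    # Trimmed version of the schema defined inside ocr() above.
    merchant_name: str = Field(..., description="Full name of the merchant/store")
    total_amount: float = Field(..., description="Total amount paid, including tax")


chat_model = ChatOllama(model="gemma3:4b-it-q4_K_M")  # default model tag from the diff
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What does this image show?"},
        {
            "type": "image_url",
            # The data URL carries the base64 payload to the model.
            "image_url": {"url": f"data:image/jpeg;base64,{load_image_to_base64('./docs/images/streamlit.png')}"},
        },
    ],
}

# Free-form vision chat, as in the image command.
response = chat_model.invoke([message])
print(response.content)

# Schema-constrained extraction, as in the ocr command.
receipt = chat_model.with_structured_output(ReceiptInfo).invoke([message])
print(receipt.model_dump_json(indent=2))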

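Assuming the truncated __main__ block ends by calling app(), as is usual for a typer script (the hunk above cuts off after load_dotenv), the new commands would be invoked from the repository root along these lines, with all flags defaulting as declared above:

python scripts/ollama_operator.py image --file ./docs/images/streamlit.png --query "What is shown here?"
python scripts/ollama_operator.py ocr --file ./docs/images/streamlit.png --model gemma3:4b-it-q4_K_M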