Skip to content

Commit f0c108a

Browse files
committed
feat: add support for dynamic image MIME type detection
1 parent a73d9ab commit f0c108a

File tree

8 files changed

+67
-35
lines changed

8 files changed

+67
-35
lines changed

Cargo.lock

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,4 @@ aws-config = "1.6.2"
115115
aws-sdk-s3 = "1.85.0"
116116
aws-sdk-sqs = "1.67.0"
117117
numpy = "0.25.0"
118+
infer = "0.19.0"

examples/image_search/main.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,23 @@ def image_object_embedding_flow(
7272
caption_ds = img["content"].transform(
7373
cocoindex.functions.ExtractByLlm(
7474
llm_spec=cocoindex.llm.LlmSpec(
75-
api_type=cocoindex.LlmApiType.GEMINI,
76-
model="gemini-1.5-flash",
75+
api_type=cocoindex.LlmApiType.OLLAMA,
76+
model="llama3.1",
7777
),
78+
# Replace with the spec below to use an OpenAI API model instead of Ollama
79+
# llm_spec=cocoindex.LlmSpec(
80+
# api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
81+
# Replace by this spec below, to use Gemini API model
82+
# llm_spec=cocoindex.LlmSpec(
83+
# api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"),
84+
# Replace by this spec below, to use Anthropic API model
85+
# llm_spec=cocoindex.LlmSpec(
86+
# api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
7887
instruction=(
79-
"Describe this image in one detailed, natural language sentence. "
80-
"Always explicitly name every visible animal species, object, and the main scene. "
81-
"Be specific about the type, color, and any distinguishing features. "
82-
"Avoid generic words like 'animal' or 'creature'—always use the most precise name (e.g., 'elephant', 'cat', 'lion', 'zebra'). "
83-
"If an animal is present, mention its species and what it is doing. "
84-
"For example: 'A large grey elephant standing in a grassy savanna, with trees in the background.'"
88+
"Describe the image in one detailed sentence. "
89+
"Name all visible animal species, objects, and the main scene. "
90+
"Be specific about type, color, and notable features. "
91+
"Mention what each animal is doing."
8592
),
8693
output_type=str,
8794
)

src/llm/anthropic.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use crate::llm::{
2-
LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat, ToJsonSchemaOptions,
2+
LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat,
3+
ToJsonSchemaOptions, detect_mime_type,
34
};
45
use anyhow::{Context, Result, bail};
56
use async_trait::async_trait;
@@ -43,11 +44,12 @@ impl LlmGenerationClient for Client {
4344
if let Some(image_bytes) = &request.image {
4445
let base64_image =
4546
base64::engine::general_purpose::STANDARD.encode(image_bytes.as_ref());
47+
let mime_type = detect_mime_type(image_bytes.as_ref())?;
4648
user_content_parts.push(serde_json::json!({
4749
"type": "image",
4850
"source": {
4951
"type": "base64",
50-
"media_type": "image/jpeg", // Assuming JPEG
52+
"media_type": mime_type,
5153
"data": base64_image,
5254
}
5355
}));

src/llm/gemini.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use crate::prelude::*;
22

33
use crate::llm::{
44
LlmEmbeddingClient, LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat,
5-
ToJsonSchemaOptions,
5+
ToJsonSchemaOptions, detect_mime_type,
66
};
77
use base64::Engine;
88
use phf::phf_map;
@@ -80,9 +80,10 @@ impl LlmGenerationClient for Client {
8080
if let Some(image_bytes) = &request.image {
8181
let base64_image =
8282
base64::engine::general_purpose::STANDARD.encode(image_bytes.as_ref());
83+
let mime_type = detect_mime_type(image_bytes.as_ref())?;
8384
user_parts.push(serde_json::json!({
8485
"inlineData": {
85-
"mimeType": "image/jpeg", // Assuming JPEG
86+
"mimeType": mime_type,
8687
"data": base64_image
8788
}
8889
}));

src/llm/mod.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
use crate::prelude::*;
22

33
use crate::base::json_schema::ToJsonSchemaOptions;
4+
use infer::Infer;
45
use schemars::schema::SchemaObject;
56
use std::borrow::Cow;
67

8+
// Process-wide, lazily-initialized `infer::Infer` instance shared by all
// MIME-type detections, so the matcher table is built only once.
static INFER: OnceLock<Infer> = OnceLock::new();
9+
710
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
811
pub enum LlmApiType {
912
Ollama,
@@ -136,3 +139,11 @@ pub fn new_llm_embedding_client(
136139
};
137140
Ok(client)
138141
}
142+
143+
pub fn detect_mime_type(bytes: &[u8]) -> Result<&'static str> {
144+
let infer = INFER.get_or_init(Infer::new);
145+
match infer.get(bytes) {
146+
Some(info) if info.mime_type().starts_with("image/") => Ok(info.mime_type()),
147+
_ => bail!("Unknown or unsupported image format"),
148+
}
149+
}

src/llm/openai.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::api_bail;
22

3-
use super::{LlmEmbeddingClient, LlmGenerationClient};
3+
use super::{LlmEmbeddingClient, LlmGenerationClient, detect_mime_type};
44
use anyhow::Result;
55
use async_openai::{
66
Client as OpenAIClient,
@@ -70,8 +70,8 @@ impl LlmGenerationClient for Client {
7070
Some(img_bytes) => {
7171
use base64::{Engine as _, engine::general_purpose::STANDARD};
7272
let base64_image = STANDARD.encode(img_bytes.as_ref());
73-
// TODO: Using jpeg for now.
74-
let image_url = format!("data:image/jpeg;base64,{}", base64_image);
73+
let mime_type = detect_mime_type(img_bytes.as_ref())?;
74+
let image_url = format!("data:{};base64,{}", mime_type, base64_image);
7575
ChatCompletionRequestUserMessageContent::Array(vec![
7676
ChatCompletionRequestUserMessageContentPart::Text(
7777
ChatCompletionRequestMessageContentPartText {

src/ops/functions/extract_by_llm.rs

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -129,39 +129,28 @@ impl SimpleFunctionFactoryBase for Factory {
129129
args_resolver: &mut OpArgsResolver<'_>,
130130
_context: &FlowInstanceContext,
131131
) -> Result<(Args, EnrichedValueType)> {
132-
let mut text: Option<ResolvedOpArg> = None;
133-
let mut image: Option<ResolvedOpArg> = None;
132+
let mut args = Args {
133+
text: None,
134+
image: None,
135+
};
134136

135-
// Handle the first positional argument
137+
// Handle positional argument
136138
if let Some(arg) = args_resolver.next_optional_arg("")? {
137139
match arg.typ.typ {
138-
ValueType::Basic(BasicValueType::Str) => text = Some(arg),
139-
ValueType::Basic(BasicValueType::Bytes) => image = Some(arg),
140+
ValueType::Basic(BasicValueType::Str) => args.text = Some(arg),
141+
ValueType::Basic(BasicValueType::Bytes) => args.image = Some(arg),
140142
_ => api_bail!(
141143
"Positional argument must be of type 'Str' or 'Bytes', got {}",
142144
arg.typ.typ
143145
),
144146
}
145147
}
146148

147-
// Named arguments
148-
for (name, slot, expected_type) in [
149-
("text", &mut text, BasicValueType::Str),
150-
("image", &mut image, BasicValueType::Bytes),
151-
] {
152-
if let Some(arg) = args_resolver.next_optional_arg(name)? {
153-
if slot.is_some() {
154-
api_bail!("'{}' argument provided multiple times", name);
155-
}
156-
*slot = Some(arg.expect_type(&ValueType::Basic(expected_type))?);
157-
}
158-
}
159-
160-
if text.is_none() && image.is_none() {
149+
if args.text.is_none() && args.image.is_none() {
161150
api_bail!("At least one of 'text' or 'image' must be provided");
162151
}
163152

164-
Ok((Args { text, image }, spec.output_type.clone()))
153+
Ok((args, spec.output_type.clone()))
165154
}
166155

167156
async fn build_executor(

0 commit comments

Comments
 (0)