Skip to content

Commit f0c108a

Browse files
committed
feat: add support for dynamic image MIME type detection
1 parent a73d9ab commit f0c108a

File tree

8 files changed

+67
-35
lines changed

8 files changed

+67
-35
lines changed

Cargo.lock

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,4 @@ aws-config = "1.6.2"
115115
aws-sdk-s3 = "1.85.0"
116116
aws-sdk-sqs = "1.67.0"
117117
numpy = "0.25.0"
118+
infer = "0.19.0"

examples/image_search/main.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,23 @@ def image_object_embedding_flow(
7272
caption_ds = img["content"].transform(
7373
cocoindex.functions.ExtractByLlm(
7474
llm_spec=cocoindex.llm.LlmSpec(
75-
api_type=cocoindex.LlmApiType.GEMINI,
76-
model="gemini-1.5-flash",
75+
api_type=cocoindex.LlmApiType.OLLAMA,
76+
model="llama3.1",
7777
),
78+
# Replace with the spec below to use an OpenAI API model instead of Ollama
79+
# llm_spec=cocoindex.LlmSpec(
80+
# api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
81+
# Replace by this spec below, to use Gemini API model
82+
# llm_spec=cocoindex.LlmSpec(
83+
# api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"),
84+
# Replace by this spec below, to use Anthropic API model
85+
# llm_spec=cocoindex.LlmSpec(
86+
# api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
7887
instruction=(
79-
"Describe this image in one detailed, natural language sentence. "
80-
"Always explicitly name every visible animal species, object, and the main scene. "
81-
"Be specific about the type, color, and any distinguishing features. "
82-
"Avoid generic words like 'animal' or 'creature'—always use the most precise name (e.g., 'elephant', 'cat', 'lion', 'zebra'). "
83-
"If an animal is present, mention its species and what it is doing. "
84-
"For example: 'A large grey elephant standing in a grassy savanna, with trees in the background.'"
88+
"Describe the image in one detailed sentence. "
89+
"Name all visible animal species, objects, and the main scene. "
90+
"Be specific about type, color, and notable features. "
91+
"Mention what each animal is doing."
8592
),
8693
output_type=str,
8794
)

src/llm/anthropic.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use crate::llm::{
2-
LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat, ToJsonSchemaOptions,
2+
LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat,
3+
ToJsonSchemaOptions, detect_mime_type,
34
};
45
use anyhow::{Context, Result, bail};
56
use async_trait::async_trait;
@@ -43,11 +44,12 @@ impl LlmGenerationClient for Client {
4344
if let Some(image_bytes) = &request.image {
4445
let base64_image =
4546
base64::engine::general_purpose::STANDARD.encode(image_bytes.as_ref());
47+
let mime_type = detect_mime_type(image_bytes.as_ref())?;
4648
user_content_parts.push(serde_json::json!({
4749
"type": "image",
4850
"source": {
4951
"type": "base64",
50-
"media_type": "image/jpeg", // Assuming JPEG
52+
"media_type": mime_type,
5153
"data": base64_image,
5254
}
5355
}));

src/llm/gemini.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use crate::prelude::*;
22

33
use crate::llm::{
44
LlmEmbeddingClient, LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat,
5-
ToJsonSchemaOptions,
5+
ToJsonSchemaOptions, detect_mime_type,
66
};
77
use base64::Engine;
88
use phf::phf_map;
@@ -80,9 +80,10 @@ impl LlmGenerationClient for Client {
8080
if let Some(image_bytes) = &request.image {
8181
let base64_image =
8282
base64::engine::general_purpose::STANDARD.encode(image_bytes.as_ref());
83+
let mime_type = detect_mime_type(image_bytes.as_ref())?;
8384
user_parts.push(serde_json::json!({
8485
"inlineData": {
85-
"mimeType": "image/jpeg", // Assuming JPEG
86+
"mimeType": mime_type,
8687
"data": base64_image
8788
}
8889
}));

src/llm/mod.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
use crate::prelude::*;
22

33
use crate::base::json_schema::ToJsonSchemaOptions;
4+
use infer::Infer;
45
use schemars::schema::SchemaObject;
56
use std::borrow::Cow;
67

8+
// Process-wide, lazily-initialized `infer::Infer` instance shared by all
// MIME-type detections, so the matcher table is built only once.
static INFER: OnceLock<Infer> = OnceLock::new();
9+
710
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
811
pub enum LlmApiType {
912
Ollama,
@@ -136,3 +139,11 @@ pub fn new_llm_embedding_client(
136139
};
137140
Ok(client)
138141
}
142+
143+
pub fn detect_mime_type(bytes: &[u8]) -> Result<&'static str> {
144+
let infer = INFER.get_or_init(Infer::new);
145+
match infer.get(bytes) {
146+
Some(info) if info.mime_type().starts_with("image/") => Ok(info.mime_type()),
147+
_ => bail!("Unknown or unsupported image format"),
148+
}
149+
}

src/llm/openai.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::api_bail;
22

3-
use super::{LlmEmbeddingClient, LlmGenerationClient};
3+
use super::{LlmEmbeddingClient, LlmGenerationClient, detect_mime_type};
44
use anyhow::Result;
55
use async_openai::{
66
Client as OpenAIClient,
@@ -70,8 +70,8 @@ impl LlmGenerationClient for Client {
7070
Some(img_bytes) => {
7171
use base64::{Engine as _, engine::general_purpose::STANDARD};
7272
let base64_image = STANDARD.encode(img_bytes.as_ref());
73-
// TODO: Using jpeg for now.
74-
let image_url = format!("data:image/jpeg;base64,{}", base64_image);
73+
let mime_type = detect_mime_type(img_bytes.as_ref())?;
74+
let image_url = format!("data:{};base64,{}", mime_type, base64_image);
7575
ChatCompletionRequestUserMessageContent::Array(vec![
7676
ChatCompletionRequestUserMessageContentPart::Text(
7777
ChatCompletionRequestMessageContentPartText {

src/ops/functions/extract_by_llm.rs

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -129,39 +129,28 @@ impl SimpleFunctionFactoryBase for Factory {
129129
args_resolver: &mut OpArgsResolver<'_>,
130130
_context: &FlowInstanceContext,
131131
) -> Result<(Args, EnrichedValueType)> {
132-
let mut text: Option<ResolvedOpArg> = None;
133-
let mut image: Option<ResolvedOpArg> = None;
132+
let mut args = Args {
133+
text: None,
134+
image: None,
135+
};
134136

135-
// Handle the first positional argument
137+
// Handle positional argument
136138
if let Some(arg) = args_resolver.next_optional_arg("")? {
137139
match arg.typ.typ {
138-
ValueType::Basic(BasicValueType::Str) => text = Some(arg),
139-
ValueType::Basic(BasicValueType::Bytes) => image = Some(arg),
140+
ValueType::Basic(BasicValueType::Str) => args.text = Some(arg),
141+
ValueType::Basic(BasicValueType::Bytes) => args.image = Some(arg),
140142
_ => api_bail!(
141143
"Positional argument must be of type 'Str' or 'Bytes', got {}",
142144
arg.typ.typ
143145
),
144146
}
145147
}
146148

147-
// Named arguments
148-
for (name, slot, expected_type) in [
149-
("text", &mut text, BasicValueType::Str),
150-
("image", &mut image, BasicValueType::Bytes),
151-
] {
152-
if let Some(arg) = args_resolver.next_optional_arg(name)? {
153-
if slot.is_some() {
154-
api_bail!("'{}' argument provided multiple times", name);
155-
}
156-
*slot = Some(arg.expect_type(&ValueType::Basic(expected_type))?);
157-
}
158-
}
159-
160-
if text.is_none() && image.is_none() {
149+
if args.text.is_none() && args.image.is_none() {
161150
api_bail!("At least one of 'text' or 'image' must be provided");
162151
}
163152

164-
Ok((Args { text, image }, spec.output_type.clone()))
153+
Ok((args, spec.output_type.clone()))
165154
}
166155

167156
async fn build_executor(

0 commit comments

Comments
 (0)