
Commit 9454897

committed: WIP
Signed-off-by: Dennis Keck <[email protected]>
1 parent c07bda5 commit 9454897

15 files changed: +965 −65 lines changed

Cargo.lock

Lines changed: 8 additions & 0 deletions
Generated file; diff not rendered by default.

Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ members = [
     "examples/embeddings",
     "examples/simple",
     "examples/reranker",
+    "examples/mtmd",
 ]

 [workspace.dependencies]

examples/mtmd/Cargo.toml

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
[package]
name = "mtmd"
version = "0.1.86"
edition = "2021"

[dependencies]
llama-cpp-2 = { path = "../../llama-cpp-2", version = "0.1.86" }
clap = { workspace = true, features = ["derive"] }
# hf-hub = { workspace = true }
# anyhow = { workspace = true }
# encoding_rs = { workspace = true }

[features]
cuda = ["llama-cpp-2/cuda"]
metal = ["llama-cpp-2/metal"]
native = ["llama-cpp-2/native"]
vulkan = ["llama-cpp-2/vulkan"]

[lints]
workspace = true

[[example]]
name = "mtmd"
path = "src/mtmd.rs"

examples/mtmd/src/mtmd.rs

Lines changed: 298 additions & 0 deletions
@@ -0,0 +1,298 @@
//! Based on the mtmd cli example from llama.cpp.

use std::ffi::CString;
use std::io::{self, Write};
use std::path::Path;

use clap::Parser;

use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::context::LlamaContext;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::mtmd::*;

use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::model::{LlamaChatMessage, LlamaChatTemplate, LlamaModel, Special};
use llama_cpp_2::sampling::LlamaSampler;

/// Command line parameters for the MTMD CLI application
#[derive(clap::Parser, Debug)]
#[command(name = "mtmd-cli")]
#[command(about = "Experimental CLI for multimodal llama.cpp")]
pub struct MtmdCliParams {
    /// Path to the model file
    #[arg(short = 'm', long = "model", value_name = "PATH")]
    pub model_path: String,
    /// Path to the multimodal projection file
    #[arg(long = "mmproj", value_name = "PATH")]
    pub mmproj_path: String,
    /// Path to image file(s)
    #[arg(long = "image", value_name = "PATH")]
    pub images: Vec<String>,
    /// Path to audio file(s)
    #[arg(long = "audio", value_name = "PATH")]
    pub audio: Vec<String>,
    /// Text prompt to use as input to the model. May include media markers - else they will be added automatically.
    #[arg(short = 'p', long = "prompt", value_name = "TEXT")]
    pub prompt: String,
    /// Number of tokens to predict (-1 for unlimited)
    #[arg(
        short = 'n',
        long = "n-predict",
        value_name = "N",
        default_value = "-1"
    )]
    pub n_predict: i32,
    /// Number of threads
    #[arg(short = 't', long = "threads", value_name = "N", default_value = "4")]
    pub n_threads: i32,
    /// Maximum number of tokens in context
    #[arg(long = "n-tokens", value_name = "N", default_value = "2048")]
    pub n_tokens: usize,
    /// Chat template to use, default template if not provided
    #[arg(long = "chat-template", value_name = "TEMPLATE")]
    pub chat_template: Option<String>,
    /// Disable GPU acceleration
    #[arg(long = "no-gpu")]
    pub no_gpu: bool,
    /// Disable GPU offload for multimodal projection
    #[arg(long = "no-mmproj-offload")]
    pub no_mmproj_offload: bool,
    /// Media marker. If not provided, the default marker will be used.
    #[arg(long = "marker", value_name = "TEXT")]
    pub media_marker: Option<String>,
}

/// State of the MTMD CLI application.
#[allow(missing_debug_implementations)]
pub struct MtmdCliContext {
    /// The MTMD context for multimodal processing.
    pub mtmd_ctx: MtmdContext,
    /// The batch used for processing tokens.
    pub batch: LlamaBatch,
    /// The list of loaded bitmaps (images/audio).
    pub bitmaps: Vec<MtmdBitmap>,
    /// The number of past tokens processed.
    pub n_past: i32,
    /// The chat template used for formatting messages.
    pub chat_template: LlamaChatTemplate,
    /// The current chat messages history.
    pub chat: Vec<LlamaChatMessage>,
}

impl MtmdCliContext {
    /// Creates a new MTMD CLI context
    pub fn new(
        params: &MtmdCliParams,
        model: &LlamaModel,
    ) -> Result<Self, Box<dyn std::error::Error>> {
        // Initialize MTMD context
        let mtmd_params = MtmdContextParams {
            use_gpu: !params.no_gpu && !params.no_mmproj_offload,
            print_timings: true,
            n_threads: params.n_threads,
            media_marker: CString::new(
                params
                    .media_marker
                    .as_ref()
                    .unwrap_or(&llama_cpp_2::mtmd::mtmd_default_marker().to_string())
                    .clone(),
            )?,
        };

        let mtmd_ctx = MtmdContext::init_from_file(&params.mmproj_path, model, mtmd_params)?;

        let chat_template = model
            .chat_template(params.chat_template.as_deref())
            .map_err(|e| format!("Failed to get chat template: {}", e))?;

        let batch = LlamaBatch::new(params.n_tokens, 1);

        Ok(Self {
            mtmd_ctx,
            batch,
            chat: Vec::new(),
            bitmaps: Vec::new(),
            n_past: 0,
            chat_template,
        })
    }

    /// Loads media (image or audio) from the specified file path
    pub fn load_media(&mut self, path: &str) -> Result<(), MtmdBitmapError> {
        let bitmap = MtmdBitmap::from_file(&self.mtmd_ctx, path)?;
        self.bitmaps.push(bitmap);
        Ok(())
    }

    /// Evaluates a chat message, tokenizing and processing it through the model
    pub fn eval_message(
        &mut self,
        model: &LlamaModel,
        context: &mut LlamaContext,
        msg: LlamaChatMessage,
        add_bos: bool,
    ) -> Result<(), Box<dyn std::error::Error>> {
        self.chat.push(msg);

        // Format the message using chat template (simplified)
        let formatted_prompt = model.apply_chat_template(&self.chat_template, &self.chat, true)?;

        let input_text = MtmdInputText {
            text: formatted_prompt,
            add_special: add_bos,
            parse_special: true,
        };

        let bitmap_refs: Vec<&MtmdBitmap> = self.bitmaps.iter().collect();

        if bitmap_refs.is_empty() {
            println!("No bitmaps provided, only tokenizing text");
        } else {
            println!("Tokenizing with {} bitmaps", bitmap_refs.len());
        }

        // Tokenize the input
        let chunks = self.mtmd_ctx.tokenize(input_text, &bitmap_refs)?;

        println!("Tokenization complete, {} chunks created", chunks.len());

        // Clear bitmaps after tokenization
        self.bitmaps.clear();

        self.n_past = chunks.eval_chunks(&self.mtmd_ctx, &context, 0, 0, 1, true)?;
        Ok(())
    }

    /// Generates a response by sampling tokens from the model
    pub fn generate_response(
        &mut self,
        model: &LlamaModel,
        context: &mut LlamaContext,
        sampler: &mut LlamaSampler,
        n_predict: i32,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let mut generated_tokens = Vec::new();
        let max_predict = if n_predict < 0 { i32::MAX } else { n_predict };

        for _i in 0..max_predict {
            // Sample next token
            let token = sampler.sample(context, 0);
            generated_tokens.push(token);
            sampler.accept(token);

            // Check for end of generation
            if model.is_eog_token(token) {
                println!();
                break;
            }

            // Print token
            let piece = model.token_to_str(token, Special::Tokenize)?;
            print!("{}", piece);
            io::stdout().flush()?;

            // Prepare next batch
            self.batch.clear();
            self.batch.add(token, self.n_past, &[0], true)?;
            self.n_past += 1;

            // Decode
            context.decode(&mut self.batch)?;
        }

        Ok(())
    }
}

fn run_single_turn(
    ctx: &mut MtmdCliContext,
    model: &LlamaModel,
    context: &mut LlamaContext,
    sampler: &mut LlamaSampler,
    params: &MtmdCliParams,
) -> Result<(), Box<dyn std::error::Error>> {
    // Add media marker if not present
    let mut prompt = params.prompt.clone();
    let default_marker = llama_cpp_2::mtmd::mtmd_default_marker().to_string();
    let media_marker = params.media_marker.as_ref().unwrap_or(&default_marker);
    if !prompt.contains(media_marker) {
        prompt.push_str(media_marker);
    }

    // Load media files
    for image_path in &params.images {
        println!("Loading image: {}", image_path);
        ctx.load_media(image_path)?;
    }
    for audio_path in &params.audio {
        ctx.load_media(audio_path)?;
    }

    // Create user message
    let msg = LlamaChatMessage::new("user".to_string(), prompt)?;

    println!("Evaluating message: {:?}", msg);

    // Evaluate the message (prefill)
    ctx.eval_message(model, context, msg, true)?;

    // Generate response (decode)
    ctx.generate_response(model, context, sampler, params.n_predict)?;

    Ok(())
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let params = MtmdCliParams::parse();

    // Validate required parameters
    if !Path::new(&params.model_path).exists() {
        eprintln!("Error: Model file not found: {}", params.model_path);
        return Err("Model file not found".into());
    }

    if !Path::new(&params.mmproj_path).exists() {
        eprintln!(
            "Error: Multimodal projection file not found: {}",
            params.mmproj_path
        );
        return Err("Multimodal projection file not found".into());
    }

    println!("Loading model: {}", params.model_path);

    // Initialize backend
    let backend = LlamaBackend::init()?;

    // Setup model parameters
    let mut model_params = LlamaModelParams::default();
    if !params.no_gpu {
        model_params = model_params.with_n_gpu_layers(1000000); // Use all layers on GPU
    }

    // Load model
    let model = LlamaModel::load_from_file(&backend, &params.model_path, &model_params)?;

    // Create context
    let context_params = LlamaContextParams::default()
        .with_n_threads(params.n_threads)
        .with_n_batch(1);
    let mut context = model.new_context(&backend, context_params)?;

    // Create sampler
    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);

    println!("Model loaded successfully");
    println!("Loading mtmd projection: {}", params.mmproj_path);

    // Create the MTMD context
    let mut ctx = MtmdCliContext::new(&params, &model)?;

    run_single_turn(&mut ctx, &model, &mut context, &mut sampler, &params)?;

    println!("\n");

    Ok(())
}
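Putting the Cargo.toml above together with the flags defined by MtmdCliParams, a hypothetical invocation of the example might look as follows; the model, projector, and image paths are placeholders, not files referenced by this commit, and the exact cargo command may differ depending on the workspace setup:

# Hypothetical run; substitute real GGUF model/projector paths and your own image.
cargo run -p mtmd --example mtmd --features cuda -- \
    -m models/model.gguf \
    --mmproj models/mmproj.gguf \
    --image images/example.jpg \
    -p "Describe the image." \
    -n 128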

examples/reranker/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -17,4 +17,4 @@ native = ["llama-cpp-2/native"]
 vulkan = ["llama-cpp-2/vulkan"]

 [lints]
-workspace = true
+workspace = true

llama-cpp-2/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ pub mod llama_backend;
 pub mod llama_batch;
 mod log;
 pub mod model;
+pub mod mtmd;
 pub mod sampling;
 pub mod timing;
 pub mod token;

llama-cpp-2/src/model.rs

Lines changed: 5 additions & 4 deletions
@@ -13,9 +13,9 @@ use crate::model::params::LlamaModelParams;
 use crate::token::LlamaToken;
 use crate::token_type::{LlamaTokenAttr, LlamaTokenAttrs};
 use crate::{
-    ApplyChatTemplateError, ChatTemplateError, LlamaContextLoadError,
-    LlamaLoraAdapterInitError, LlamaModelLoadError, MetaValError, NewLlamaChatMessageError,
-    StringToTokenError, TokenToStringError,
+    ApplyChatTemplateError, ChatTemplateError, LlamaContextLoadError, LlamaLoraAdapterInitError,
+    LlamaModelLoadError, MetaValError, NewLlamaChatMessageError, StringToTokenError,
+    TokenToStringError,
 };

 pub mod params;

@@ -488,7 +488,8 @@ impl LlamaModel {
     pub fn n_head_kv(&self) -> u32 {
         // It's never possible for this to panic because while the API interface is defined as an int32_t,
         // the field it's accessing is a uint32_t.
-        u32::try_from(unsafe { llama_cpp_sys_2::llama_model_n_head_kv(self.model.as_ptr()) }).unwrap()
+        u32::try_from(unsafe { llama_cpp_sys_2::llama_model_n_head_kv(self.model.as_ptr()) })
+            .unwrap()
     }

     /// Get metadata value as a string by key name
