//! Example demonstrating how to load split GGUF models.
//!
//! This example shows how to:
//! - Load a model split across multiple files
//! - Use utility functions to work with split file naming conventions
//! - Generate text from a split model
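//!
//! The invocations below are illustrative sketches only: the example name
//! (`load_split_model`) and the GGUF file names are placeholders, so adjust
//! them to match how this example and your model files are actually named.
//!
//! ```bash
//! # List every split file explicitly
//! cargo run --example load_split_model -- \
//!     -m model-00001-of-00002.gguf -m model-00002-of-00002.gguf
//!
//! # Or derive the split paths from a prefix and a split count
//! cargo run --example load_split_model -- -p model -n 2 --prompt "Once upon a time"
//! ```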

use anyhow::Result;
use clap::Parser;
use llama_cpp_2::{
    context::params::LlamaContextParams,
    llama_backend::LlamaBackend,
    llama_batch::LlamaBatch,
    model::{params::LlamaModelParams, AddBos, LlamaModel},
    sampling::LlamaSampler,
};
use std::io::{self, Write};
use std::num::NonZeroU32;
use std::path::PathBuf;

/// Command line arguments for the split model example
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Paths to the split model files (can be specified multiple times)
    #[arg(short = 'm', long = "model", required = true, num_args = 1..)]
    model_paths: Vec<PathBuf>,

    /// Alternatively, provide a split file prefix; the split paths are generated from it
    #[arg(short = 'p', long = "prefix", conflicts_with = "model_paths")]
    prefix: Option<String>,

    /// Number of splits (required if using --prefix)
    #[arg(short = 'n', long = "num-splits", requires = "prefix")]
    num_splits: Option<u32>,

    /// Prompt to use for generation
    #[arg(short = 't', long = "prompt", default_value = "Once upon a time")]
    prompt: String,

    /// Number of tokens to generate
    #[arg(short = 'g', long = "n-predict", default_value_t = 128)]
    n_predict: i32,

    /// Number of layers to offload to the GPU
    #[arg(short = 'l', long = "n-gpu-layers", default_value_t = 0)]
    n_gpu_layers: u32,

    /// Context size
    #[arg(short = 'c', long = "ctx-size", default_value_t = 2048)]
    ctx_size: u32,

    /// Temperature for sampling
    #[arg(long = "temp", default_value_t = 0.8)]
    temperature: f32,

    /// Top-p for sampling
    #[arg(long = "top-p", default_value_t = 0.95)]
    top_p: f32,

    /// Seed for random number generation
    #[arg(long = "seed", default_value_t = 1234)]
    seed: u32,
}

fn main() -> Result<()> {
    let args = Args::parse();

    // Determine the model paths
    let model_paths = if let Some(prefix) = args.prefix {
        let num_splits = args.num_splits.expect("num-splits required with prefix");

        // Generate split paths using the utility function
        let mut paths = Vec::new();
        for i in 1..=num_splits {
            let path = LlamaModel::split_path(&prefix, i as i32, num_splits as i32);
            paths.push(PathBuf::from(path));
        }
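        // Assuming `split_path` follows llama.cpp's usual
        // "<prefix>-%05d-of-%05d.gguf" naming scheme, a prefix of "model" with
        // two splits would yield "model-00001-of-00002.gguf" and
        // "model-00002-of-00002.gguf".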

        println!("Generated split paths:");
        for path in &paths {
            println!(" - {}", path.display());
        }

        paths
    } else {
        args.model_paths
    };

    // Verify all split files exist
    for path in &model_paths {
        if !path.exists() {
            eprintln!("Error: Split file not found: {}", path.display());
            std::process::exit(1);
        }
    }

    println!("Loading model from {} splits...", model_paths.len());

    // Initialize the backend
    let backend = LlamaBackend::init()?;

    // Set up model parameters
    let mut model_params = LlamaModelParams::default();
    if args.n_gpu_layers > 0 {
        model_params = model_params.with_n_gpu_layers(args.n_gpu_layers);
    }

    // Load the model from splits
    let model = LlamaModel::load_from_splits(&backend, &model_paths, &model_params)?;
    println!("Model loaded successfully!");

    // Get model info
    let n_vocab = model.n_vocab();
    println!("Model vocabulary size: {}", n_vocab);

    // Create context
    let ctx_params = LlamaContextParams::default()
        .with_n_ctx(Some(NonZeroU32::new(args.ctx_size).unwrap()));

    let mut ctx = model.new_context(&backend, ctx_params)?;
    println!("Context created with size: {}", args.ctx_size);

    // Tokenize the prompt
    let tokens = model.str_to_token(&args.prompt, AddBos::Always)?;
    println!("Prompt tokenized into {} tokens", tokens.len());

    // Create batch
    let mut batch = LlamaBatch::new(512, 1);

    // Add tokens to batch
    let last_index = tokens.len() - 1;
    for (i, token) in tokens.iter().enumerate() {
        let is_last = i == last_index;
        batch.add(*token, i as i32, &[0], is_last)?;
    }

    // Decode the batch
    ctx.decode(&mut batch)?;
    println!("Initial prompt processed");

    // Set up sampling; the final `dist` sampler selects a token from the
    // filtered distribution using the configured seed
    let mut sampler = LlamaSampler::chain_simple([
        LlamaSampler::temp(args.temperature),
        LlamaSampler::top_p(args.top_p, 1),
        LlamaSampler::dist(args.seed),
    ]);

    // Generate text
    print!("{}", args.prompt);
    io::stdout().flush()?;

    let mut n_cur = batch.n_tokens();
    let mut n_decode = 0;

    while n_decode < args.n_predict {
        // Sample the next token
        let new_token = sampler.sample(&ctx, batch.n_tokens() - 1);
        sampler.accept(new_token);

        // Check for end-of-generation
        if model.is_eog_token(new_token) {
            println!();
            break;
        }

        // Print the token
        let piece = model.token_to_str(new_token, llama_cpp_2::model::Special::Tokenize)?;
        print!("{}", piece);
        io::stdout().flush()?;

        // Prepare the next batch with the sampled token
        batch.clear();
        batch.add(new_token, n_cur, &[0], true)?;
        n_cur += 1;

        // Decode
        ctx.decode(&mut batch)?;
        n_decode += 1;
    }

    println!("\n\nGeneration complete!");
    println!("Generated {} tokens", n_decode);

    // Demonstrate the split_prefix utility
    if let Some(first_path) = model_paths.first() {
        if let Some(path_str) = first_path.to_str() {
            // Try to extract the prefix from the first split file
            if let Some(prefix) = LlamaModel::split_prefix(path_str, 1, model_paths.len() as i32) {
                println!("\nExtracted prefix from first split: {}", prefix);
            }
        }
    }

    Ok(())
}