From 00d4297bee3c268f3ee7caabdc54b9979a217e2d Mon Sep 17 00:00:00 2001
From: vineet <vineet.suryan@collabora.com>
Date: Wed, 8 Oct 2025 17:25:05 -0400
Subject: [PATCH 1/4] finetune-lora: Add checkpoint saving & resuming from
 saved checkpoint

This commit adds checkpointing for fine-tuning:
- Add checkpoint saving every N steps with --checkpoint-save-steps
- Save complete training state: model weights, optimizer state, metadata
- Implement two-phase optimizer state loading to avoid memory issues
- Add --resume-from and --auto-resume functionality
- Store optimizer momentum/variance tensors in GGUF format
- Add checkpoint validation for rank, alpha, and target modules
- Update README.md with checkpointing documentation

Optimizer state is loaded in two phases: the iteration count is loaded during
initialization, while the tensor data (grad_m, grad_v) is loaded after
ggml_opt_alloc creates the proper tensor structures.
---
 examples/training/README.md         |  31 ++
 examples/training/finetune-lora.cpp | 420 +++++++++++++++++++++++++---
 examples/training/finetune.cpp      |  19 +-
 ggml/include/ggml-opt.h             |  13 +
 ggml/src/ggml-opt.cpp               | 154 ++++++++++
 include/llama.h                     |  16 +-
 src/llama-context.cpp               |  55 +++-
 src/llama-context.h                 |  15 +-
 src/llama-lora-training.cpp         |  46 ++-
 src/llama-lora-training.h           |   1 +
 10 files changed, 716 insertions(+), 54 deletions(-)

diff --git a/examples/training/README.md b/examples/training/README.md
index ed255a0e1af..7d1cda2a9ca 100644
--- a/examples/training/README.md
+++ b/examples/training/README.md
@@ -36,6 +36,14 @@ the base model frozen, making it memory-efficient.
 # Fine-tune existing LoRA adapter
 ./build/bin/llama-finetune-lora -m base_model.gguf -f dataset.txt --lora existing_adapter.gguf \
   --output-adapter improved_adapter.gguf -ngl 999 -c 512 -b 512 -ub 512
+
+# Training with checkpointing
+./build/bin/llama-finetune-lora -m model.gguf -f dataset.txt -ngl 999 -c 512 -b 512 -ub 512 \
+  --checkpoint-save-steps 50 --checkpoint-save-dir "./lora_checkpoints"
+
+# Resume training from checkpoint
+./build/bin/llama-finetune-lora -m model.gguf -f dataset.txt -ngl 999 -c 512 -b 512 -ub 512 \
+  --resume-from "./lora_checkpoints/checkpoint_step_00000150/"
 ```
 
 
@@ -53,6 +61,12 @@ the base model frozen, making it memory-efficient.
   - Default: `attn_q,attn_k,attn_v,attn_o` (attention modules)
 - `--output-adapter PATH` - Output adapter filename (default: auto-generated)
 
+#### Checkpointing
+- `--checkpoint-save-steps N` - Save checkpoint every N training steps (default: 100)
+- `--checkpoint-save-dir PATH` - Directory for checkpoints (default: `./checkpoints`)
+- `--resume-from PATH` - Resume training from specific checkpoint directory
+- `--auto-resume` - Automatically resume from latest checkpoint in save directory
+
 #### Standard Parameters
 - `-m MODEL` - Base model file (.gguf)
 - `-f FILE` - Training dataset
@@ -68,11 +82,28 @@ After training, you'll get a small adapter file. Use it with the original base m
 ./build/bin/llama-cli -m base_model.gguf --lora trained_adapter.gguf -ngl 999
 ```
 
+### Checkpointing
+
+The LoRA fine-tuning supports automatic checkpointing to save and resume training progress:
+
+#### Features
+- **Automatic saving**: Model and optimizer state saved every N training steps
+- **Complete state**: Includes LoRA weights, optimizer momentum, and training metadata
+- **Resume capability**: Continue training from exact step with full optimizer state
+- **Auto-resume**: Automatically find and resume from latest checkpoint
+
+#### Checkpoint Structure
+Each checkpoint directory contains:
+- `model.gguf` - LoRA adapter weights
+- `optimizer.gguf` - Optimizer state (momentum, variance, iteration)
+- `metadata.json` - Training metadata (epoch, LoRA rank, LoRA alpha, target modules), stored as plain `key=value` lines despite the `.json` extension
+
 ### Troubleshooting
 
 - **Out of memory**: Reduce context length (`-c 256`), lower rank, or use fewer target modules
 - **Poor quality**: Increase rank, add more target modules, or train longer
 - **Large adapter**: Reduce rank or limit target modules
+- **Checkpoint issues**: Ensure checkpoint directory contains all required files (model.gguf, optimizer.gguf, metadata.json)
 
 ### Help
 
diff --git a/examples/training/finetune-lora.cpp b/examples/training/finetune-lora.cpp
index c12e119613b..0f166757246 100644
--- a/examples/training/finetune-lora.cpp
+++ b/examples/training/finetune-lora.cpp
@@ -3,18 +3,19 @@
 #include "log.h"
 #include "llama.h"
 
-#include <cmath>
-#include <cstdio>
 #include <cstring>
-#include <ctime>
 #include <vector>
 #include <fstream>
+#include <filesystem>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
 
+struct checkpoint_callback_data;
+static checkpoint_callback_data* g_checkpoint_data = nullptr;
+
 static uint32_t parse_lora_modules(const std::string& modules_str) {
     if (modules_str.empty()) {
         return LLAMA_LORA_TARGET_ATTN_Q | LLAMA_LORA_TARGET_ATTN_K | LLAMA_LORA_TARGET_ATTN_V | LLAMA_LORA_TARGET_ATTN_O;
@@ -55,13 +56,20 @@ static uint32_t parse_lora_modules(const std::string& modules_str) {
 }
 
 static void print_lora_usage() {
-    printf("\nLoRA Fine-tuning Parameters:\n");
+    printf("\n----- LoRA Fine-tuning Parameters -----\n");
     printf("  --lora-rank N              LoRA rank (default: 8, range: 1-512)\n");
     printf("  --lora-alpha N             LoRA alpha scaling factor (default: 16.0, range: 0.1-1000.0)\n");
     printf("  --lora-modules MODULES     Target modules as comma-separated list (default: attn_q,attn_k,attn_v,attn_o)\n");
     printf("                             Available modules: attn_q, attn_k, attn_v, attn_o, ffn_gate, ffn_up, ffn_down, output, all\n");
     printf("                             Examples: \"attn_q,attn_v\" or \"all\" or \"attn_q,attn_k,attn_v,attn_o,ffn_gate,ffn_up,ffn_down\"\n");
     printf("  --output-adapter PATH      Output path for trained adapter (default: auto-generated)\n");
+    printf("\nTraining Options:\n");
+    printf("  --num-epochs N             Number of training epochs (default: 1)\n");
+    printf("\nCheckpointing Options:\n");
+    printf("  --checkpoint-save-steps N  Save checkpoint every N training steps (default: 100)\n");
+    printf("  --checkpoint-save-dir PATH Directory for checkpoints (default: ./checkpoints)\n");
+    printf("  --resume-from PATH         Resume training from specific checkpoint file\n");
+    printf("  --auto-resume              Automatically resume from latest checkpoint in save dir\n");
     printf("\nExamples:\n");
     printf("  # Train with rank=16, alpha=32, all attention modules\n");
     printf("  %s -m model.gguf -f dataset.txt --lora-rank 16 --lora-alpha 32 --lora-modules attn_q,attn_k,attn_v,attn_o\n", "finetune-lora");
@@ -70,16 +78,207 @@ static void print_lora_usage() {
     printf("\n");
 }
 
-int main(int argc, char ** argv) {
-    common_params params;
+struct checkpoint_metadata { // training parameters persisted in a checkpoint's metadata.json
+    int32_t epoch;            // epoch that was in progress when the checkpoint was written
+    int32_t lora_rank;        // LoRA rank used for training; must match on resume
+    float lora_alpha;         // LoRA alpha scaling factor; must match on resume
+    uint32_t target_modules;  // bitmask of LLAMA_LORA_TARGET_* flags; must match on resume
+};
+
+static std::string get_checkpoint_filename(const std::string& checkpoint_dir, int64_t step) {
+    std::ostringstream oss; // builds "<checkpoint_dir>/checkpoint_step_<step>"
+    oss << checkpoint_dir << "/checkpoint_step_" << std::setfill('0') << std::setw(8) << step; // step is zero-padded to a minimum width of 8
+    return oss.str(); // despite the function name, the caller uses the result as a checkpoint directory path
+}
+
+static std::string find_latest_checkpoint(const std::string& checkpoint_dir) {
+    if (!std::filesystem::is_directory(checkpoint_dir)) { // missing, or a non-directory that directory_iterator would throw on
+        return "";
+    }
+    // Returns the path of the "checkpoint_step_<N>" sub-directory with the largest N, or "" when there is none.
+    std::string latest_checkpoint;
+    int64_t latest_step = -1;
+    const std::string prefix = "checkpoint_step_";
+    for (const auto& entry : std::filesystem::directory_iterator(checkpoint_dir)) {
+        if (entry.is_directory()) {
+            std::string dirname = entry.path().filename().string();
+            if (dirname.size() > prefix.size() && dirname.compare(0, prefix.size(), prefix) == 0) {
+                std::string step_str = dirname.substr(prefix.size()); // whole suffix: steps >= 10^8 need more than 8 digits
+                try {
+                    int64_t step = std::stoll(step_str);
+                    if (step > latest_step) {
+                        latest_step = step;
+                        latest_checkpoint = entry.path().string();
+                    }
+                } catch (const std::exception&) {
+                    continue; // suffix is not a number, so this is not one of our checkpoints
+                }
+            }
+        }
+    }
+
+    return latest_checkpoint;
+}
+
+static bool save_checkpoint(llama_context* ctx, llama_adapter_lora* adapter, const checkpoint_metadata& metadata, const std::string& checkpoint_dir) {
+    // Writes a full checkpoint into checkpoint_dir (created if missing); returns false on the first failure.
+    std::error_code ec; // non-throwing filesystem overloads: this function is reached from the optimizer's progress callback
+    std::filesystem::create_directories(checkpoint_dir, ec); // succeeds without error when the directory already exists
+    if (ec || !std::filesystem::is_directory(checkpoint_dir, ec)) {
+        LOG_ERR("Failed to create checkpoint directory: %s\n", checkpoint_dir.c_str());
+        return false;
+    }
+    if (!llama_lora_save_checkpoint(adapter, checkpoint_dir.c_str(), llama_get_model(ctx), ctx)) {
+        LOG_ERR("Failed to save LoRA checkpoint\n");
+        return false;
+    }
+    // Metadata is plain key=value text; precision 9 (max_digits10 for float) lets lora_alpha round-trip exactly on resume.
+    std::string meta_path = checkpoint_dir + "/metadata.json";
+    std::ofstream meta_file(meta_path);
+    meta_file.precision(9);
+    meta_file << "epoch=" << metadata.epoch << "\n";
+    meta_file << "lora_rank=" << metadata.lora_rank << "\n";
+    meta_file << "lora_alpha=" << metadata.lora_alpha << "\n";
+    meta_file << "target_modules=" << metadata.target_modules << "\n";
+    meta_file.close();
+    if (meta_file.fail()) { // covers a failed open, a failed write and a failed flush on close
+        LOG_ERR("Failed to save checkpoint metadata\n");
+        return false;
+    }
+
+    LOG_INF("Checkpoint saved successfully to %s\n", checkpoint_dir.c_str());
+    return true;
+}
+
+static bool validate_checkpoint_metadata(const std::string& checkpoint_path, checkpoint_metadata& metadata) {
+    // Reads <checkpoint_path>/metadata.json (plain key=value lines) into `metadata`.
+    // Returns false when the directory or file is missing, cannot be opened, or holds a malformed value.
+    if (!std::filesystem::exists(checkpoint_path)) {
+        LOG_ERR("Checkpoint directory does not exist: %s\n", checkpoint_path.c_str());
+        return false;
+    }
+
+    LOG_INF("Loading checkpoint from: %s\n", checkpoint_path.c_str());
+
+    const std::string meta_path = checkpoint_path + "/metadata.json";
+    if (!std::filesystem::exists(meta_path)) {
+        LOG_ERR("Checkpoint metadata file not found: %s\n", meta_path.c_str());
+        return false;
+    }
+
+    std::ifstream meta_file(meta_path);
+    if (!meta_file.is_open()) {
+        LOG_ERR("Failed to open checkpoint metadata file\n");
+        return false;
+    }
+
+    // Unknown keys are ignored; a key absent from the file leaves its field as the caller initialised it.
+    std::string line;
+    while (std::getline(meta_file, line)) {
+        const size_t eq_pos = line.find('=');
+        if (eq_pos == std::string::npos) {
+            continue; // not a key=value line
+        }
+        const std::string key   = line.substr(0, eq_pos);
+        const std::string value = line.substr(eq_pos + 1);
+        try { // std::stoi / stof / stoul throw on non-numeric or out-of-range text
+            if      (key == "epoch")          { metadata.epoch          = std::stoi(value); }
+            else if (key == "lora_rank")      { metadata.lora_rank      = std::stoi(value); }
+            else if (key == "lora_alpha")     { metadata.lora_alpha     = std::stof(value); }
+            else if (key == "target_modules") { metadata.target_modules = std::stoul(value); }
+        } catch (const std::exception&) {
+            LOG_ERR("Invalid value for '%s' in checkpoint metadata: %s\n", key.c_str(), meta_path.c_str());
+            return false;
+        }
+    }
+
+    LOG_INF("Checkpoint loaded successfully\n");
+    return true;
+}
+
+
+struct checkpoint_callback_data { // state shared with checkpoint_progress_callback through g_checkpoint_data
+    llama_context* ctx;               // context handed to save_checkpoint
+    llama_adapter_lora* adapter;      // adapter handed to save_checkpoint
+    int32_t checkpoint_save_steps;    // save every N training steps; <= 0 disables saving
+    std::string checkpoint_save_dir;  // parent directory of the checkpoint_step_* directories
+    int64_t global_step;              // training batches counted so far; incremented by the callback while saving is enabled
+    int64_t initial_step;             // step this run started from (set by main; not read by the callback)
+    int32_t current_epoch;            // epoch recorded in checkpoint metadata
+    int32_t lora_rank;                // recorded in checkpoint metadata
+    float lora_alpha;                 // recorded in checkpoint metadata
+    uint32_t target_modules;          // recorded in checkpoint metadata
+    float learning_rate;              // NOTE(review): set by main but not read by the callback in the visible code
+    std::string model_path;           // NOTE(review): set by main but not read by the callback in the visible code
+    std::string dataset_path;         // NOTE(review): set by main but not read by the callback in the visible code
+};
+
+static void checkpoint_progress_callback(
+        bool               train,
+        ggml_opt_context_t opt_ctx,
+        ggml_opt_dataset_t dataset,
+        ggml_opt_result_t  result,
+        int64_t            ibatch,
+        int64_t            ibatch_max,
+        int64_t            t_start_us) {
+    ggml_opt_epoch_callback_progress_bar(train, opt_ctx, dataset, result, ibatch, ibatch_max, t_start_us);
+    // The progress bar above runs for every batch; the checkpoint bookkeeping below applies to training batches only.
+    if (!train) return;
+    // Callback state lives in a file-level pointer because the callback signature carries no user-data argument.
+    checkpoint_callback_data* cb_data = g_checkpoint_data;
+
+    if (!cb_data) {
+        LOG_ERR("Checkpoint callback data is null!\n");
+        return;
+    }
+
+    if (cb_data->checkpoint_save_steps <= 0) {
+        return;
+    }
+    // One call per training batch: advance the global step, then save whenever it reaches a multiple of the interval.
+    cb_data->global_step++;
+
+    if (cb_data->global_step % cb_data->checkpoint_save_steps == 0) {
+        if (!cb_data->ctx) {
+            LOG_ERR("Context is null in checkpoint callback!\n");
+            return;
+        }
+
+        if (!cb_data->adapter) {
+            LOG_ERR("LoRA adapter is null in checkpoint callback!\n");
+            return;
+        }
+
+        checkpoint_metadata meta = {
+            /*epoch          =*/ cb_data->current_epoch,
+            /*lora_rank      =*/ cb_data->lora_rank,
+            /*lora_alpha     =*/ cb_data->lora_alpha,
+            /*target_modules =*/ cb_data->target_modules,
+        };
+
+        std::string checkpoint_path = get_checkpoint_filename(cb_data->checkpoint_save_dir, cb_data->global_step);
+        // int64_t is not 'long' on every platform (e.g. 64-bit Windows), so it is printed through long long.
+        if (!save_checkpoint(cb_data->ctx, cb_data->adapter, meta, checkpoint_path)) {
+            LOG_ERR("Failed to save checkpoint at step %lld\n", (long long) cb_data->global_step);
+        }
+    }
+}
 
+struct finetune_params {
     int32_t lora_rank = 8;
     float lora_alpha = 16.0f;
     std::string lora_modules_str;
     std::string output_adapter_path;
-
-    params.escape = false;
-
+    
+    int32_t num_epochs = 1;
+    
+    int32_t checkpoint_save_steps = 100;
+    std::string checkpoint_save_dir = "./checkpoints";
+    std::string resume_from_checkpoint;
+    bool auto_resume = false;
+};
+
+static bool parse_finetune_args(int& argc, char** argv, finetune_params& ft_params) {
     auto remove_arg_pair = [&](int i) {
         for (int j = i; j < argc - 2; j++) {
             argv[j] = argv[j + 2];
@@ -87,39 +286,96 @@ int main(int argc, char ** argv) {
         argc -= 2;
     };
 
-    for (int i = 1; i < argc - 1; i++) {
-        if (strcmp(argv[i], "--lora-rank") == 0) {
-            lora_rank = std::atoi(argv[i + 1]);
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "--lora-rank") == 0 && i + 1 < argc) {
+            ft_params.lora_rank = std::atoi(argv[i + 1]);
+            remove_arg_pair(i);
+            i--;
+        } else if (strcmp(argv[i], "--lora-alpha") == 0 && i + 1 < argc) {
+            ft_params.lora_alpha = std::atof(argv[i + 1]);
             remove_arg_pair(i);
             i--;
-        } else if (strcmp(argv[i], "--lora-alpha") == 0) {
-            lora_alpha = std::atof(argv[i + 1]);
+        } else if (strcmp(argv[i], "--lora-modules") == 0 && i + 1 < argc) {
+            ft_params.lora_modules_str = argv[i + 1];
             remove_arg_pair(i);
             i--;
-        } else if (strcmp(argv[i], "--lora-modules") == 0) {
-            lora_modules_str = argv[i + 1];
+        } else if (strcmp(argv[i], "--output-adapter") == 0 && i + 1 < argc) {
+            ft_params.output_adapter_path = argv[i + 1];
             remove_arg_pair(i);
             i--;
-        } else if (strcmp(argv[i], "--output-adapter") == 0) {
-            output_adapter_path = argv[i + 1];
+        } else if (strcmp(argv[i], "--num-epochs") == 0 && i + 1 < argc) {
+            ft_params.num_epochs = std::atoi(argv[i + 1]);
             remove_arg_pair(i);
             i--;
+        } else if (strcmp(argv[i], "--checkpoint-save-steps") == 0 && i + 1 < argc) {
+            ft_params.checkpoint_save_steps = std::atoi(argv[i + 1]);
+            remove_arg_pair(i);
+            i--;
+        } else if (strcmp(argv[i], "--checkpoint-save-dir") == 0 && i + 1 < argc) {
+            ft_params.checkpoint_save_dir = argv[i + 1];
+            remove_arg_pair(i);
+            i--;
+        } else if (strcmp(argv[i], "--resume-from") == 0 && i + 1 < argc) {
+            ft_params.resume_from_checkpoint = argv[i + 1];
+            remove_arg_pair(i);
+            i--;
+        } else if (strcmp(argv[i], "--auto-resume") == 0) {
+            ft_params.auto_resume = true;
+            for (int j = i; j < argc - 1; j++) {
+                argv[j] = argv[j + 1];
+            }
+            argc--;
+            i--;
         }
     }
 
-    LOG_INF("Using LoRA parameters: rank=%d, alpha=%.1f\n", lora_rank, lora_alpha);
-
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
             print_lora_usage();
         }
     }
+    
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+    finetune_params ft_params;
+
+    params.escape = false;
+    parse_finetune_args(argc, argv, ft_params);
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
-        print_lora_usage();
         return 1;
     }
 
+    LOG_INF("Using LoRA parameters: rank=%d, alpha=%.1f\n", ft_params.lora_rank, ft_params.lora_alpha);
+    LOG_INF("Training for %d epochs\n", ft_params.num_epochs);
+    
+    // Handle checkpoint auto-resume before model initialization
+    if (ft_params.auto_resume && ft_params.resume_from_checkpoint.empty()) {
+        std::string latest_checkpoint = find_latest_checkpoint(ft_params.checkpoint_save_dir);
+        if (!latest_checkpoint.empty()) {
+            ft_params.resume_from_checkpoint = latest_checkpoint;
+            LOG_INF("Auto-resume: found checkpoint %s\n", ft_params.resume_from_checkpoint.c_str());
+        }
+    }
+    
+    // Load checkpoint LoRA adapter from directory structure (model.gguf)
+    if (!ft_params.resume_from_checkpoint.empty()) {
+        std::filesystem::path checkpoint_dir(ft_params.resume_from_checkpoint);
+        std::filesystem::path model_path = checkpoint_dir / "model.gguf";
+        
+        LOG_INF("Loading checkpoint LoRA adapter: %s\n", model_path.c_str());
+        common_adapter_lora_info lora_adapter;
+        lora_adapter.path = model_path.string();
+        lora_adapter.scale = 1.0f;
+        lora_adapter.ptr = nullptr;
+        params.lora_adapters.clear(); // Remove any existing adapters
+        params.lora_adapters.push_back(lora_adapter);
+        LOG_INF("Checkpoint LoRA adapter added to params\n");
+    }
+
     if (params.use_mmap) {
         LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
         params.use_mmap = false;
@@ -151,15 +407,15 @@ int main(int argc, char ** argv) {
         LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
-    uint32_t target_modules = parse_lora_modules(lora_modules_str);
+    uint32_t target_modules = parse_lora_modules(ft_params.lora_modules_str);
     if (target_modules == 0) {
         return 1;
     }
 
     struct llama_lora_training_params lora_params = {
         /*target_modules =*/ target_modules,
-        /*rank           =*/ lora_rank,
-        /*alpha          =*/ lora_alpha,
+        /*rank           =*/ ft_params.lora_rank,
+        /*alpha          =*/ ft_params.lora_alpha,
         /*dropout        =*/ 0.0f,
         /*init_std       =*/ 0.02f,
     };
@@ -201,38 +457,132 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
     ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
 
+    int start_epoch = 0;
+    int64_t start_step = 0;
+    checkpoint_metadata checkpoint_meta = {};
+    bool checkpoint_loaded = false;
+    
+    if (!ft_params.resume_from_checkpoint.empty()) {
+        if (validate_checkpoint_metadata(ft_params.resume_from_checkpoint, checkpoint_meta)) {
+            start_epoch = checkpoint_meta.epoch;
+            checkpoint_loaded = true;
+            
+            if (checkpoint_meta.lora_rank != ft_params.lora_rank) {
+                LOG_ERR("Checkpoint LoRA rank (%d) doesn't match current rank (%d). Use --resume-from to manually specify a compatible checkpoint.\n", 
+                        checkpoint_meta.lora_rank, ft_params.lora_rank);
+                return 1;
+            }
+            if (checkpoint_meta.lora_alpha != ft_params.lora_alpha) {
+                LOG_ERR("Checkpoint LoRA alpha (%.3f) doesn't match current alpha (%.3f)\n", 
+                        checkpoint_meta.lora_alpha, ft_params.lora_alpha);
+                return 1;
+            }
+            if (checkpoint_meta.target_modules != target_modules) {
+                LOG_ERR("Checkpoint target_modules doesn't match current target_modules\n");
+                return 1;
+            }
+            
+        } else {
+            LOG_ERR("Failed to load checkpoint, starting from scratch\n");
+        }
+    }
+    
     struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
     optimizer_params.adamw.alpha = 1e-5f; // learning rate
 
+    std::string optimizer_checkpoint_path;
+    if (checkpoint_loaded && !ft_params.resume_from_checkpoint.empty()) {
+        std::filesystem::path checkpoint_dir(ft_params.resume_from_checkpoint);
+        optimizer_checkpoint_path = (checkpoint_dir / "optimizer.gguf").string();
+    }
+
     struct llama_opt_params lopt_params {
-        /*n_ctx_train     =*/ 0,
-        /*param_filter    =*/ llama_opt_param_filter_lora,
-        /*param_filter_ud =*/ nullptr,
-        /*get_opt_pars    =*/ ggml_opt_get_constant_optimizer_params,
-        /*get_opt_pars_ud =*/ &optimizer_params,
-        /*optimizer_type  =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
+        /*n_ctx_train          =*/  0,
+        /*param_filter         =*/  llama_opt_param_filter_lora,
+        /*param_filter_ud      =*/  nullptr,
+        /*get_opt_pars         =*/  ggml_opt_get_constant_optimizer_params,
+        /*get_opt_pars_ud      =*/  &optimizer_params,
+        /*optimizer_type       =*/  GGML_OPT_OPTIMIZER_TYPE_ADAMW,
+        /*checkpoint_path      =*/  checkpoint_loaded ? optimizer_checkpoint_path.c_str() : nullptr,
+        /*load_optimizer_state =*/  checkpoint_loaded,
     };
+    
     llama_opt_init(ctx.get(), model.get(), lopt_params);
-
+    
+    if (checkpoint_loaded) {
+        start_step = llama_opt_get_iter(ctx.get());
+    }
+    
+    if (!trained_adapter) {
+        LOG_ERR("No trained adapter available for checkpointing\n");
+        return 1;
+    }
+    
     const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
+    const int64_t training_batches_per_epoch = idata_split;
+
+    if (start_step > 0) {
+        int64_t completed_epochs = start_step / training_batches_per_epoch;
+        start_epoch = (int)completed_epochs;
+    }
+
+    checkpoint_callback_data cb_data = {
+        /*ctx                   =*/ ctx.get(),
+        /*adapter               =*/ trained_adapter,
+        /*checkpoint_save_steps =*/ ft_params.checkpoint_save_steps,
+        /*checkpoint_save_dir   =*/ ft_params.checkpoint_save_dir,
+        /*global_step           =*/ start_step,
+        /*initial_step          =*/ start_step,
+        /*current_epoch         =*/ start_epoch,
+        /*lora_rank             =*/ ft_params.lora_rank,
+        /*lora_alpha            =*/ ft_params.lora_alpha,
+        /*target_modules        =*/ target_modules,
+        /*learning_rate         =*/ optimizer_params.adamw.alpha,
+        /*model_path            =*/ params.model.path,
+        /*dataset_path          =*/ params.prompt_file,
+    };
+    g_checkpoint_data = &cb_data;
 
     ggml_opt_result_t result_train = ggml_opt_result_init();
     ggml_opt_result_t result_eval  = ggml_opt_result_init();
 
-    for (int epoch = 0; epoch < 2; ++epoch) {
+    for (int epoch = start_epoch; epoch < ft_params.num_epochs; ++epoch) {
+        LOG_INF("Starting epoch %d (step %ld)\n", epoch, cb_data.global_step);
+        cb_data.current_epoch = epoch;
+        
+        int64_t resume_batch = 0;
+        if (start_step > 0 && epoch == start_epoch) {
+            resume_batch = start_step % training_batches_per_epoch;
+        }
+        
+        ggml_opt_epoch_callback train_callback = (ft_params.checkpoint_save_steps <= 0) ? 
+            ggml_opt_epoch_callback_progress_bar : checkpoint_progress_callback;
+        ggml_opt_epoch_callback eval_callback = (ft_params.checkpoint_save_steps <= 0) ? 
+            ggml_opt_epoch_callback_progress_bar : checkpoint_progress_callback;
+
+        if (resume_batch > 0) {
+            LOG_INF("Resuming training from epoch %d, step %ld \n", epoch, resume_batch);
+        } else if (ft_params.checkpoint_save_steps > 0) {
+            LOG_INF("Checkpointing enabled, saving every %d steps\n", ft_params.checkpoint_save_steps);
+        } else {
+            LOG_INF("Checkpointing disabled, using standard progress callback\n");
+        }
+
         llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
-            ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
+            train_callback, eval_callback, resume_batch);
         fprintf(stderr, "\n");
 
         ggml_opt_result_reset(result_train);
         ggml_opt_result_reset(result_eval);
     }
+
+    g_checkpoint_data = nullptr;
     ggml_opt_result_free(result_train);
     ggml_opt_result_free(result_eval);
 
     std::string adapter_filename;
-    if (!output_adapter_path.empty()) {
-        adapter_filename = output_adapter_path;
+    if (!ft_params.output_adapter_path.empty()) {
+        adapter_filename = ft_params.output_adapter_path;
     } else if (has_existing_lora) {
         adapter_filename = "finetuned-lora-adapter.gguf";
         LOG_INF("Finetuned existing lora adapter, saving as: %s\n", adapter_filename.c_str());
diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp
index 561e61f8a21..bf179864281 100644
--- a/examples/training/finetune.cpp
+++ b/examples/training/finetune.cpp
@@ -62,14 +62,15 @@ int main(int argc, char ** argv) {
             ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.decay_epochs,
             (unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split);
 
-    struct llama_opt_params lopt_params{
-        /*n_ctx_train     =*/0,
-        // /*param_filter    =*/llama_opt_param_filter_all,
-                              llama_opt_param_filter_lora,
-        /*param_filter_ud =*/nullptr,
-        /*get_opt_pars    =*/common_opt_lr_pars,
-        /*get_opt_pars_ud =*/&params.lr,
-        /*optimizer_type  =*/params.optimizer,
+    struct llama_opt_params lopt_params {
+        /*n_ctx_train     =*/ 0,
+        /*param_filter    =*/ llama_opt_param_filter_all,
+        /*param_filter_ud =*/ nullptr,
+        /*get_opt_pars    =*/ common_opt_lr_pars,
+        /*get_opt_pars_ud =*/ &params.lr,
+        /*optimizer_type  =*/ params.optimizer,
+        /*checkpoint_path =*/ nullptr,
+        /*load_optimizer_state =*/ false,
     };
     llama_opt_init(ctx.get(), model.get(), lopt_params);
 
@@ -80,7 +81,7 @@ int main(int argc, char ** argv) {
 
     for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
         llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
-                        ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
+            ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar, -1);
         fprintf(stderr, "\n");
 
         ggml_opt_result_reset(result_train);
diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h
index 4703a05afe1..05f4482e414 100644
--- a/ggml/include/ggml-opt.h
+++ b/ggml/include/ggml-opt.h
@@ -154,6 +154,19 @@ extern "C" {
 
     // get the gradient accumulator for a node from the forward graph
     GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
+    
+    // optimizer state accessors: iteration counter, parameter count, and per-parameter AdamW momentum (grad_m) / variance (grad_v) tensors
+    GGML_API int64_t ggml_opt_get_iter(ggml_opt_context_t opt_ctx);
+    GGML_API void    ggml_opt_set_iter(ggml_opt_context_t opt_ctx, int64_t iter);
+    GGML_API int32_t ggml_opt_get_nparams(ggml_opt_context_t opt_ctx);
+    GGML_API struct ggml_tensor * ggml_opt_get_grad_m(ggml_opt_context_t opt_ctx, int32_t index);
+    GGML_API struct ggml_tensor * ggml_opt_get_grad_v(ggml_opt_context_t opt_ctx, int32_t index);
+
+    // ====== Optimizer State Persistence ======
+
+    GGML_API bool ggml_opt_save_state(ggml_opt_context_t opt_ctx, const char* filename);
+    GGML_API bool ggml_opt_load_state(ggml_opt_context_t opt_ctx, const char* filename);
+    GGML_API bool ggml_opt_load_tensors(ggml_opt_context_t opt_ctx, const char* filename);
 
     GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
 
diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp
index e078ad14a39..4aad7cb154e 100644
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@@ -633,6 +633,35 @@ struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_t
     return ggml_graph_get_grad_acc(opt_ctx->gb_opt, node);
 }
 
+int64_t ggml_opt_get_iter(ggml_opt_context_t opt_ctx) {
+    return opt_ctx->iter; // current value of the context's iteration counter; opt_ctx is dereferenced unchecked
+}
+
+void ggml_opt_set_iter(ggml_opt_context_t opt_ctx, int64_t iter) {
+    opt_ctx->iter = iter; // overwrites the iteration counter (ggml_opt_load_state uses this); opt_ctx is dereferenced unchecked
+}
+
+int32_t ggml_opt_get_nparams(ggml_opt_context_t opt_ctx) {
+    if (!opt_ctx) { // a NULL context reports zero parameters
+        return 0;
+    }
+    return (int32_t)opt_ctx->grad_m.size(); // number of grad_m entries, i.e. the valid index range for ggml_opt_get_grad_m
+}
+
+struct ggml_tensor * ggml_opt_get_grad_m(ggml_opt_context_t opt_ctx, int32_t index) {
+    if (!opt_ctx || index < 0 || index >= (int32_t)opt_ctx->grad_m.size()) { // NULL context or out-of-range index
+        return nullptr;
+    }
+    return opt_ctx->grad_m[index]; // momentum (first-moment) tensor for parameter `index`; callers still check the entry for NULL
+}
+
+struct ggml_tensor * ggml_opt_get_grad_v(ggml_opt_context_t opt_ctx, int32_t index) {
+    if (!opt_ctx || index < 0 || index >= (int32_t)opt_ctx->grad_v.size()) { // NULL context or out-of-range index
+        return nullptr;
+    }
+    return opt_ctx->grad_v[index]; // variance (second-moment) tensor for parameter `index`; callers still check the entry for NULL
+}
+
 // ====== Optimization Result ======
 
 ggml_opt_result_t ggml_opt_result_init() {
@@ -1091,3 +1120,128 @@ GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type o) {
             return "undefined";
     };
 }
+
+// ====== Optimizer State Persistence ======
+
+bool ggml_opt_save_state(ggml_opt_context_t opt_ctx, const char* filename) {
+    // Writes the iteration counter, the parameter count and every non-null grad_m/grad_v tensor to a GGUF file.
+    if (opt_ctx == nullptr || filename == nullptr) {
+        return false;
+    }
+
+    struct gguf_context * out = gguf_init_empty();
+    if (out == nullptr) {
+        return false;
+    }
+
+    const int32_t n_params = ggml_opt_get_nparams(opt_ctx);
+    gguf_set_val_str(out, "general.type", "optimizer");
+    gguf_set_val_i64(out, "optimizer.iteration", ggml_opt_get_iter(opt_ctx));
+    gguf_set_val_i32(out, "optimizer.n_params", n_params);
+
+    for (int32_t idx = 0; idx < n_params; ++idx) {
+        struct ggml_tensor * const state_tensors[] = { // grad_m is registered before grad_v
+            ggml_opt_get_grad_m(opt_ctx, idx),
+            ggml_opt_get_grad_v(opt_ctx, idx),
+        };
+        for (struct ggml_tensor * t : state_tensors) {
+            if (t) {
+                gguf_add_tensor(out, t);
+            }
+        }
+    }
+
+    const bool ok = gguf_write_to_file(out, filename, false);
+    gguf_free(out);
+    return ok;
+}
+
+bool ggml_opt_load_state(ggml_opt_context_t opt_ctx, const char* filename) {
+    if (!opt_ctx || !filename) {
+        return false;
+    }
+
+    // Only the KV metadata is read here, so no ggml_context is requested:
+    // tensor data is neither loaded nor left behind without being freed.
+    struct gguf_init_params gguf_params = {
+        /* .no_alloc = */ true,
+        /* .ctx      = */ nullptr,
+    };
+    struct gguf_context * gguf_context = gguf_init_from_file(filename, gguf_params);
+    if (!gguf_context) {
+        return false;
+    }
+
+    // Restore the iteration counter when present as an i64; a file without it is still accepted.
+    const int key_idx = gguf_find_key(gguf_context, "optimizer.iteration");
+    if (key_idx >= 0 && gguf_get_kv_type(gguf_context, key_idx) == GGUF_TYPE_INT64) {
+        ggml_opt_set_iter(opt_ctx, gguf_get_val_i64(gguf_context, key_idx));
+    }
+
+    gguf_free(gguf_context);
+    return true;
+}
+
+bool ggml_opt_load_tensors(ggml_opt_context_t opt_ctx, const char* filename) {
+    // Copies saved grad_m/grad_v data (matched by tensor name) into the optimizer's own tensors.
+    // Returns true when at least one tensor was loaded.
+    if (!opt_ctx || !filename) {
+        return false;
+    }
+
+    // The file's tensor data lands in a temporary ggml_context that is freed before returning.
+    struct ggml_context * gguf_ctx = nullptr;
+    struct gguf_init_params gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &gguf_ctx,
+    };
+    struct gguf_context * gguf_context = gguf_init_from_file(filename, gguf_params);
+    if (!gguf_context) {
+        return false;
+    }
+    if (!gguf_ctx) {
+        gguf_free(gguf_context);
+        return false;
+    }
+
+    const int     tensor_count = gguf_get_n_tensors(gguf_context);
+    const int32_t n_params     = ggml_opt_get_nparams(opt_ctx);
+    int grad_m_loaded = 0, grad_v_loaded = 0;
+
+    for (int i = 0; i < tensor_count; ++i) {
+        const char* tensor_name = gguf_get_tensor_name(gguf_context, i);
+        if (!tensor_name) continue;
+        struct ggml_tensor* gguf_tensor = ggml_get_tensor(gguf_ctx, tensor_name);
+        if (!gguf_tensor) continue;
+
+        // Look for the optimizer tensor carrying the same name; unnamed tensors never match.
+        for (int32_t param_idx = 0; param_idx < n_params; ++param_idx) {
+            struct ggml_tensor* grad_m = ggml_opt_get_grad_m(opt_ctx, param_idx);
+            struct ggml_tensor* grad_v = ggml_opt_get_grad_v(opt_ctx, param_idx);
+
+            if (grad_m && strlen(grad_m->name) > 0 && strcmp(tensor_name, grad_m->name) == 0) {
+                // Copy only when type and byte size agree, so the read stays inside the saved tensor.
+                const bool same_layout = grad_m->type == gguf_tensor->type && ggml_nbytes(grad_m) == ggml_nbytes(gguf_tensor);
+                if (same_layout && grad_m->data) {
+                    ggml_backend_tensor_set(grad_m, gguf_tensor->data, 0, ggml_nbytes(grad_m));
+                    grad_m_loaded++;
+                }
+                break;
+            }
+
+            if (grad_v && strlen(grad_v->name) > 0 && strcmp(tensor_name, grad_v->name) == 0) {
+                // Copy only when type and byte size agree, so the read stays inside the saved tensor.
+                const bool same_layout = grad_v->type == gguf_tensor->type && ggml_nbytes(grad_v) == ggml_nbytes(gguf_tensor);
+                if (same_layout && grad_v->data) {
+                    ggml_backend_tensor_set(grad_v, gguf_tensor->data, 0, ggml_nbytes(grad_v));
+                    grad_v_loaded++;
+                }
+                break;
+            }
+        }
+    }
+
+    ggml_free(gguf_ctx); // release the temporary tensor data, which was previously leaked
+    gguf_free(gguf_context);
+    return (grad_m_loaded > 0 || grad_v_loaded > 0);
+}
diff --git a/include/llama.h b/include/llama.h
index 136d7690e45..ec8c54c164d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1383,6 +1383,10 @@ extern "C" {
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
 
         enum ggml_opt_optimizer_type optimizer_type;
+
+        // Optional checkpoint loading
+        const char * checkpoint_path;        // path to checkpoint file to load optimizer state from (nullptr = don't load)
+        bool load_optimizer_state;          // whether to load optimizer state from checkpoint_path
     };
 
     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
@@ -1394,7 +1398,8 @@ extern "C" {
             ggml_opt_result_t         result_eval,
             int64_t                   idata_split,
             ggml_opt_epoch_callback   callback_train,
-            ggml_opt_epoch_callback   callback_eval);
+            ggml_opt_epoch_callback   callback_eval,
+            int64_t                   resume_from_batch);
     
     // LoRA training parameters
     enum llama_lora_target_module {
@@ -1427,12 +1432,21 @@ extern "C" {
 
     // LoRA parameter filter (returns true for LoRA tensors only)
     LLAMA_API bool llama_opt_param_filter_lora(const struct ggml_tensor * tensor, void * userdata);
+    
+    LLAMA_API int64_t llama_opt_get_iter(struct llama_context * ctx);
 
     LLAMA_API bool llama_lora_save_adapter(
         const struct llama_adapter_lora * adapter,
         const char * filename,
         const struct llama_model * model
     );
+    
+    LLAMA_API bool llama_lora_save_checkpoint(
+        const struct llama_adapter_lora * adapter,
+        const char * filename,               // used as a directory by the definition: model.gguf and optimizer.gguf are written inside it
+        const struct llama_model * model,
+        struct llama_context * ctx           // context whose optimizer state is saved
+    );
 
 #ifdef __cplusplus
 }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index d7f84d1d36b..07f9cc1a364 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2102,6 +2102,16 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
             }
         }
     }
+
+    if (lopt_params.load_optimizer_state && lopt_params.checkpoint_path) {        
+        if (opt_load_state(lopt_params.checkpoint_path)) {
+            pending_optimizer_checkpoint_path = lopt_params.checkpoint_path;
+            should_load_optimizer_tensors = true;
+            optimizer_tensors_loaded = false;
+        } else {
+            LLAMA_LOG_ERROR("Failed to load optimizer state from: %s\n", lopt_params.checkpoint_path);
+        }
+    }
 }
 
 void llama_context::opt_epoch_iter(
@@ -2189,6 +2199,17 @@ void llama_context::opt_epoch_iter(
             }
             ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
             ggml_opt_alloc(opt_ctx, train);
+            
+            // Load optimizer tensors on first training iteration if pending
+            if (train && should_load_optimizer_tensors && !optimizer_tensors_loaded) {
+                if (ggml_opt_load_tensors(opt_ctx, pending_optimizer_checkpoint_path.c_str())) {
+                    LLAMA_LOG_INFO("Successfully loaded optimizer state tensor data\n");
+                    optimizer_tensors_loaded = true;
+                } else {
+                    LLAMA_LOG_ERROR("Failed to load optimizer tensor data\n");
+                }
+                should_load_optimizer_tensors = false;  // Only try once
+            }
 
             res->set_inputs(&ubatch);
             {
@@ -2219,7 +2240,8 @@ void llama_context::opt_epoch(
         ggml_opt_result_t         result_eval,
         int64_t                   idata_split,
         ggml_opt_epoch_callback   callback_train,
-        ggml_opt_epoch_callback   callback_eval) {
+        ggml_opt_epoch_callback   callback_eval,
+        int64_t                   resume_from_batch) {
     const uint32_t n_ctx    = this->n_ctx();
     const uint32_t n_batch  = std::min(cparams.n_batch,  n_ctx);
     const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch);
@@ -2234,7 +2256,7 @@ void llama_context::opt_epoch(
     std::vector<llama_token>        tokens(n_ctx);
     std::vector<llama_token> labels_sparse(n_ctx);
 
-    int64_t idata = 0;
+    int64_t idata = (resume_from_batch >= 0) ? resume_from_batch + 1 : 0;
 
     int64_t t_loop_start = ggml_time_us();
     int64_t ndata_in_loop = idata_split*ubatch_per_ctx;
@@ -2261,6 +2283,24 @@ void llama_context::opt_epoch(
     llama_batch_free(batch);
 }
 
+int64_t llama_context::opt_get_iter() {
+    return opt_ctx ? ggml_opt_get_iter(opt_ctx) : 0; // null opt_ctx reports 0; opt_save_state/opt_load_state guard it too
+}
+
+bool llama_context::opt_save_state(const char* filename) {
+    // A null opt_ctx means there is no optimizer state to write: report failure
+    // without calling into ggml. Otherwise forward ggml_opt_save_state's result.
+    const bool has_opt_ctx = opt_ctx != nullptr;
+    return has_opt_ctx && ggml_opt_save_state(opt_ctx, filename);
+}
+
+bool llama_context::opt_load_state(const char* filename) {
+    // A null opt_ctx cannot receive any state: report failure without
+    // calling into ggml. Otherwise forward ggml_opt_load_state's result.
+    const bool has_opt_ctx = opt_ctx != nullptr;
+    return has_opt_ctx && ggml_opt_load_state(opt_ctx, filename);
+}
+
 //
 // interface implementation
 //
@@ -2803,12 +2843,19 @@ void llama_opt_epoch(
         ggml_opt_result_t         result_eval,
         int64_t                   idata_split,
         ggml_opt_epoch_callback   callback_train,
-        ggml_opt_epoch_callback   callback_eval) {
+        ggml_opt_epoch_callback   callback_eval,
+        int64_t                   resume_from_batch) {
+    // Use the unified API that handles both normal and resume cases
     ctx->opt_epoch(
         dataset,
         result_train,
         result_eval,
         idata_split,
         callback_train,
-        callback_eval);
+        callback_eval,
+        resume_from_batch);
+}
+
+int64_t llama_opt_get_iter(struct llama_context * ctx) {
+    return ctx->opt_get_iter(); // C API wrapper: forwards to llama_context::opt_get_iter(); ctx is not null-checked
 }
diff --git a/src/llama-context.h b/src/llama-context.h
index f23aa8ee136..bbb8ab32272 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -157,7 +157,15 @@ struct llama_context {
             ggml_opt_result_t       result_eval,
             int64_t                 idata_split,
             ggml_opt_epoch_callback callback_train,
-            ggml_opt_epoch_callback callback_eval);
+            ggml_opt_epoch_callback callback_eval,
+            int64_t                 resume_from_batch = -1);
+
+    // Optimizer state access for checkpointing (delegated to ggml_opt API)
+    int64_t opt_get_iter();
+    
+    // Optimizer state persistence
+    bool opt_save_state(const char* filename);
+    bool opt_load_state(const char* filename);
 
     void opt_epoch_iter(
             ggml_opt_dataset_t               dataset,
@@ -262,6 +270,11 @@ struct llama_context {
 
     // training
     ggml_opt_context_t opt_ctx = nullptr;
+    
+    // optimizer state loading (deferred until after ggml_opt_build)
+    std::string pending_optimizer_checkpoint_path;
+    bool should_load_optimizer_tensors = false;
+    bool optimizer_tensors_loaded = false;
 
     ggml_threadpool_t threadpool       = nullptr;
     ggml_threadpool_t threadpool_batch = nullptr;
diff --git a/src/llama-lora-training.cpp b/src/llama-lora-training.cpp
index e7db81d591f..7ddc5307792 100644
--- a/src/llama-lora-training.cpp
+++ b/src/llama-lora-training.cpp
@@ -1,11 +1,8 @@
 #include "llama-lora-training.h"
 
 #include <cstring>
-#include <cmath>
 #include <random>
-#include <algorithm>
-#include <map>
-#include <stdexcept>
+#include <filesystem>
 
 
 ggml_context * llama_lora_create_context(size_t mem_size) {
@@ -357,3 +354,44 @@ bool llama_lora_save_adapter(
     gguf_free(gguf_ctx);
     return success;
 }
+
+bool llama_lora_save_checkpoint(
+    const struct llama_adapter_lora * adapter,  // adapter whose weights are written to model.gguf
+    const char * checkpoint_path,               // directory that receives model.gguf and optimizer.gguf (created if missing)
+    const struct llama_model * model,           // passed through to llama_lora_save_adapter
+    struct llama_context * ctx                  // context whose optimizer state is written to optimizer.gguf
+) {
+    if (!adapter || !checkpoint_path || !model || !ctx) {  // every argument is required
+        LLAMA_LOG_ERROR("llama_lora_save_checkpoint: invalid parameters\n");
+        return false;
+    }
+
+    std::filesystem::path checkpoint_dir = std::filesystem::path(checkpoint_path);  // the path names a directory, not a file
+    if (!checkpoint_dir.empty()) {  // NOTE(review): exists()/create_directories() can throw std::filesystem::filesystem_error - confirm callers tolerate that
+        if (!std::filesystem::exists(checkpoint_dir)) {
+            if (!std::filesystem::create_directories(checkpoint_dir)) {
+                LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to create checkpoint directory: %s\n", 
+                               checkpoint_dir.c_str());
+                return false;
+            }
+        }
+    }
+
+    std::filesystem::path model_path = checkpoint_dir / "model.gguf";
+    bool lora_saved = llama_lora_save_adapter(adapter, model_path.c_str(), model);
+    if (!lora_saved) {
+        LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to save LoRA adapter weights to %s\n", 
+                        model_path.c_str());
+        return false;
+    }
+
+    std::filesystem::path optimizer_path = checkpoint_dir / "optimizer.gguf";    
+    bool optimizer_saved = ctx->opt_save_state(optimizer_path.c_str());
+    if (!optimizer_saved) {
+        LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to save optimizer state to %s\n", 
+                        optimizer_path.c_str());
+        return false;
+    }
+
+    return true;  // adapter weights and optimizer state were both written
+}
diff --git a/src/llama-lora-training.h b/src/llama-lora-training.h
index ed777be7b36..3dd09ee58de 100644
--- a/src/llama-lora-training.h
+++ b/src/llama-lora-training.h
@@ -4,6 +4,7 @@
 #include "llama-model.h"
 #include "llama-adapter.h"
 #include "llama-impl.h"
+#include "llama-context.h"
 #include "ggml.h"
 
 

From d17ba354bd907b8bc672d650040ba7d64d01715f Mon Sep 17 00:00:00 2001
From: Marcus Edel <marcus.edel@collabora.com>
Date: Thu, 9 Oct 2025 11:21:13 -0400
Subject: [PATCH 2/4] Use the correct function call for the IM2COL_3D op.

Signed-off-by: Marcus Edel <marcus.edel@collabora.com>
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 540254f31da..dbaa441b956 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -13176,16 +13176,16 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
     } else if (tensor->op == GGML_OP_IM2COL_3D) {
         const int32_t s0 = tensor->op_params[0];
         const int32_t s1 = tensor->op_params[1];
-        const int32_t s1 = tensor->op_params[2];
+        const int32_t s2 = tensor->op_params[2];
         const int32_t p0 = tensor->op_params[3];
         const int32_t p1 = tensor->op_params[4];
-        const int32_t p1 = tensor->op_params[5];
+        const int32_t p2 = tensor->op_params[5];
         const int32_t d0 = tensor->op_params[6];
         const int32_t d1 = tensor->op_params[7];
-        const int32_t d1 = tensor->op_params[8];
+        const int32_t d2 = tensor->op_params[8];
         const int32_t IC = tensor->op_params[9];
 
-        tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
+        tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
     } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
         const int32_t dim = tensor->op_params[0];
         const int32_t max_period = tensor->op_params[1];

From 947d3f9cd0f90341ac7b7832b7200024f6df719f Mon Sep 17 00:00:00 2001
From: Marcus Edel <marcus.edel@collabora.com>
Date: Thu, 9 Oct 2025 11:22:15 -0400
Subject: [PATCH 3/4] Change format types to resolve warnings.

Signed-off-by: Marcus Edel <marcus.edel@collabora.com>
---
 examples/training/finetune-lora.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/training/finetune-lora.cpp b/examples/training/finetune-lora.cpp
index 0f166757246..b307dd34126 100644
--- a/examples/training/finetune-lora.cpp
+++ b/examples/training/finetune-lora.cpp
@@ -259,7 +259,7 @@ static void checkpoint_progress_callback(
         std::string checkpoint_path = get_checkpoint_filename(cb_data->checkpoint_save_dir, cb_data->global_step);
         
         if (!save_checkpoint(cb_data->ctx, cb_data->adapter, meta, checkpoint_path)) {
-            LOG_ERR("Failed to save checkpoint at step %ld\n", cb_data->global_step);
+            LOG_ERR("Failed to save checkpoint at step %lld\n", (long long)cb_data->global_step);
         }
     }
 }
@@ -547,7 +547,7 @@ int main(int argc, char ** argv) {
     ggml_opt_result_t result_eval  = ggml_opt_result_init();
 
     for (int epoch = start_epoch; epoch < ft_params.num_epochs; ++epoch) {
-        LOG_INF("Starting epoch %d (step %ld)\n", epoch, cb_data.global_step);
+    LOG_INF("Starting epoch %d (step %lld)\n", epoch, (long long)cb_data.global_step);
         cb_data.current_epoch = epoch;
         
         int64_t resume_batch = 0;
@@ -561,7 +561,7 @@ int main(int argc, char ** argv) {
             ggml_opt_epoch_callback_progress_bar : checkpoint_progress_callback;
 
         if (resume_batch > 0) {
-            LOG_INF("Resuming training from epoch %d, step %ld \n", epoch, resume_batch);
+            LOG_INF("Resuming training from epoch %d, step %lld \n", epoch, (long long)resume_batch);
         } else if (ft_params.checkpoint_save_steps > 0) {
             LOG_INF("Checkpointing enabled, saving every %d steps\n", ft_params.checkpoint_save_steps);
         } else {

From 4610e02ce6563e626a2c4aae065d54c49b9049ec Mon Sep 17 00:00:00 2001
From: Marcus Edel <marcus.edel@collabora.com>
Date: Thu, 9 Oct 2025 12:23:39 -0400
Subject: [PATCH 4/4] Fix cross-platform compilation errors: Windows filesystem
 path conversion using .string().c_str().

Signed-off-by: Marcus Edel <marcus.edel@collabora.com>
---
 src/llama-lora-training.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/llama-lora-training.cpp b/src/llama-lora-training.cpp
index 7ddc5307792..79edc62c70b 100644
--- a/src/llama-lora-training.cpp
+++ b/src/llama-lora-training.cpp
@@ -371,25 +371,25 @@ bool llama_lora_save_checkpoint(
         if (!std::filesystem::exists(checkpoint_dir)) {
             if (!std::filesystem::create_directories(checkpoint_dir)) {
                 LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to create checkpoint directory: %s\n", 
-                               checkpoint_dir.c_str());
+                               checkpoint_dir.string().c_str());
                 return false;
             }
         }
     }
 
     std::filesystem::path model_path = checkpoint_dir / "model.gguf";
-    bool lora_saved = llama_lora_save_adapter(adapter, model_path.c_str(), model);
+    bool lora_saved = llama_lora_save_adapter(adapter, model_path.string().c_str(), model);
     if (!lora_saved) {
         LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to save LoRA adapter weights to %s\n", 
-                        model_path.c_str());
+                        model_path.string().c_str());
         return false;
     }
 
     std::filesystem::path optimizer_path = checkpoint_dir / "optimizer.gguf";    
-    bool optimizer_saved = ctx->opt_save_state(optimizer_path.c_str());
+    bool optimizer_saved = ctx->opt_save_state(optimizer_path.string().c_str());
     if (!optimizer_saved) {
         LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to save optimizer state to %s\n", 
-                        optimizer_path.c_str());
+                        optimizer_path.string().c_str());
         return false;
     }