From 00d4297bee3c268f3ee7caabdc54b9979a217e2d Mon Sep 17 00:00:00 2001 From: vineet Date: Wed, 8 Oct 2025 17:25:05 -0400 Subject: [PATCH 1/4] finetune-lora: Add checkpoint saving & resuming from saved checkpoint This commit adds checkpointing for fine-tuning: - Add checkpoint saving every N steps with --checkpoint-save-steps - Save complete training state: model weights, optimizer state, metadata - Implement two-phase optimizer state loading to avoid memory issues - Add --resume-from and --auto-resume functionality - Store optimizer momentum/variance tensors in GGUF format - Add checkpoint validation for rank, alpha, and target modules - Update README.md with checkpointing documentation Optimizer state is loaded in two phases: the iteration count is loaded during initialization, while the tensor data (grad_m, grad_v) is loaded after ggml_opt_alloc creates the proper tensor structures. --- examples/training/README.md | 31 ++ examples/training/finetune-lora.cpp | 420 +++++++++++++++++++++++++--- examples/training/finetune.cpp | 19 +- ggml/include/ggml-opt.h | 13 + ggml/src/ggml-opt.cpp | 154 ++++++++++ include/llama.h | 16 +- src/llama-context.cpp | 55 +++- src/llama-context.h | 15 +- src/llama-lora-training.cpp | 46 ++- src/llama-lora-training.h | 1 + 10 files changed, 716 insertions(+), 54 deletions(-) diff --git a/examples/training/README.md b/examples/training/README.md index ed255a0e1af..7d1cda2a9ca 100644 --- a/examples/training/README.md +++ b/examples/training/README.md @@ -36,6 +36,14 @@ the base model frozen, making it memory-efficient. 
# Fine-tune existing LoRA adapter ./build/bin/llama-finetune-lora -m base_model.gguf -f dataset.txt --lora existing_adapter.gguf \ --output-adapter improved_adapter.gguf -ngl 999 -c 512 -b 512 -ub 512 + +# Training with checkpointing +./build/bin/llama-finetune-lora -m model.gguf -f dataset.txt -ngl 999 -c 512 -b 512 -ub 512 \ + --checkpoint-save-steps 50 --checkpoint-save-dir "./lora_checkpoints" + +# Resume training from checkpoint +./build/bin/llama-finetune-lora -m model.gguf -f dataset.txt -ngl 999 -c 512 -b 512 -ub 512 \ + --resume-from "./lora_checkpoints/checkpoint_step_00000150/" ``` @@ -53,6 +61,12 @@ the base model frozen, making it memory-efficient. - Default: `attn_q,attn_k,attn_v,attn_o` (attention modules) - `--output-adapter PATH` - Output adapter filename (default: auto-generated) +#### Checkpointing +- `--checkpoint-save-steps N` - Save checkpoint every N training steps (default: 100) +- `--checkpoint-save-dir PATH` - Directory for checkpoints (default: `./checkpoints`) +- `--resume-from PATH` - Resume training from specific checkpoint directory +- `--auto-resume` - Automatically resume from latest checkpoint in save directory + #### Standard Parameters - `-m MODEL` - Base model file (.gguf) - `-f FILE` - Training dataset @@ -68,11 +82,28 @@ After training, you'll get a small adapter file. 
Use it with the original base m ./build/bin/llama-cli -m base_model.gguf --lora trained_adapter.gguf -ngl 999 ``` +### Checkpointing + +LoRA fine-tuning supports automatic checkpointing to save and resume training progress: + +#### Features +- **Automatic saving**: Model and optimizer state saved every N training steps +- **Complete state**: Includes LoRA weights, optimizer momentum, and training metadata +- **Resume capability**: Continue training from exact step with full optimizer state +- **Auto-resume**: Automatically find and resume from latest checkpoint + +#### Checkpoint Structure +Each checkpoint directory contains: +- `model.gguf` - LoRA adapter weights +- `optimizer.gguf` - Optimizer state (momentum, variance, iteration) +- `metadata.json` - Training metadata (epoch, LoRA rank, alpha, target modules) stored as plain `key=value` lines, not JSON + ### Troubleshooting - **Out of memory**: Reduce context length (`-c 256`), lower rank, or use fewer target modules - **Poor quality**: Increase rank, add more target modules, or train longer - **Large adapter**: Reduce rank or limit target modules +- **Checkpoint issues**: Ensure checkpoint directory contains all required files (model.gguf, optimizer.gguf, metadata.json) ### Help diff --git a/examples/training/finetune-lora.cpp b/examples/training/finetune-lora.cpp index c12e119613b..0f166757246 100644 --- a/examples/training/finetune-lora.cpp +++ b/examples/training/finetune-lora.cpp @@ -3,18 +3,19 @@ #include "log.h" #include "llama.h" -#include -#include #include -#include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif +struct checkpoint_callback_data; +static checkpoint_callback_data* g_checkpoint_data = nullptr; + static uint32_t parse_lora_modules(const std::string& modules_str) { if (modules_str.empty()) { return LLAMA_LORA_TARGET_ATTN_Q | LLAMA_LORA_TARGET_ATTN_K | LLAMA_LORA_TARGET_ATTN_V | LLAMA_LORA_TARGET_ATTN_O; @@ -55,13 +56,20 @@ static uint32_t parse_lora_modules(const std::string& 
modules_str) { } static void print_lora_usage() { - printf("\nLoRA Fine-tuning Parameters:\n"); + printf("\n----- LoRA Fine-tuning Parameters -----\n"); printf(" --lora-rank N LoRA rank (default: 8, range: 1-512)\n"); printf(" --lora-alpha N LoRA alpha scaling factor (default: 16.0, range: 0.1-1000.0)\n"); printf(" --lora-modules MODULES Target modules as comma-separated list (default: attn_q,attn_k,attn_v,attn_o)\n"); printf(" Available modules: attn_q, attn_k, attn_v, attn_o, ffn_gate, ffn_up, ffn_down, output, all\n"); printf(" Examples: \"attn_q,attn_v\" or \"all\" or \"attn_q,attn_k,attn_v,attn_o,ffn_gate,ffn_up,ffn_down\"\n"); printf(" --output-adapter PATH Output path for trained adapter (default: auto-generated)\n"); + printf("\nTraining Options:\n"); + printf(" --num-epochs N Number of training epochs (default: 1)\n"); + printf("\nCheckpointing Options:\n"); + printf(" --checkpoint-save-steps N Save checkpoint every N training steps (default: 100)\n"); + printf(" --checkpoint-save-dir PATH Directory for checkpoints (default: ./checkpoints)\n"); + printf(" --resume-from PATH Resume training from specific checkpoint file\n"); + printf(" --auto-resume Automatically resume from latest checkpoint in save dir\n"); printf("\nExamples:\n"); printf(" # Train with rank=16, alpha=32, all attention modules\n"); printf(" %s -m model.gguf -f dataset.txt --lora-rank 16 --lora-alpha 32 --lora-modules attn_q,attn_k,attn_v,attn_o\n", "finetune-lora"); @@ -70,16 +78,207 @@ static void print_lora_usage() { printf("\n"); } -int main(int argc, char ** argv) { - common_params params; +struct checkpoint_metadata { + int32_t epoch; + int32_t lora_rank; + float lora_alpha; + uint32_t target_modules; +}; + +static std::string get_checkpoint_filename(const std::string& checkpoint_dir, int64_t step) { + std::ostringstream oss; + oss << checkpoint_dir << "/checkpoint_step_" << std::setfill('0') << std::setw(8) << step; + return oss.str(); +} + +static std::string 
find_latest_checkpoint(const std::string& checkpoint_dir) { + if (!std::filesystem::exists(checkpoint_dir)) { + return ""; + } + + std::string latest_checkpoint; + int64_t latest_step = -1; + + for (const auto& entry : std::filesystem::directory_iterator(checkpoint_dir)) { + if (entry.is_directory()) { + std::string dirname = entry.path().filename().string(); + if (dirname.find("checkpoint_step_") == 0 && dirname.size() >= 16) { + std::string step_str = dirname.substr(16, 8); + try { + int64_t step = std::stoll(step_str); + if (step > latest_step) { + latest_step = step; + latest_checkpoint = entry.path().string(); + } + } catch (const std::exception&) { + continue; + } + } + } + } + + return latest_checkpoint; +} + +static bool save_checkpoint(llama_context* ctx, llama_adapter_lora* adapter, const checkpoint_metadata& metadata, const std::string& checkpoint_dir) { + if (!std::filesystem::exists(checkpoint_dir)) { + if (!std::filesystem::create_directories(checkpoint_dir)) { + LOG_ERR("Failed to create checkpoint directory: %s\n", checkpoint_dir.c_str()); + return false; + } + } + + if (!llama_lora_save_checkpoint(adapter, checkpoint_dir.c_str(), llama_get_model(ctx), ctx)) { + LOG_ERR("Failed to save LoRA checkpoint\n"); + return false; + } + + std::string meta_path = checkpoint_dir + "/metadata.json"; + std::ofstream meta_file(meta_path); + if (meta_file.is_open()) { + meta_file << "epoch=" << metadata.epoch << "\n"; + meta_file << "lora_rank=" << metadata.lora_rank << "\n"; + meta_file << "lora_alpha=" << metadata.lora_alpha << "\n"; + meta_file << "target_modules=" << metadata.target_modules << "\n"; + meta_file.close(); + } else { + LOG_ERR("Failed to save checkpoint metadata\n"); + return false; + } + + LOG_INF("Checkpoint saved successfully to %s\n", checkpoint_dir.c_str()); + return true; +} + +static bool validate_checkpoint_metadata(const std::string& checkpoint_path, checkpoint_metadata& metadata) { + std::string checkpoint_dir = checkpoint_path; + + if 
(!std::filesystem::exists(checkpoint_dir)) { + LOG_ERR("Checkpoint directory does not exist: %s\n", checkpoint_dir.c_str()); + return false; + } + + LOG_INF("Loading checkpoint from: %s\n", checkpoint_dir.c_str()); + + std::string meta_path = checkpoint_dir + "/metadata.json"; + if (std::filesystem::exists(meta_path)) { + std::ifstream meta_file(meta_path); + if (meta_file.is_open()) { + std::string line; + while (std::getline(meta_file, line)) { + size_t eq_pos = line.find('='); + if (eq_pos != std::string::npos) { + std::string key = line.substr(0, eq_pos); + std::string value = line.substr(eq_pos + 1); + + if (key == "epoch") { + metadata.epoch = std::stoi(value); + } else if (key == "lora_rank") { + metadata.lora_rank = std::stoi(value); + } else if (key == "lora_alpha") { + metadata.lora_alpha = std::stof(value); + } else if (key == "target_modules") { + metadata.target_modules = std::stoul(value); + } + } + } + meta_file.close(); + } else { + LOG_ERR("Failed to open checkpoint metadata file\n"); + return false; + } + } else { + LOG_ERR("Checkpoint metadata file not found: %s\n", meta_path.c_str()); + return false; + } + + LOG_INF("Checkpoint loaded successfully\n"); + return true; +} + + +struct checkpoint_callback_data { + llama_context* ctx; + llama_adapter_lora* adapter; + int32_t checkpoint_save_steps; + std::string checkpoint_save_dir; + int64_t global_step; + int64_t initial_step; + int32_t current_epoch; + int32_t lora_rank; + float lora_alpha; + uint32_t target_modules; + float learning_rate; + std::string model_path; + std::string dataset_path; +}; + +static void checkpoint_progress_callback( + bool train, + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + int64_t ibatch, + int64_t ibatch_max, + int64_t t_start_us) { + ggml_opt_epoch_callback_progress_bar(train, opt_ctx, dataset, result, ibatch, ibatch_max, t_start_us); + + if (!train) return; + + checkpoint_callback_data* cb_data = g_checkpoint_data; + + if 
(!cb_data) { + LOG_ERR("Checkpoint callback data is null!\n"); + return; + } + + if (cb_data->checkpoint_save_steps <= 0) { + return; + } + + cb_data->global_step++; + + if (cb_data->global_step % cb_data->checkpoint_save_steps == 0) { + if (!cb_data->ctx) { + LOG_ERR("Context is null in checkpoint callback!\n"); + return; + } + + if (!cb_data->adapter) { + LOG_ERR("LoRA adapter is null in checkpoint callback!\n"); + return; + } + + checkpoint_metadata meta = { + /*epoch =*/ cb_data->current_epoch, + /*lora_rank =*/ cb_data->lora_rank, + /*lora_alpha =*/ cb_data->lora_alpha, + /*target_modules =*/ cb_data->target_modules, + }; + + std::string checkpoint_path = get_checkpoint_filename(cb_data->checkpoint_save_dir, cb_data->global_step); + + if (!save_checkpoint(cb_data->ctx, cb_data->adapter, meta, checkpoint_path)) { + LOG_ERR("Failed to save checkpoint at step %ld\n", cb_data->global_step); + } + } +} +struct finetune_params { int32_t lora_rank = 8; float lora_alpha = 16.0f; std::string lora_modules_str; std::string output_adapter_path; - - params.escape = false; - + + int32_t num_epochs = 1; + + int32_t checkpoint_save_steps = 100; + std::string checkpoint_save_dir = "./checkpoints"; + std::string resume_from_checkpoint; + bool auto_resume = false; +}; + +static bool parse_finetune_args(int& argc, char** argv, finetune_params& ft_params) { auto remove_arg_pair = [&](int i) { for (int j = i; j < argc - 2; j++) { argv[j] = argv[j + 2]; @@ -87,39 +286,96 @@ int main(int argc, char ** argv) { argc -= 2; }; - for (int i = 1; i < argc - 1; i++) { - if (strcmp(argv[i], "--lora-rank") == 0) { - lora_rank = std::atoi(argv[i + 1]); + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--lora-rank") == 0 && i + 1 < argc) { + ft_params.lora_rank = std::atoi(argv[i + 1]); + remove_arg_pair(i); + i--; + } else if (strcmp(argv[i], "--lora-alpha") == 0 && i + 1 < argc) { + ft_params.lora_alpha = std::atof(argv[i + 1]); remove_arg_pair(i); i--; - } else if (strcmp(argv[i], 
"--lora-alpha") == 0) { - lora_alpha = std::atof(argv[i + 1]); + } else if (strcmp(argv[i], "--lora-modules") == 0 && i + 1 < argc) { + ft_params.lora_modules_str = argv[i + 1]; remove_arg_pair(i); i--; - } else if (strcmp(argv[i], "--lora-modules") == 0) { - lora_modules_str = argv[i + 1]; + } else if (strcmp(argv[i], "--output-adapter") == 0 && i + 1 < argc) { + ft_params.output_adapter_path = argv[i + 1]; remove_arg_pair(i); i--; - } else if (strcmp(argv[i], "--output-adapter") == 0) { - output_adapter_path = argv[i + 1]; + } else if (strcmp(argv[i], "--num-epochs") == 0 && i + 1 < argc) { + ft_params.num_epochs = std::atoi(argv[i + 1]); remove_arg_pair(i); i--; + } else if (strcmp(argv[i], "--checkpoint-save-steps") == 0 && i + 1 < argc) { + ft_params.checkpoint_save_steps = std::atoi(argv[i + 1]); + remove_arg_pair(i); + i--; + } else if (strcmp(argv[i], "--checkpoint-save-dir") == 0 && i + 1 < argc) { + ft_params.checkpoint_save_dir = argv[i + 1]; + remove_arg_pair(i); + i--; + } else if (strcmp(argv[i], "--resume-from") == 0 && i + 1 < argc) { + ft_params.resume_from_checkpoint = argv[i + 1]; + remove_arg_pair(i); + i--; + } else if (strcmp(argv[i], "--auto-resume") == 0) { + ft_params.auto_resume = true; + for (int j = i; j < argc - 1; j++) { + argv[j] = argv[j + 1]; + } + argc--; + i--; } } - LOG_INF("Using LoRA parameters: rank=%d, alpha=%.1f\n", lora_rank, lora_alpha); - for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { print_lora_usage(); } } + + return true; +} + +int main(int argc, char ** argv) { + common_params params; + finetune_params ft_params; + + params.escape = false; + parse_finetune_args(argc, argv, ft_params); if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { - print_lora_usage(); return 1; } + LOG_INF("Using LoRA parameters: rank=%d, alpha=%.1f\n", ft_params.lora_rank, ft_params.lora_alpha); + LOG_INF("Training for %d epochs\n", ft_params.num_epochs); + + // Handle 
checkpoint auto-resume before model initialization + if (ft_params.auto_resume && ft_params.resume_from_checkpoint.empty()) { + std::string latest_checkpoint = find_latest_checkpoint(ft_params.checkpoint_save_dir); + if (!latest_checkpoint.empty()) { + ft_params.resume_from_checkpoint = latest_checkpoint; + LOG_INF("Auto-resume: found checkpoint %s\n", ft_params.resume_from_checkpoint.c_str()); + } + } + + // Load checkpoint LoRA adapter from directory structure (model.gguf) + if (!ft_params.resume_from_checkpoint.empty()) { + std::filesystem::path checkpoint_dir(ft_params.resume_from_checkpoint); + std::filesystem::path model_path = checkpoint_dir / "model.gguf"; + + LOG_INF("Loading checkpoint LoRA adapter: %s\n", model_path.c_str()); + common_adapter_lora_info lora_adapter; + lora_adapter.path = model_path.string(); + lora_adapter.scale = 1.0f; + lora_adapter.ptr = nullptr; + params.lora_adapters.clear(); // Remove any existing adapters + params.lora_adapters.push_back(lora_adapter); + LOG_INF("Checkpoint LoRA adapter added to params\n"); + } + if (params.use_mmap) { LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__); params.use_mmap = false; @@ -151,15 +407,15 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - uint32_t target_modules = parse_lora_modules(lora_modules_str); + uint32_t target_modules = parse_lora_modules(ft_params.lora_modules_str); if (target_modules == 0) { return 1; } struct llama_lora_training_params lora_params = { /*target_modules =*/ target_modules, - /*rank =*/ lora_rank, - /*alpha =*/ lora_alpha, + /*rank =*/ ft_params.lora_rank, + /*alpha =*/ ft_params.lora_alpha, /*dropout =*/ 0.0f, /*init_std =*/ 0.02f, }; @@ -201,38 +457,132 @@ int main(int argc, char ** argv) { std::vector tokens = common_tokenize(ctx.get(), params.prompt, true); ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, 
llama_n_ctx(ctx.get())/2); + int start_epoch = 0; + int64_t start_step = 0; + checkpoint_metadata checkpoint_meta = {}; + bool checkpoint_loaded = false; + + if (!ft_params.resume_from_checkpoint.empty()) { + if (validate_checkpoint_metadata(ft_params.resume_from_checkpoint, checkpoint_meta)) { + start_epoch = checkpoint_meta.epoch; + checkpoint_loaded = true; + + if (checkpoint_meta.lora_rank != ft_params.lora_rank) { + LOG_ERR("Checkpoint LoRA rank (%d) doesn't match current rank (%d). Use --resume-from to manually specify a compatible checkpoint.\n", + checkpoint_meta.lora_rank, ft_params.lora_rank); + return 1; + } + if (checkpoint_meta.lora_alpha != ft_params.lora_alpha) { + LOG_ERR("Checkpoint LoRA alpha (%.3f) doesn't match current alpha (%.3f)\n", + checkpoint_meta.lora_alpha, ft_params.lora_alpha); + return 1; + } + if (checkpoint_meta.target_modules != target_modules) { + LOG_ERR("Checkpoint target_modules doesn't match current target_modules\n"); + return 1; + } + + } else { + LOG_ERR("Failed to load checkpoint, starting from scratch\n"); + } + } + struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr); optimizer_params.adamw.alpha = 1e-5f; // learning rate + std::string optimizer_checkpoint_path; + if (checkpoint_loaded && !ft_params.resume_from_checkpoint.empty()) { + std::filesystem::path checkpoint_dir(ft_params.resume_from_checkpoint); + optimizer_checkpoint_path = (checkpoint_dir / "optimizer.gguf").string(); + } + struct llama_opt_params lopt_params { - /*n_ctx_train =*/ 0, - /*param_filter =*/ llama_opt_param_filter_lora, - /*param_filter_ud =*/ nullptr, - /*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params, - /*get_opt_pars_ud =*/ &optimizer_params, - /*optimizer_type =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, + /*n_ctx_train =*/ 0, + /*param_filter =*/ llama_opt_param_filter_lora, + /*param_filter_ud =*/ nullptr, + /*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params, + /*get_opt_pars_ud =*/ 
&optimizer_params, + /*optimizer_type =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, + /*checkpoint_path =*/ checkpoint_loaded ? optimizer_checkpoint_path.c_str() : nullptr, + /*load_optimizer_state =*/ checkpoint_loaded, }; + llama_opt_init(ctx.get(), model.get(), lopt_params); - + + if (checkpoint_loaded) { + start_step = llama_opt_get_iter(ctx.get()); + } + + if (!trained_adapter) { + LOG_ERR("No trained adapter available for checkpointing\n"); + return 1; + } + const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split); + const int64_t training_batches_per_epoch = idata_split; + + if (start_step > 0) { + int64_t completed_epochs = start_step / training_batches_per_epoch; + start_epoch = (int)completed_epochs; + } + + checkpoint_callback_data cb_data = { + /*ctx =*/ ctx.get(), + /*adapter =*/ trained_adapter, + /*checkpoint_save_steps =*/ ft_params.checkpoint_save_steps, + /*checkpoint_save_dir =*/ ft_params.checkpoint_save_dir, + /*global_step =*/ start_step, + /*initial_step =*/ start_step, + /*current_epoch =*/ start_epoch, + /*lora_rank =*/ ft_params.lora_rank, + /*lora_alpha =*/ ft_params.lora_alpha, + /*target_modules =*/ target_modules, + /*learning_rate =*/ optimizer_params.adamw.alpha, + /*model_path =*/ params.model.path, + /*dataset_path =*/ params.prompt_file, + }; + g_checkpoint_data = &cb_data; ggml_opt_result_t result_train = ggml_opt_result_init(); ggml_opt_result_t result_eval = ggml_opt_result_init(); - for (int epoch = 0; epoch < 2; ++epoch) { + for (int epoch = start_epoch; epoch < ft_params.num_epochs; ++epoch) { + LOG_INF("Starting epoch %d (step %ld)\n", epoch, cb_data.global_step); + cb_data.current_epoch = epoch; + + int64_t resume_batch = 0; + if (start_step > 0 && epoch == start_epoch) { + resume_batch = start_step % training_batches_per_epoch; + } + + ggml_opt_epoch_callback train_callback = (ft_params.checkpoint_save_steps <= 0) ? 
+ ggml_opt_epoch_callback_progress_bar : checkpoint_progress_callback; + ggml_opt_epoch_callback eval_callback = (ft_params.checkpoint_save_steps <= 0) ? + ggml_opt_epoch_callback_progress_bar : checkpoint_progress_callback; + + if (resume_batch > 0) { + LOG_INF("Resuming training from epoch %d, step %ld \n", epoch, resume_batch); + } else if (ft_params.checkpoint_save_steps > 0) { + LOG_INF("Checkpointing enabled, saving every %d steps\n", ft_params.checkpoint_save_steps); + } else { + LOG_INF("Checkpointing disabled, using standard progress callback\n"); + } + llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split, - ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar); + train_callback, eval_callback, resume_batch); fprintf(stderr, "\n"); ggml_opt_result_reset(result_train); ggml_opt_result_reset(result_eval); } + + g_checkpoint_data = nullptr; ggml_opt_result_free(result_train); ggml_opt_result_free(result_eval); std::string adapter_filename; - if (!output_adapter_path.empty()) { - adapter_filename = output_adapter_path; + if (!ft_params.output_adapter_path.empty()) { + adapter_filename = ft_params.output_adapter_path; } else if (has_existing_lora) { adapter_filename = "finetuned-lora-adapter.gguf"; LOG_INF("Finetuned existing lora adapter, saving as: %s\n", adapter_filename.c_str()); diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index 561e61f8a21..bf179864281 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -62,14 +62,15 @@ int main(int argc, char ** argv) { ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.decay_epochs, (unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split); - struct llama_opt_params lopt_params{ - /*n_ctx_train =*/0, - // /*param_filter =*/llama_opt_param_filter_all, - llama_opt_param_filter_lora, - /*param_filter_ud =*/nullptr, - 
/*get_opt_pars =*/common_opt_lr_pars, - /*get_opt_pars_ud =*/&params.lr, - /*optimizer_type =*/params.optimizer, + struct llama_opt_params lopt_params { + /*n_ctx_train =*/ 0, + /*param_filter =*/ llama_opt_param_filter_all, + /*param_filter_ud =*/ nullptr, + /*get_opt_pars =*/ common_opt_lr_pars, + /*get_opt_pars_ud =*/ &params.lr, + /*optimizer_type =*/ params.optimizer, + /*checkpoint_path =*/ nullptr, + /*load_optimizer_state =*/ false, }; llama_opt_init(ctx.get(), model.get(), lopt_params); @@ -80,7 +81,7 @@ int main(int argc, char ** argv) { for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) { llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split, - ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar); + ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar, -1); fprintf(stderr, "\n"); ggml_opt_result_reset(result_train); diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 4703a05afe1..05f4482e414 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -154,6 +154,19 @@ extern "C" { // get the gradient accumulator for a node from the forward graph GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); + + // get optimizer state tensors (momentum and variance for AdamW) + GGML_API int64_t ggml_opt_get_iter(ggml_opt_context_t opt_ctx); + GGML_API void ggml_opt_set_iter(ggml_opt_context_t opt_ctx, int64_t iter); + GGML_API int32_t ggml_opt_get_nparams(ggml_opt_context_t opt_ctx); + GGML_API struct ggml_tensor * ggml_opt_get_grad_m(ggml_opt_context_t opt_ctx, int32_t index); + GGML_API struct ggml_tensor * ggml_opt_get_grad_v(ggml_opt_context_t opt_ctx, int32_t index); + + // ====== Optimizer State Persistence ====== + + GGML_API bool ggml_opt_save_state(ggml_opt_context_t opt_ctx, const char* filename); + GGML_API bool ggml_opt_load_state(ggml_opt_context_t opt_ctx, const char* filename); + GGML_API bool 
ggml_opt_load_tensors(ggml_opt_context_t opt_ctx, const char* filename); GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index e078ad14a39..4aad7cb154e 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -633,6 +633,35 @@ struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_t return ggml_graph_get_grad_acc(opt_ctx->gb_opt, node); } +int64_t ggml_opt_get_iter(ggml_opt_context_t opt_ctx) { + return opt_ctx->iter; +} + +void ggml_opt_set_iter(ggml_opt_context_t opt_ctx, int64_t iter) { + opt_ctx->iter = iter; +} + +int32_t ggml_opt_get_nparams(ggml_opt_context_t opt_ctx) { + if (!opt_ctx) { + return 0; + } + return (int32_t)opt_ctx->grad_m.size(); +} + +struct ggml_tensor * ggml_opt_get_grad_m(ggml_opt_context_t opt_ctx, int32_t index) { + if (index < 0 || index >= (int32_t)opt_ctx->grad_m.size()) { + return nullptr; + } + return opt_ctx->grad_m[index]; +} + +struct ggml_tensor * ggml_opt_get_grad_v(ggml_opt_context_t opt_ctx, int32_t index) { + if (index < 0 || index >= (int32_t)opt_ctx->grad_v.size()) { + return nullptr; + } + return opt_ctx->grad_v[index]; +} + // ====== Optimization Result ====== ggml_opt_result_t ggml_opt_result_init() { @@ -1091,3 +1120,128 @@ GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type o) { return "undefined"; }; } + +// ====== Optimizer State Persistence ====== + +bool ggml_opt_save_state(ggml_opt_context_t opt_ctx, const char* filename) { + if (!opt_ctx || !filename) { + return false; + } + + struct gguf_context * gguf_ctx = gguf_init_empty(); + if (!gguf_ctx) { + return false; + } + + gguf_set_val_str(gguf_ctx, "general.type", "optimizer"); + gguf_set_val_i64(gguf_ctx, "optimizer.iteration", ggml_opt_get_iter(opt_ctx)); + gguf_set_val_i32(gguf_ctx, "optimizer.n_params", ggml_opt_get_nparams(opt_ctx)); + + int32_t total_params = 
ggml_opt_get_nparams(opt_ctx); + + for (int32_t i = 0; i < total_params; ++i) { + struct ggml_tensor * grad_m = ggml_opt_get_grad_m(opt_ctx, i); + struct ggml_tensor * grad_v = ggml_opt_get_grad_v(opt_ctx, i); + if (grad_m) { + gguf_add_tensor(gguf_ctx, grad_m); + } + if (grad_v) { + gguf_add_tensor(gguf_ctx, grad_v); + } + } + + bool success = gguf_write_to_file(gguf_ctx, filename, false); + gguf_free(gguf_ctx); + + return success; +} + +bool ggml_opt_load_state(ggml_opt_context_t opt_ctx, const char* filename) { + if (!opt_ctx || !filename) { + return false; + } + + struct ggml_context * gguf_ctx = nullptr; + struct gguf_init_params gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &gguf_ctx, + }; + + struct gguf_context * gguf_context = gguf_init_from_file(filename, gguf_params); + if (!gguf_context) { + return false; + } + + int key_idx = gguf_find_key(gguf_context, "optimizer.iteration"); + if (key_idx >= 0) { + int64_t saved_iter = gguf_get_val_i64(gguf_context, key_idx); + ggml_opt_set_iter(opt_ctx, saved_iter); + } + + gguf_free(gguf_context); + return true; +} + +bool ggml_opt_load_tensors(ggml_opt_context_t opt_ctx, const char* filename) { + if (!opt_ctx || !filename) { + return false; + } + + struct ggml_context * gguf_ctx = nullptr; + struct gguf_init_params gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &gguf_ctx, + }; + + struct gguf_context * gguf_context = gguf_init_from_file(filename, gguf_params); + if (!gguf_context) { + return false; + } + + if (!gguf_ctx) { + gguf_free(gguf_context); + return false; + } + + int tensor_count = gguf_get_n_tensors(gguf_context); + int grad_m_loaded = 0, grad_v_loaded = 0; + + for (int i = 0; i < tensor_count; ++i) { + const char* tensor_name = gguf_get_tensor_name(gguf_context, i); + if (!tensor_name) continue; + + struct ggml_tensor* gguf_tensor = ggml_get_tensor(gguf_ctx, tensor_name); + if (!gguf_tensor) continue; + + int32_t n_params = ggml_opt_get_nparams(opt_ctx); + + for (int32_t 
param_idx = 0; param_idx < n_params; ++param_idx) { + struct ggml_tensor* grad_m = ggml_opt_get_grad_m(opt_ctx, param_idx); + struct ggml_tensor* grad_v = ggml_opt_get_grad_v(opt_ctx, param_idx); + + if (grad_m && strlen(grad_m->name) > 0 && strcmp(tensor_name, grad_m->name) == 0) { + if (ggml_nelements(grad_m) == ggml_nelements(gguf_tensor)) { + if (grad_m->data) { + ggml_backend_tensor_set(grad_m, gguf_tensor->data, 0, ggml_nbytes(grad_m)); + grad_m_loaded++; + } + } + break; + } + + if (grad_v && strlen(grad_v->name) > 0 && strcmp(tensor_name, grad_v->name) == 0) { + if (ggml_nelements(grad_v) == ggml_nelements(gguf_tensor)) { + if (grad_v->data) { + ggml_backend_tensor_set(grad_v, gguf_tensor->data, 0, ggml_nbytes(grad_v)); + grad_v_loaded++; + } + } + break; + } + } + + } + + gguf_free(gguf_context); + return (grad_m_loaded > 0 || grad_v_loaded > 0); +} diff --git a/include/llama.h b/include/llama.h index 136d7690e45..ec8c54c164d 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1383,6 +1383,10 @@ extern "C" { void * get_opt_pars_ud; // userdata for calculating optimizer parameters enum ggml_opt_optimizer_type optimizer_type; + + // Optional checkpoint loading + const char * checkpoint_path; // path to checkpoint file to load optimizer state from (nullptr = don't load) + bool load_optimizer_state; // whether to load optimizer state from checkpoint_path }; LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); @@ -1394,7 +1398,8 @@ extern "C" { ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); + ggml_opt_epoch_callback callback_eval, + int64_t resume_from_batch); // LoRA training parameters enum llama_lora_target_module { @@ -1427,12 +1432,21 @@ extern "C" { // LoRA parameter filter (returns true for LoRA tensors only) LLAMA_API bool llama_opt_param_filter_lora(const struct ggml_tensor * tensor, void * 
userdata); + + LLAMA_API int64_t llama_opt_get_iter(struct llama_context * ctx); LLAMA_API bool llama_lora_save_adapter( const struct llama_adapter_lora * adapter, const char * filename, const struct llama_model * model ); + + LLAMA_API bool llama_lora_save_checkpoint( + const struct llama_adapter_lora * adapter, + const char * filename, + const struct llama_model * model, + struct llama_context * ctx + ); #ifdef __cplusplus } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d7f84d1d36b..07f9cc1a364 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2102,6 +2102,16 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params } } } + + if (lopt_params.load_optimizer_state && lopt_params.checkpoint_path) { + if (opt_load_state(lopt_params.checkpoint_path)) { + pending_optimizer_checkpoint_path = lopt_params.checkpoint_path; + should_load_optimizer_tensors = true; + optimizer_tensors_loaded = false; + } else { + LLAMA_LOG_ERROR("Failed to load optimizer state from: %s\n", lopt_params.checkpoint_path); + } + } } void llama_context::opt_epoch_iter( @@ -2189,6 +2199,17 @@ void llama_context::opt_epoch_iter( } ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits()); ggml_opt_alloc(opt_ctx, train); + + // Load optimizer tensors on first training iteration if pending + if (train && should_load_optimizer_tensors && !optimizer_tensors_loaded) { + if (ggml_opt_load_tensors(opt_ctx, pending_optimizer_checkpoint_path.c_str())) { + LLAMA_LOG_INFO("Successfully loaded optimizer state tensor data\n"); + optimizer_tensors_loaded = true; + } else { + LLAMA_LOG_ERROR("Failed to load optimizer tensor data\n"); + } + should_load_optimizer_tensors = false; // Only try once + } res->set_inputs(&ubatch); { @@ -2219,7 +2240,8 @@ void llama_context::opt_epoch( ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval) { + 
ggml_opt_epoch_callback callback_eval, + int64_t resume_from_batch) { const uint32_t n_ctx = this->n_ctx(); const uint32_t n_batch = std::min(cparams.n_batch, n_ctx); const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch); @@ -2234,7 +2256,7 @@ void llama_context::opt_epoch( std::vector tokens(n_ctx); std::vector labels_sparse(n_ctx); - int64_t idata = 0; + int64_t idata = (resume_from_batch >= 0) ? resume_from_batch + 1 : 0; int64_t t_loop_start = ggml_time_us(); int64_t ndata_in_loop = idata_split*ubatch_per_ctx; @@ -2261,6 +2283,24 @@ void llama_context::opt_epoch( llama_batch_free(batch); } +int64_t llama_context::opt_get_iter() { + return ggml_opt_get_iter(opt_ctx); +} + +bool llama_context::opt_save_state(const char* filename) { + if (!opt_ctx) { + return false; + } + return ggml_opt_save_state(opt_ctx, filename); +} + +bool llama_context::opt_load_state(const char* filename) { + if (!opt_ctx) { + return false; + } + return ggml_opt_load_state(opt_ctx, filename); +} + // // interface implementation // @@ -2803,12 +2843,19 @@ void llama_opt_epoch( ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval) { + ggml_opt_epoch_callback callback_eval, + int64_t resume_from_batch) { + // Use the unified API that handles both normal and resume cases ctx->opt_epoch( dataset, result_train, result_eval, idata_split, callback_train, - callback_eval); + callback_eval, + resume_from_batch); +} + +int64_t llama_opt_get_iter(struct llama_context * ctx) { + return ctx->opt_get_iter(); } diff --git a/src/llama-context.h b/src/llama-context.h index f23aa8ee136..bbb8ab32272 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -157,7 +157,15 @@ struct llama_context { ggml_opt_result_t result_eval, int64_t idata_split, ggml_opt_epoch_callback callback_train, - ggml_opt_epoch_callback callback_eval); + ggml_opt_epoch_callback callback_eval, + int64_t resume_from_batch = -1); + + // Optimizer 
state access for checkpointing (delegated to ggml_opt API) + int64_t opt_get_iter(); + + // Optimizer state persistence + bool opt_save_state(const char* filename); + bool opt_load_state(const char* filename); void opt_epoch_iter( ggml_opt_dataset_t dataset, @@ -262,6 +270,11 @@ struct llama_context { // training ggml_opt_context_t opt_ctx = nullptr; + + // optimizer state loading (deferred until after ggml_opt_build) + std::string pending_optimizer_checkpoint_path; + bool should_load_optimizer_tensors = false; + bool optimizer_tensors_loaded = false; ggml_threadpool_t threadpool = nullptr; ggml_threadpool_t threadpool_batch = nullptr; diff --git a/src/llama-lora-training.cpp b/src/llama-lora-training.cpp index e7db81d591f..7ddc5307792 100644 --- a/src/llama-lora-training.cpp +++ b/src/llama-lora-training.cpp @@ -1,11 +1,8 @@ #include "llama-lora-training.h" #include -#include #include -#include -#include -#include +#include ggml_context * llama_lora_create_context(size_t mem_size) { @@ -357,3 +354,44 @@ bool llama_lora_save_adapter( gguf_free(gguf_ctx); return success; } + +bool llama_lora_save_checkpoint( + const struct llama_adapter_lora * adapter, + const char * checkpoint_path, + const struct llama_model * model, + struct llama_context * ctx +) { + if (!adapter || !checkpoint_path || !model || !ctx) { + LLAMA_LOG_ERROR("llama_lora_save_checkpoint: invalid parameters\n"); + return false; + } + + std::filesystem::path checkpoint_dir = std::filesystem::path(checkpoint_path); + if (!checkpoint_dir.empty()) { + if (!std::filesystem::exists(checkpoint_dir)) { + if (!std::filesystem::create_directories(checkpoint_dir)) { + LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to create checkpoint directory: %s\n", + checkpoint_dir.c_str()); + return false; + } + } + } + + std::filesystem::path model_path = checkpoint_dir / "model.gguf"; + bool lora_saved = llama_lora_save_adapter(adapter, model_path.c_str(), model); + if (!lora_saved) { + 
LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to save LoRA adapter weights to %s\n", + model_path.c_str()); + return false; + } + + std::filesystem::path optimizer_path = checkpoint_dir / "optimizer.gguf"; + bool optimizer_saved = ctx->opt_save_state(optimizer_path.c_str()); + if (!optimizer_saved) { + LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to save optimizer state to %s\n", + optimizer_path.c_str()); + return false; + } + + return true; +} diff --git a/src/llama-lora-training.h b/src/llama-lora-training.h index ed777be7b36..3dd09ee58de 100644 --- a/src/llama-lora-training.h +++ b/src/llama-lora-training.h @@ -4,6 +4,7 @@ #include "llama-model.h" #include "llama-adapter.h" #include "llama-impl.h" +#include "llama-context.h" #include "ggml.h" From d17ba354bd907b8bc672d650040ba7d64d01715f Mon Sep 17 00:00:00 2001 From: Marcus Edel Date: Thu, 9 Oct 2025 11:21:13 -0400 Subject: [PATCH 2/4] Use the correct function call for the IM2COL_3D op. Signed-off-by: Marcus Edel --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 540254f31da..dbaa441b956 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -13176,16 +13176,16 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * } else if (tensor->op == GGML_OP_IM2COL_3D) { const int32_t s0 = tensor->op_params[0]; const int32_t s1 = tensor->op_params[1]; - const int32_t s1 = tensor->op_params[2]; + const int32_t s2 = tensor->op_params[2]; const int32_t p0 = tensor->op_params[3]; const int32_t p1 = tensor->op_params[4]; - const int32_t p1 = tensor->op_params[5]; + const int32_t p2 = tensor->op_params[5]; const int32_t d0 = tensor->op_params[6]; const int32_t d1 = tensor->op_params[7]; - const int32_t d1 = tensor->op_params[8]; + const int32_t d2 = tensor->op_params[8]; const int32_t IC = 
tensor->op_params[9]; - tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type); + tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type); } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) { const int32_t dim = tensor->op_params[0]; const int32_t max_period = tensor->op_params[1]; From 947d3f9cd0f90341ac7b7832b7200024f6df719f Mon Sep 17 00:00:00 2001 From: Marcus Edel Date: Thu, 9 Oct 2025 11:22:15 -0400 Subject: [PATCH 3/4] Change format types to resolve warnings. Signed-off-by: Marcus Edel --- examples/training/finetune-lora.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/training/finetune-lora.cpp b/examples/training/finetune-lora.cpp index 0f166757246..b307dd34126 100644 --- a/examples/training/finetune-lora.cpp +++ b/examples/training/finetune-lora.cpp @@ -259,7 +259,7 @@ static void checkpoint_progress_callback( std::string checkpoint_path = get_checkpoint_filename(cb_data->checkpoint_save_dir, cb_data->global_step); if (!save_checkpoint(cb_data->ctx, cb_data->adapter, meta, checkpoint_path)) { - LOG_ERR("Failed to save checkpoint at step %ld\n", cb_data->global_step); + LOG_ERR("Failed to save checkpoint at step %lld\n", (long long)cb_data->global_step); } } } @@ -547,7 +547,7 @@ int main(int argc, char ** argv) { ggml_opt_result_t result_eval = ggml_opt_result_init(); for (int epoch = start_epoch; epoch < ft_params.num_epochs; ++epoch) { - LOG_INF("Starting epoch %d (step %ld)\n", epoch, cb_data.global_step); + LOG_INF("Starting epoch %d (step %lld)\n", epoch, (long long)cb_data.global_step); cb_data.current_epoch = epoch; int64_t resume_batch = 0; @@ -561,7 +561,7 @@ int main(int argc, char ** argv) { ggml_opt_epoch_callback_progress_bar : checkpoint_progress_callback; if (resume_batch > 0) { - LOG_INF("Resuming training from epoch %d, step %ld \n", epoch, resume_batch); + 
LOG_INF("Resuming training from epoch %d, step %lld \n", epoch, (long long)resume_batch); } else if (ft_params.checkpoint_save_steps > 0) { LOG_INF("Checkpointing enabled, saving every %d steps\n", ft_params.checkpoint_save_steps); } else { From 4610e02ce6563e626a2c4aae065d54c49b9049ec Mon Sep 17 00:00:00 2001 From: Marcus Edel Date: Thu, 9 Oct 2025 12:23:39 -0400 Subject: [PATCH 4/4] Fix cross-platform compilation errors: Windows filesystem path conversion using .string().c_str(). Signed-off-by: Marcus Edel --- src/llama-lora-training.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama-lora-training.cpp b/src/llama-lora-training.cpp index 7ddc5307792..79edc62c70b 100644 --- a/src/llama-lora-training.cpp +++ b/src/llama-lora-training.cpp @@ -371,25 +371,25 @@ bool llama_lora_save_checkpoint( if (!std::filesystem::exists(checkpoint_dir)) { if (!std::filesystem::create_directories(checkpoint_dir)) { LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to create checkpoint directory: %s\n", - checkpoint_dir.c_str()); + checkpoint_dir.string().c_str()); return false; } } } std::filesystem::path model_path = checkpoint_dir / "model.gguf"; - bool lora_saved = llama_lora_save_adapter(adapter, model_path.c_str(), model); + bool lora_saved = llama_lora_save_adapter(adapter, model_path.string().c_str(), model); if (!lora_saved) { LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to save LoRA adapter weights to %s\n", - model_path.c_str()); + model_path.string().c_str()); return false; } std::filesystem::path optimizer_path = checkpoint_dir / "optimizer.gguf"; - bool optimizer_saved = ctx->opt_save_state(optimizer_path.c_str()); + bool optimizer_saved = ctx->opt_save_state(optimizer_path.string().c_str()); if (!optimizer_saved) { LLAMA_LOG_ERROR("llama_lora_save_checkpoint: failed to save optimizer state to %s\n", - optimizer_path.c_str()); + optimizer_path.string().c_str()); return false; }