
Commit 5a22882

finetune.cpp command-line arg
Add a learning-rate (AdamW alpha) command-line arg and an optimizer enum (defaulting to AdamW) to ggml-opt, in preparation for SGD support. These live in common args as a set of optimizer options active only for the new FINETUNE example, which also keeps all the PERPLEXITY options that finetune.cpp previously used, as a precaution. Perhaps breaking with precedent, the ggml_opt_optimizer_params struct is included directly in the args; if desired, we can instead add just the learning rate and optimizer type to a struct independent of ggml-opt.h, as proposed in #13835.
1 parent e0e3aa2 commit 5a22882

5 files changed: +51, -15 lines


common/arg.cpp

Lines changed: 34 additions & 12 deletions
@@ -1095,6 +1095,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-embedding",
         "llama-eval-callback",
         "llama-export-lora",
+        "llama-finetune",
         "llama-gen-docs",
         "llama-gguf",
         "llama-gguf-hash",
@@ -1239,6 +1240,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     sampler_type_names.pop_back();


+    params.optimize = ggml_opt_get_default_optimizer_params(NULL);
+    params.optimize.alpha = 1e-8; // default 1e-3 is much too high for LLAMA_EXAMPLE_FINETUNE
+
     /**
      * filter options by example
      * rules:
@@ -1472,14 +1476,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.ctx_shift = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
        [](common_params & params, int value) {
            params.n_chunks = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
        {"-fa", "--flash-attn"},
        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
@@ -2117,70 +2121,88 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.hellaswag = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--hellaswag-tasks"}, "N",
        string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
        [](common_params & params, int value) {
            params.hellaswag_tasks = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--winogrande"},
        "compute Winogrande score over random tasks from datafile supplied with -f",
        [](common_params & params) {
            params.winogrande = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--winogrande-tasks"}, "N",
        string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
        [](common_params & params, int value) {
            params.winogrande_tasks = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--multiple-choice"},
        "compute multiple choice score over random tasks from datafile supplied with -f",
        [](common_params & params) {
            params.multiple_choice = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--multiple-choice-tasks"}, "N",
        string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
        [](common_params & params, int value) {
            params.multiple_choice_tasks = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--kl-divergence"},
        "computes KL-divergence to logits provided via --kl-divergence-base",
        [](common_params & params) {
            params.kl_divergence = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
        "set logits file",
        [](common_params & params, const std::string & value) {
            params.logits_file = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--ppl-stride"}, "N",
        string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
        [](common_params & params, int value) {
            params.ppl_stride = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"--ppl-output-type"}, "<0|1>",
        string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
        [](common_params & params, int value) {
            params.ppl_output_type = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}));
+    add_opt(common_arg(
+       {"-lr", "-alpha", "--alpha", "--learning-rate"}, "ALPHA",
+       string_format("adamw optimizer alpha (default: %.1f)", (double)params.optimize.adamw.alpha),
+       [](common_params & params, const std::string & value) {
+           params.optimize.adamw.alpha = std::stof(value);
+       }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+    add_opt(common_arg(
+       {"-opt", "--optimizer"}, "N",
+       "adamw (N=0) or //TODO:SGD (N=1)",
+       [](common_params & params, int N) {
+           if (N == GGML_OPT_OPTIMIZER_SGD)
+               throw std::invalid_argument("TODO: implement SGD");
+           if (N >= GGML_OPT_OPTIMIZER_COUNT)
+               throw std::invalid_argument("invalid --optimizer N (try 0)");
+           params.optimize.optimizer = (enum ggml_opt_optimizer)N;
+       }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
        {"-dt", "--defrag-thold"}, "N",
        string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),

common/common.h

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,7 @@
 #pragma once

 #include "llama-cpp.h"
+#include "ggml-opt.h"

 #include <set>
 #include <string>
@@ -80,6 +81,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_FINETUNE,

     LLAMA_EXAMPLE_COUNT,
 };
@@ -349,6 +351,8 @@ struct common_params {
     bool no_mmproj = false;          // explicitly disable multimodal model
     std::vector<std::string> image;  // path to image file(s)

+    // finetune
+    struct ggml_opt_optimizer_params optimize;
     // embedding
     bool embedding = false;     // get only sentence embedding
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)

examples/training/finetune.cpp

Lines changed: 3 additions & 3 deletions
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {

     params.escape = false;

-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
         return 1;
     }

@@ -60,8 +60,8 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
     ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);

-    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
-    optimizer_params.adamw.alpha = 1e-7f; // learning rate
+    struct ggml_opt_optimizer_params &optimizer_params = params.optimize;
+    LOG_INF("-optimizer %d -lr: %.1f", optimizer_params.optimizer, (double)optimizer_params.adamw.alpha);

     struct llama_opt_params lopt_params {
         /*n_ctx_train =*/ 0,
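
With the reference now bound to params.optimize, the values chosen via -lr/-opt are what the training loop's parameter callback returns. The hunk cuts off after n_ctx_train, so the remaining initializer fields below are an assumption based on the existing training example (llama_opt_params in llama.h and ggml_opt_get_constant_optimizer_params in ggml-opt.h), shown only to illustrate the hand-off:

    // sketch: forwarding the parsed optimizer params through the constant-params callback;
    // ggml_opt_get_constant_optimizer_params returns the struct its userdata points to
    struct llama_opt_params lopt_params {
        /*n_ctx_train     =*/ 0,
        /*param_filter    =*/ llama_opt_param_filter_all,
        /*param_filter_ud =*/ nullptr,
        /*get_opt_pars    =*/ ggml_opt_get_constant_optimizer_params,
        /*get_opt_pars_ud =*/ &optimizer_params, // i.e. &params.optimize after this change
    };
    llama_opt_init(ctx.get(), model.get(), lopt_params);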

ggml/include/ggml-opt.h

Lines changed: 9 additions & 0 deletions
@@ -74,6 +74,14 @@ extern "C" {
         GGML_OPT_BUILD_TYPE_OPT = 30,
     };

+    enum ggml_opt_optimizer
+    {
+        GGML_OPT_OPTIMIZER_ADAMW,
+        GGML_OPT_OPTIMIZER_SGD,
+
+        GGML_OPT_OPTIMIZER_COUNT
+    };
+
     // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
     struct ggml_opt_optimizer_params {
         // AdamW optimizer parameters
@@ -84,6 +92,7 @@ extern "C" {
             float eps;   // epsilon for numerical stability
             float wd;    // weight decay for AdamW, use 0.0f to disable
         } adamw;
+        enum ggml_opt_optimizer optimizer;
     };

     // callback to calculate optimizer parameters prior to a backward pass
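
For callers outside the finetune example, the new field travels through the optimizer-parameter callback referenced in the context line above. A minimal sketch, assuming only the declarations in this header; the function name and userdata layout are illustrative, not part of this commit:

    // sketch: a parameter callback that fills in the new optimizer field
    #include "ggml-opt.h"

    static struct ggml_opt_optimizer_params finetune_opt_pars(void * userdata) {
        struct ggml_opt_optimizer_params p = ggml_opt_get_default_optimizer_params(NULL);
        p.adamw.alpha = *(const float *) userdata; // learning rate supplied by the caller
        p.optimizer   = GGML_OPT_OPTIMIZER_ADAMW;  // SGD is declared here but still TODO in this commit
        return p;
    }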

ggml/src/ggml-opt.cpp

Lines changed: 1 addition & 0 deletions
@@ -228,6 +228,7 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
     result.adamw.beta2 = 0.999f;
     result.adamw.eps   = 1e-8f;
     result.adamw.wd    = 0.0f;
+    result.optimizer   = GGML_OPT_OPTIMIZER_ADAMW;

     return result;
 }
