Support a new finetune arg -opt SGD (or sgd). Result with llama 3.2-1b-F32:
observed 11 GB GPU RAM (45 sec/epoch) using SGD instead of
19 GB (55 sec/epoch) using AdamW.
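(Rough consistency check, not a measurement: AdamW keeps two extra F32 moment tensors, m and v, per trained parameter, i.e. about 8 bytes/param, which for a ~1.2B-parameter model is on the order of 9-10 GB of optimizer state - the same ballpark as the ~8 GB difference observed above.)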
(Getting the right learning rate for SGD is trickier than for AdamW:
too high and you overshoot and oscillate; too low and you waste compute
slowly approaching convergence.)
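For intuition, a minimal generic sketch of the two update rules (illustrative only, not the ggml implementation): SGD's step is alpha times the raw gradient, so alpha has to be tuned to the gradient scale, while AdamW normalizes by a running estimate of the gradient magnitude, which makes its effective step size much less sensitive to alpha.

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Plain SGD: the step is alpha * g, so alpha must match the gradient scale.
void sgd_step(std::vector<float> & w, const std::vector<float> & g, float alpha) {
    for (size_t i = 0; i < w.size(); ++i) {
        w[i] -= alpha * g[i];
    }
}

// AdamW (weight decay omitted): the gradient is normalized by sqrt(v), an
// estimate of its own magnitude, so alpha acts more like an absolute step
// size and is easier to choose. m and v are the extra per-parameter state
// that SGD avoids allocating.
void adamw_step(std::vector<float> & w, const std::vector<float> & g,
                std::vector<float> & m, std::vector<float> & v,
                float alpha, float beta1, float beta2, float eps, int64_t t) {
    for (size_t i = 0; i < w.size(); ++i) {
        m[i] = beta1 * m[i] + (1.0f - beta1) * g[i];
        v[i] = beta2 * v[i] + (1.0f - beta2) * g[i] * g[i];
        const float mhat = m[i] / (1.0f - std::pow(beta1, (float) t));
        const float vhat = v[i] / (1.0f - std::pow(beta2, (float) t));
        w[i] -= alpha * mhat / (std::sqrt(vhat) + eps);
    }
}
```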
Both SGD and AdamW quickly reach 99%+ train accuracy.
Note: the objective loss may not be directly comparable between AdamW and SGD -
check perplexity or accuracy, or compare relative improvements,
when judging convergence.
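(Assuming the reported loss is mean per-token cross-entropy, perplexity is just exp(loss), e.g. exp(0.00231) ≈ 1.002 for the train run below, and that carries the same meaning under either optimizer.)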
Also note that a logical batch size larger than the physical batch (i.e. gradient
accumulation) seems unsupported in the optimization path: the logical batch is
limited to the physical one, unlike in perplexity (ppx), and is also limited to ctx-size.
Training quality/convergence could be improved
by implementing it, at the cost of some memory, but you can make that up
by using a much smaller physical batch for a net memory savings.
Presumably it's the physical batch that should be limited to ctx-size?
See llama_context::opt_epoch. A generic sketch of the idea follows.
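For reference, a generic sketch of gradient accumulation (hypothetical code, not the llama.cpp/ggml API): gradients from several small physical batches are averaged before a single optimizer step, so the optimizer sees a logical batch that many times larger while peak activation memory stays that of one physical batch; the extra cost is one gradient accumulation buffer.

```cpp
#include <cstddef>
#include <functional>
#include <vector>

// Hypothetical sketch: one logical batch = several physical (micro) batches.
// compute_grad(b) stands in for a forward+backward pass over physical batch b
// and returns per-parameter gradients.
void sgd_step_with_accumulation(
        std::vector<float> & params,
        const std::vector<int> & micro_batches, // physical batches making up one logical batch
        const std::function<std::vector<float>(int)> & compute_grad,
        float alpha) {
    std::vector<float> grad_accum(params.size(), 0.0f);
    for (int b : micro_batches) {
        const std::vector<float> g = compute_grad(b); // activations for batch b can be freed afterwards
        for (size_t i = 0; i < g.size(); ++i) {
            grad_accum[i] += g[i] / (float) micro_batches.size(); // average over the logical batch
        }
    }
    // a single optimizer update per logical batch (plain SGD here)
    for (size_t i = 0; i < params.size(); ++i) {
        params[i] -= alpha * grad_accum[i];
    }
}
```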
New finetune args: -wd 1e-9 enables weight decay in SGD or AdamW,
and -epochs N sets the maximum number of epochs (default 2, as before).
Cache (1 - wd*alpha) in the 'adamw' opt struct -
no noticeable perf benefit.
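(For context, a generic sketch of that factor with decoupled weight decay, not the ggml code: hoisting the multiplier out of the element loop still leaves the same per-element arithmetic, which is consistent with there being no measurable win.)

```cpp
#include <cstddef>

// Decoupled weight decay folded into a single "keep" factor (illustrative sketch).
void sgd_wd_step(float * w, const float * g, size_t n, float alpha, float wd) {
    const float keep = 1.0f - alpha * wd;  // computed once per step instead of per element
    for (size_t i = 0; i < n; ++i) {
        w[i] = keep * w[i] - alpha * g[i]; // per-element cost is unchanged by the caching
    }
}
```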
Cache the computed per-epoch optimizer opts
(formerly they were computed twice per epoch).
Add unit-tested GGML_OPT_OPTIMIZER_SGD to ggml - it avoids allocating the
m, v moment tensors. Make ggml_opt_init aware of the optimization method.
Since optimizer memory is pre-allocated, the ggml_opt_get_optimizer_params
callback could probably switch between SGD and AdamW from one epoch to the next,
but it would need to use AdamW for the first epoch (unconfirmed - there is no arg
to set such a policy yet).
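Conceptually, a hypothetical per-epoch policy might look like the sketch below (illustrative only - this is not the real ggml_opt_get_optimizer_params signature or parameter struct):

```cpp
// Hypothetical per-epoch hyperparameter selection; the real callback in
// ggml-opt has its own struct and signature.
enum class Optimizer { ADAMW, SGD };

struct OptChoice {
    Optimizer opt;
    float     alpha; // learning rate
    float     wd;    // weight decay
};

// Use AdamW for epoch 0 (so its m/v state exists), then drop to plain SGD.
OptChoice opt_choice_for_epoch(int epoch) {
    OptChoice c;
    c.opt   = (epoch == 0) ? Optimizer::ADAMW : Optimizer::SGD;
    c.alpha = (epoch == 0) ? 1e-4f : 1e-3f;  // SGD typically wants a different LR scale
    c.wd    = 1e-9f;
    return c;
}
```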
Training on 100 lines of wikipedia:
```
train: ... loss=0.00231±0.00032 acc=99.99±0.01% t=00:00:05
val: ... loss=3.91926±nan acc=58.40±2.18%
```
On more training data (500 lines), there is additional catastrophic forgetting
before train accuracy reaches 99.9%:
```
train: data=0000140/0000140 loss=0.02611±0.00077 acc=99.82±0.02% t=00:00:45
val: data=0000008/0000008 loss=4.11112±0.22526 acc=46.36±0.78%
```
Increasing batch + ctx sizes to 1536 (double what fits in memory for
AdamW) gets apparently better validation, but that could be an artifact
of continuing training from the previous weights, i.e. which data lands
in train vs. val probably depends on the batch size. Also amusing: it runs
faster due to the larger batch, even though the larger context should be slower?:
```
train: data=0000045/0000045 loss=0.01722±0.00103 acc=99.90±0.01% t=00:00:40
val: data=0000003/0000003 loss=1.96829±1.09488 acc=72.44±0.66%
```