
Commit 085f870

examples/finetune -opt SGD (stochastic gradient descent) memory opt
Support the finetune arg -opt SGD (or sgd). llama 3.2-1b-F32 result: observed 11 GB GPU RAM (45 s/epoch) when using SGD instead of 19 GB (55 s/epoch) using AdamW. (Getting the right learning rate for SGD is trickier than for AdamW - too high and you overshoot and oscillate, too low and you waste compute slowly approaching convergence.) Either SGD or AdamW quickly reaches 99%+ train accuracy.

Note: objective loss is not directly comparable between AdamW and SGD - check perplexity or accuracy, or consider relative improvements, when judging convergence. Also note that a logical batch size larger than the physical batch (gradient accumulation) seems unsupported for optimization (it is limited to the physical batch, unlike in perplexity - and also limited to ctx-size). Training quality/convergence could be improved by implementing it, at the cost of some memory, but that can be made up by using a much smaller physical batch for a net memory savings. Presumably it is the physical batch that should be limited to ctx-size? See llama_context::opt_epoch.

New finetune args: -wd 1e-9 to enable weight decay in SGD or AdamW, and -epochs N to set the maximum number of epochs (default 2, as before).

Cache (1 - wd*alpha) in the 'adamw' opt struct - no noticeable perf benefit. Cache the per-epoch computed optimizer opts (formerly they were computed twice per epoch).

Add unit-tested GGML_OPT_OPTIMIZER_SGD to ggml - it avoids allocating the m, v tensors. Make ggml_opt_init aware of the optimization method. Since optimizer memory is pre-allocated, the ggml_opt_get_optimizer_params callback would probably be able to switch between SGD and AdamW with each epoch, but it would need to use AdamW for the first (unconfirmed - no arg to set such a policy yet).

100 lines of wikipedia train:

train: ... loss=0.00231±0.00032 acc=99.99±0.01% t=00:00:05
val:   ... loss=3.91926±nan acc=58.40±2.18%

On more training data (500 lines), additional catastrophic forgetting before train reaches 99.9% accuracy:

train: data=0000140/0000140 loss=0.02611±0.00077 acc=99.82±0.02% t=00:00:45
val:   data=0000008/0000008 loss=4.11112±0.22526 acc=46.36±0.78%

Increasing batch+ctx sizes to 1536 (double what fits in memory for AdamW) gets apparently better validation, but that could be an artifact of continuing training from the previous weights, i.e. what counts as train vs. val probably depends on the batch size. Also amusing - faster due to the larger batch, even though the larger context should be slower?

train: data=0000045/0000045 loss=0.01722±0.00103 acc=99.90±0.01% t=00:00:40
val:   data=0000003/0000003 loss=1.96829±1.09488 acc=72.44±0.66%
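For reference, the SGD step this commit adds (see ggml/src/ggml-cpu/ops.cpp below) reduces to plain gradient descent with decoupled weight decay. A minimal per-element sketch - the helper name is illustrative, not part of the commit:

// Sketch: the per-element SGD update implemented in this commit, with `keep`
// caching (1 - alpha*wd) so the decayed weight costs a single multiply.
static void sgd_step(float * w, const float * g, int n, float alpha, float wd) {
    const float keep = 1.0f - alpha * wd;  // 1.0f when weight decay is disabled
    for (int i = 0; i < n; ++i) {
        w[i] = w[i] * keep - alpha * g[i]; // decoupled weight decay, then gradient step
    }
}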
1 parent ed5bda6 commit 085f870

File tree

20 files changed: +1703 -1631 lines

.clang-format

Lines changed: 1 addition & 3 deletions
@@ -23,7 +23,7 @@ AllowShortLambdasOnASingleLine: Inline
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakBeforeMultilineStrings: true
 BinPackArguments: true
-BinPackParameters: true # OnePerLine
+BinPackParameters: true
 BitFieldColonSpacing: Both
 BreakBeforeBraces: Custom # Attach
 BraceWrapping:
@@ -45,7 +45,6 @@ BraceWrapping:
 SplitEmptyFunction: false
 SplitEmptyRecord: false
 SplitEmptyNamespace: false
-# BreakAdjacentStringLiterals: true
 BreakAfterAttributes: Never
 BreakBeforeBinaryOperators: None
 BreakBeforeInlineASMColon: OnlyMultiline
@@ -158,4 +157,3 @@ TabWidth: 4
 UseTab: Never
 WhitespaceSensitiveMacros: ['STRINGIZE']
 ...
-

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
 

common/arg.cpp

Lines changed: 1375 additions & 1548 deletions
Large diffs are not rendered by default.

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -354,6 +354,8 @@ struct common_params {
 
     // finetune
     struct ggml_opt_optimizer_params optimize;
+    unsigned epochs = 2;
+
     // embedding
     bool embedding = false; // get only sentence embedding
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)

examples/training/finetune.cpp

Lines changed: 11 additions & 3 deletions
@@ -38,7 +38,6 @@ int main(int argc, char ** argv) {
     common_init();
     llama_backend_init();
     llama_numa_init(params.numa);
-
     // load the model and apply lora adapter, if any
     common_init_result llama_init = common_init_from_params(params);
     llama_model_ptr & model = llama_init.model;
@@ -61,7 +60,16 @@
     ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
 
     struct ggml_opt_optimizer_params & optimizer_params = params.optimize;
-    LOG_INF("-optimizer %d -lr: %.1f", optimizer_params.optimizer, (double) optimizer_params.adamw.alpha);
+    if (optimizer_params.optimizer == GGML_OPT_OPTIMIZER_SGD) {
+        double was = (double) optimizer_params.common.alpha;
+        double by = 1e2;
+        double to = was * by;
+        LOG_INF("sgd multiplying -lr by %.3g (no momentum) from -lr: %.2g to %.2g\n", by, was, to);
+        optimizer_params.common.alpha = to;
+    }
+
+    LOG_INF("-optimizer %s -lr %.2g -wd %.2g -epochs %d\n", ggml_opt_optimizer_name(optimizer_params.optimizer),
+            (double) optimizer_params.common.alpha, (double) optimizer_params.common.wd, params.epochs);
 
     struct llama_opt_params lopt_params {
         /*n_ctx_train =*/ 0,
@@ -77,7 +85,7 @@
     ggml_opt_result_t result_train = ggml_opt_result_init();
     ggml_opt_result_t result_eval = ggml_opt_result_init();
 
-    for (int epoch = 0; epoch < 2; ++epoch) {
+    for (unsigned epoch = 0; epoch < params.epochs; ++epoch) {
         llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
                         ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
         fprintf(stderr, "\n");

ggml/include/ggml-opt.h

Lines changed: 16 additions & 10 deletions
@@ -74,28 +74,34 @@ extern "C" {
         GGML_OPT_BUILD_TYPE_OPT = 30,
     };
 
-    enum ggml_opt_optimizer {
+    enum ggml_opt_optimizer_type {
         GGML_OPT_OPTIMIZER_ADAMW,
         GGML_OPT_OPTIMIZER_SGD,
 
         GGML_OPT_OPTIMIZER_COUNT
     };
 
     // "adamw" or "sgd" (case insensitive)
-    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer);
-    GGML_API enum ggml_opt_optimizer named_ggml_opt_optimizer(const char *);
+    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
+    GGML_API enum ggml_opt_optimizer_type ggml_opt_get_optimizer(const char *);
 
     // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
     struct ggml_opt_optimizer_params {
-        // AdamW optimizer parameters
+        // SGD and AdamW optimizer parameters
+        struct {
+            float alpha; // learning rate
+            float wd; // weight decay for SGD or AdamW, use 0.0f to disable
+        } common;
+
         struct {
             float alpha; // learning rate
-            float beta1;
-            float beta2;
-            float eps; // epsilon for numerical stability
-            float wd; // weight decay for AdamW, use 0.0f to disable
+            float beta1; // adamw
+            float beta2; // adamw
+            float eps; // epsilon for numerical stability
         } adamw;
-        enum ggml_opt_optimizer optimizer;
+
+        // only GGML_OPT_OPTIMIZER_ADAMW allocates m, v per parameter
+        enum ggml_opt_optimizer_type optimizer;
     };
 
     // callback to calculate optimizer parameters prior to a backward pass
@@ -125,7 +131,7 @@ extern "C" {
         int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
 
         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
-        void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+        void * get_opt_pars_ud; // userdata for calculating optimizer parameters
     };
 
     // get parameters for an optimization context with defaults set where possible

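As context for the struct change above (not part of the diff): a minimal sketch of filling the new common/adamw split for an SGD run, using only the fields shown in this header. The helper name is illustrative, and the callback shape (userdata pointer in, struct returned by value) is an assumption modeled on the get_opt_pars callback mentioned above.

#include "ggml-opt.h"

// Sketch: optimizer parameters for an SGD run, mirroring the get_opt_pars
// callback shape (assumed: void * userdata in, struct returned by value).
static struct ggml_opt_optimizer_params sgd_opt_pars(void * userdata) {
    (void) userdata;                          // unused in this sketch
    struct ggml_opt_optimizer_params p = {0};
    p.optimizer    = GGML_OPT_OPTIMIZER_SGD;
    p.common.alpha = 1e-3f;                   // learning rate (-lr)
    p.common.wd    = 1e-9f;                   // weight decay (-wd), 0.0f to disable
    // p.adamw.beta1 / beta2 / eps are only consulted for GGML_OPT_OPTIMIZER_ADAMW
    return p;
}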
ggml/include/ggml.h

Lines changed: 9 additions & 2 deletions
@@ -450,7 +450,7 @@ extern "C" {
         GGML_OP_REPEAT_BACK,
         GGML_OP_CONCAT,
         GGML_OP_SILU_BACK,
-        GGML_OP_NORM, // normalize
+        GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
         GGML_OP_RMS_NORM_BACK,
         GGML_OP_GROUP_NORM,
@@ -486,7 +486,7 @@ extern "C" {
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
         GGML_OP_ARANGE,
@@ -517,6 +517,7 @@ extern "C" {
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         GGML_OP_OPT_STEP_ADAMW,
+        GGML_OP_OPT_STEP_SGD,
 
         GGML_OP_COUNT,
     };
@@ -2063,6 +2064,12 @@ extern "C" {
             struct ggml_tensor * v,
             struct ggml_tensor * adamw_params); // parameters such a the learning rate
 
+    // SGD (with weight decay) step
+    GGML_API struct ggml_tensor * ggml_opt_step_sgd(struct ggml_context * ctx, struct ggml_tensor * a,
+                                                    struct ggml_tensor * grad,
+                                                    // parameters: alpha (learning rate), wd (weight decay)
+                                                    struct ggml_tensor * adamw_params);
+
     //
     // automatic differentiation
     //

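A small sketch (not from the commit) of how the declaration above could be wired into a graph; llama.cpp's ggml-opt layer normally does this internally, so the helper below and its arguments are purely illustrative:

#include "ggml.h"

// Sketch: append one SGD step op for a parameter tensor, following the
// ggml_opt_step_sgd declaration above. The 8-float parameter tensor matches
// the size asserted by the CPU kernel in ops.cpp further down; filling it is
// left to the caller.
static void add_sgd_step(struct ggml_context * ctx, struct ggml_cgraph * gf,
                         struct ggml_tensor * weight, struct ggml_tensor * grad) {
    struct ggml_tensor * opt_pars = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * step     = ggml_opt_step_sgd(ctx, weight, grad, opt_pars);
    ggml_build_forward_expand(gf, step);
}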
ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 6 additions & 0 deletions
@@ -2061,6 +2061,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_opt_step_adamw(params, tensor);
             }
             break;
+        case GGML_OP_OPT_STEP_SGD:
+            {
+                ggml_compute_forward_opt_step_sgd(params, tensor);
+            }
+            break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -2345,6 +2350,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
         case GGML_OP_OPT_STEP_ADAMW:
+        case GGML_OP_OPT_STEP_SGD:
             {
                 n_tasks = n_threads;
             } break;

ggml/src/ggml-cpu/ops.cpp

Lines changed: 64 additions & 4 deletions
@@ -8946,7 +8946,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
     GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
-    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
+    GGML_ASSERT(ggml_nelements(adamw_params) == 8);
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -8964,14 +8964,14 @@ static void ggml_compute_forward_opt_step_adamw_f32(
     const int ir1 = MIN(ir0 + dr, nr);
 
     const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
+
     const float alpha = adamw_params_ptr[0];
     const float beta1 = adamw_params_ptr[1];
     const float beta2 = adamw_params_ptr[2];
     const float eps = adamw_params_ptr[3];
-    const float wd = adamw_params_ptr[4];
     const float beta1h = adamw_params_ptr[5];
     const float beta2h = adamw_params_ptr[6];
-
+    const float keep = adamw_params_ptr[7];
     for (int ir = ir0; ir < ir1; ++ir) {
         const int64_t i03 = ir/(ne02*ne01);
         const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
@@ -8994,7 +8994,7 @@
                 // The weight decay is applied independently of the Adam momenta m and v.
                 // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
                 // See: https://arxiv.org/pdf/1711.05101v3.pdf
-                w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
+                w[i00] = w[i00] * keep - alpha * mh / vh;
             }
         }
     }
@@ -9016,3 +9016,63 @@
             }
     }
 }
+
+static void ggml_compute_forward_opt_step_sgd_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src0_grad = dst->src[1];
+    const ggml_tensor * adamw_params = dst->src[2];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
+    GGML_ASSERT(ggml_nelements(adamw_params) == 8);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1) / nth;
+
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // using adamw param subset we care about - alpha, wd - could have a separate struct
+    const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
+    const float alpha = adamw_params_ptr[0];
+    const float keep = adamw_params_ptr[7];
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir / (ne02 * ne01);
+        const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
+        const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
+
+        const size_t offset = i03 * nb03 + i02 * nb02 + i01 * nb01;
+
+        float * w = (float *) ((char *) src0->data + offset); // weight
+        const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad
+
+        for (int i00 = 0; i00 < ne00; ++i00) {
+            w[i00] = w[i00] * keep - alpha * g[i00];
+        }
+    }
+}
+
+void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_opt_step_sgd_f32(params, dst);
+            }
+            break;
+        default:
+            {
+                GGML_ABORT("fatal error - sgd is F32 only");
+            }
+    }
+}

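Both step kernels above consume the same 8-element f32 parameter tensor (hence the ggml_nelements(adamw_params) == 8 asserts). A sketch of that layout, inferred only from the indices read above; slot 4 and the helper name are assumptions, and the bias-corrected beta1h/beta2h terms are taken as inputs rather than derived here:

// Sketch: layout of the shared optimizer-step parameter buffer, inferred from
// the adamw_params_ptr[...] reads in the kernels above.
static void fill_opt_step_params(float p[8], float alpha, float beta1, float beta2,
                                 float eps, float wd, float beta1h, float beta2h) {
    p[0] = alpha;             // learning rate (read by AdamW and SGD)
    p[1] = beta1;             // AdamW only
    p[2] = beta2;             // AdamW only
    p[3] = eps;               // AdamW only
    p[4] = wd;                // raw weight decay; assumed to keep its old slot, not read by the kernels above
    p[5] = beta1h;            // AdamW bias-corrected term
    p[6] = beta2h;            // AdamW bias-corrected term
    p[7] = 1.0f - alpha * wd; // `keep`, the cached (1 - wd*alpha) from the commit message (read by both)
}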
ggml/src/ggml-cpu/ops.h

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-
+void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 #ifdef __cplusplus
 }
 #endif
