add L1/L2 regularization

mseminatore · mseminatore · commit 18ac019eca8e · 2026-02-11T17:22:44.000-08:00
diff --git a/IMPROVEMENT_ROADMAP.md b/IMPROVEMENT_ROADMAP.md
@@ -8,12 +8,6 @@ A prioritized list of remaining improvements and enhancements for the library.
   - Would significantly improve training stability for deeper networks
   - Requires forward pass normalization, learnable gamma/beta, and backward pass gradients
 
-- [ ] **L1/L2 Weight Regularization** *(~2-3 hours)*
-  - L2 (Ridge): Penalize large weights, reduces overfitting
-  - L1 (LASSO): Encourage sparse weights for feature selection
-  - Add `ann_set_weight_decay()` for L2, `ann_set_l1_regularization()` for L1
-  - Apply in all optimizers during weight update
-
 ## Low Priority
 
 - [ ] **Complete Network Serialization for Resumable Training** *(~4-6 hours)*
@@ -74,3 +68,4 @@ A prioritized list of remaining improvements and enhancements for the library.
 - Learning curve CSV export
 - Tensor optimizations (memcpy, memset, loop unrolling, cache-friendly access)
 - TPE hyperparameter optimization (`hypertune_tpe_search()`)
+- L1/L2 weight regularization (`ann_set_weight_decay()`, `ann_set_l1_regularization()`)
diff --git a/README.md b/README.md
@@ -124,6 +124,8 @@ ann_set_batch_size | set the mini-batch size
 ann_set_epoch_limit | set the maximum number of epochs
 ann_set_lr_scheduler | set learning rate scheduler callback
 ann_set_gradient_clip | set gradient clipping threshold
+ann_set_weight_decay | set L2 regularization (weight decay) coefficient
+ann_set_l1_regularization | set L1 regularization (LASSO) coefficient
 ann_set_dropout | set default dropout rate for hidden layers
 ann_set_layer_dropout | set dropout rate for a specific layer
 ann_get_layer_count | get the number of layers in the network
diff --git a/ann.c b/ann.c
@@ -1501,6 +1501,49 @@ static void clip_gradients(PNetwork pnet)
 	}
 }
 
+//--------------------------------------------------------
+// Apply L1/L2 weight regularization after an optimizer step
+// L2 (weight decay): W = W * max(0, 1 - lr * lambda)
+// L1 (LASSO): W = sign(W) * max(0, |W| - lr * lambda), a soft-threshold
+//   so small weights land on exactly zero instead of flipping sign
+// Note: Only applied to weights, not biases (standard practice)
+//--------------------------------------------------------
+static void apply_regularization(PNetwork pnet)
+{
+	real lr = pnet->learning_rate;
+	real l2 = pnet->l2_lambda;
+	real l1 = pnet->l1_lambda;
+
+	if (l2 <= (real)0.0 && l1 <= (real)0.0)
+		return;
+
+	// clamp at zero so lr * l2 > 1 cannot flip the sign of every weight
+	real decay = (real)1.0 - lr * l2;
+	if (decay < (real)0.0)
+		decay = (real)0.0;
+	real delta = lr * l1;		// per-step L1 shrinkage
+
+	for (int layer = 0; layer < pnet->layer_count - 1; layer++)
+	{
+		PTensor W = pnet->layers[layer].t_weights;
+		int size = W->rows * W->cols;
+
+		if (l2 > (real)0.0)
+			tensor_mul_scalar(W, decay);
+
+		for (int i = 0; l1 > (real)0.0 && i < size; i++)
+		{
+			real *w = &W->values[i];
+			if (*w > delta)
+				*w -= delta;
+			else if (*w < -delta)
+				*w += delta;
+			else if (*w <= delta)	// |w| <= delta: snap to zero (NaN is left as-is)
+				*w = (real)0.0;
+		}
+	}
+}
+
 //--------------------------------------------------------
 // Stochastic Gradient Descent (SGD)
 //
@@ -1522,6 +1565,8 @@ static void optimize_sgd(PNetwork pnet)
 		// bias = bias + n * bias_grad
 		tensor_axpy(pnet->learning_rate, pnet->layers[layer].t_bias_grad, pnet->layers[layer].t_bias);
 	}
+
+	apply_regularization(pnet);
 }
 
 //-----------------------------------------------
@@ -1554,6 +1599,8 @@ static void optimize_momentum(PNetwork pnet)
 		// bias = bias + n * bias_m
 		tensor_axpy(pnet->learning_rate, pnet->layers[layer].t_bias_m, pnet->layers[layer].t_bias);
 	}
+
+	apply_regularization(pnet);
 }
 
 //-----------------------------------------------
@@ -1599,6 +1646,8 @@ static void optimize_adagrad(PNetwork pnet)
 			b->values[i] += pnet->learning_rate * grad / ((real)sqrt(bv->values[i]) + epsilon);
 		}
 	}
+
+	apply_regularization(pnet);
 }
 
 //-----------------------------------------------
@@ -1646,6 +1695,8 @@ static void optimize_rmsprop(PNetwork pnet)
 			b->values[i] += pnet->learning_rate * grad / ((real)sqrt(bv->values[i]) + epsilon);
 		}
 	}
+
+	apply_regularization(pnet);
 }
 
 //-----------------------------------------------
@@ -1721,6 +1772,8 @@ static void optimize_adam(PNetwork pnet)
 			b->values[i] += pnet->learning_rate * mhat / ((real)sqrt(vhat) + epsilon);
 		}
 	}
+
+	apply_regularization(pnet);
 }
 
 //[]---------------------------------------------[]
@@ -2043,6 +2096,8 @@ PNetwork ann_make_network(Optimizer_type opt, Loss_type loss_type)
 	pnet->base_learning_rate = (real)0.0;		// set when training starts
 	pnet->default_dropout	= (real)0.0;		// dropout disabled by default
 	pnet->is_training		= 0;				// inference mode by default
+	pnet->l2_lambda			= (real)0.0;		// L2 regularization disabled
+	pnet->l1_lambda			= (real)0.0;		// L1 regularization disabled
 	
 	// Training history
 	pnet->loss_history		= NULL;
@@ -2430,6 +2485,28 @@ void ann_set_gradient_clip(PNetwork pnet, real max_grad)
 	pnet->max_gradient = max_grad;
 }
 
+//------------------------------
+// store the L2 (weight decay) coefficient
+// on the network; a NULL network is ignored
+//------------------------------
+void ann_set_weight_decay(PNetwork pnet, real lambda)
+{
+	// nothing to configure without a network
+	if (pnet)
+		pnet->l2_lambda = lambda;
+}
+
+//------------------------------
+// store the L1 (LASSO) coefficient
+// on the network; a NULL network is ignored
+//------------------------------
+void ann_set_l1_regularization(PNetwork pnet, real lambda)
+{
+	// nothing to configure without a network
+	if (pnet)
+		pnet->l1_lambda = lambda;
+}
+
 //------------------------------
 // set weight initialization strategy
 //------------------------------
diff --git a/ann.h b/ann.h
@@ -242,6 +242,9 @@ struct Network
 	real default_dropout;				// default dropout rate for hidden layers (0 = disabled)
 	int is_training;					// 1 = training mode (apply dropout), 0 = inference mode
 
+	real l2_lambda;						// L2 regularization (weight decay) coefficient (0 = disabled)
+	real l1_lambda;						// L1 regularization (LASSO) coefficient (0 = disabled)
+
 	Loss_func loss_func;				// the error function
 	Output_func print_func;				// print output function
 	Optimization_func optimize_func;	// learning rate/weight optimizer
@@ -618,6 +621,36 @@ ANN_API void ann_set_convergence(PNetwork pnet, real limit);
  */
 ANN_API void ann_set_gradient_clip(PNetwork pnet, real max_grad);
 
+/**
+ * Set L2 regularization (weight decay) coefficient.
+ * 
+ * Shrinks every weight matrix by a constant factor after each optimizer step,
+ * which for plain SGD roughly matches adding (lambda / 2) * ||W||^2 to the loss.
+ * Applied as: W = W * (1 - lr * lambda) during weight updates; biases are not decayed.
+ * 
+ * Default: 0 (disabled)
+ * Common values: 1e-4 to 1e-2
+ * 
+ * @param pnet Network to configure (a NULL network is ignored)
+ * @param lambda L2 regularization strength (values <= 0 disable it)
+ */
+ANN_API void ann_set_weight_decay(PNetwork pnet, real lambda);
+
+/**
+ * Set L1 regularization (LASSO) coefficient.
+ * 
+ * L1 regularization penalizes the absolute value of weights, encouraging sparsity.
+ * Each non-zero weight is stepped toward zero after every optimizer step; biases are untouched.
+ * Applied as: W = W - lr * lambda * sign(W) during weight updates.
+ * 
+ * Default: 0 (disabled)
+ * Common values: 1e-5 to 1e-3
+ * 
+ * @param pnet Network to configure (a NULL network is ignored)
+ * @param lambda L1 regularization strength (values <= 0 disable it)
+ */
+ANN_API void ann_set_l1_regularization(PNetwork pnet, real lambda);
+
 /**
  * Set weight initialization strategy.
  * 
diff --git a/test_optimizers.c b/test_optimizers.c
@@ -313,5 +313,61 @@ void test_main(int argc, char *argv[]) {
         ann_free_network(net_large_lr);
     }
 
+    // ========================================================================
+    // L1/L2 REGULARIZATION
+    // ========================================================================
+    SUITE("Weight Regularization");
+    COMMENT("Testing L1 and L2 regularization...");
+
+    // L2 regularization (weight decay)
+    PNetwork net_l2 = create_xor_network(OPT_ADAM);
+    if (net_l2) {
+        ann_set_weight_decay(net_l2, 0.001f);
+        TESTEX("L2 lambda set correctly", (fabs(net_l2->l2_lambda - 0.001f) < 1e-6));
+
+        real loss_l2 = train_xor(net_l2);
+        TESTEX("Training with L2 regularization completed", (isfinite(loss_l2)));
+        TESTEX("L2 regularization still allows learning (<0.3)", (loss_l2 < 0.3f));
+
+        // Verify weights are smaller with regularization (spot check)
+        real max_weight = 0.0f;
+        PTensor w = net_l2->layers[1].t_weights;
+        for (int i = 0; i < w->rows * w->cols; i++) {
+            if (fabs(w->values[i]) > max_weight)
+                max_weight = (real)fabs(w->values[i]);
+        }
+        TESTEX("L2 regularization limits weight magnitude (<10)", (max_weight < 10.0f));
+
+        ann_free_network(net_l2);
+    }
+
+    // L1 regularization (LASSO)
+    PNetwork net_l1 = create_xor_network(OPT_ADAM);
+    if (net_l1) {
+        ann_set_l1_regularization(net_l1, 0.0001f);
+        TESTEX("L1 lambda set correctly", (fabs(net_l1->l1_lambda - 0.0001f) < 1e-7));
+
+        real loss_l1 = train_xor(net_l1);
+        TESTEX("Training with L1 regularization completed", (isfinite(loss_l1)));
+        TESTEX("L1 regularization still allows learning (<0.3)", (loss_l1 < 0.3f));
+
+        ann_free_network(net_l1);
+    }
+
+    // Combined L1 + L2 regularization
+    PNetwork net_elastic = create_xor_network(OPT_ADAM);
+    if (net_elastic) {
+        ann_set_weight_decay(net_elastic, 0.001f);
+        ann_set_l1_regularization(net_elastic, 0.0001f);
+        TESTEX("Combined L1+L2 set correctly", 
+               (fabs(net_elastic->l2_lambda - 0.001f) < 1e-6 && 
+                fabs(net_elastic->l1_lambda - 0.0001f) < 1e-7));
+
+        real loss_elastic = train_xor(net_elastic);
+        TESTEX("Training with elastic net regularization completed", (isfinite(loss_elastic)));
+
+        ann_free_network(net_elastic);
+    }
+
     TESTEX("Optimizer tests completed", 1);
 }