Skip to content

Commit 18ac019

Browse files
committed
add L1/L2 regularization
1 parent daab392 commit 18ac019

File tree

5 files changed

+169
-6
lines changed

5 files changed

+169
-6
lines changed

IMPROVEMENT_ROADMAP.md

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,6 @@ A prioritized list of remaining improvements and enhancements for the library.
88
- Would significantly improve training stability for deeper networks
99
- Requires forward pass normalization, learnable gamma/beta, and backward pass gradients
1010

11-
- [ ] **L1/L2 Weight Regularization** *(~2-3 hours)*
12-
- L2 (Ridge): Penalize large weights, reduces overfitting
13-
- L1 (LASSO): Encourage sparse weights for feature selection
14-
- Add `ann_set_weight_decay()` for L2, `ann_set_l1_regularization()` for L1
15-
- Apply in all optimizers during weight update
16-
1711
## Low Priority
1812

1913
- [ ] **Complete Network Serialization for Resumable Training** *(~4-6 hours)*
@@ -74,3 +68,4 @@ A prioritized list of remaining improvements and enhancements for the library.
7468
- Learning curve CSV export
7569
- Tensor optimizations (memcpy, memset, loop unrolling, cache-friendly access)
7670
- TPE hyperparameter optimization (`hypertune_tpe_search()`)
71+
- L1/L2 weight regularization (`ann_set_weight_decay()`, `ann_set_l1_regularization()`)

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ ann_set_batch_size | set the mini-batch size
124124
ann_set_epoch_limit | set the maximum number of epochs
125125
ann_set_lr_scheduler | set learning rate scheduler callback
126126
ann_set_gradient_clip | set gradient clipping threshold
127+
ann_set_weight_decay | set L2 regularization (weight decay) coefficient
128+
ann_set_l1_regularization | set L1 regularization (LASSO) coefficient
127129
ann_set_dropout | set default dropout rate for hidden layers
128130
ann_set_layer_dropout | set dropout rate for a specific layer
129131
ann_get_layer_count | get the number of layers in the network

ann.c

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,6 +1501,49 @@ static void clip_gradients(PNetwork pnet)
15011501
}
15021502
}
15031503

1504+
//--------------------------------------------------------
1505+
// Apply L1/L2 weight regularization
1506+
// L2 (weight decay): W = W * (1 - lr * lambda)
1507+
// L1 (LASSO): W = W - lr * lambda * sign(W)
1508+
// Note: Only applied to weights, not biases (standard practice)
1509+
//--------------------------------------------------------
1510+
static void apply_regularization(PNetwork pnet)
1511+
{
1512+
real lr = pnet->learning_rate;
1513+
real l2 = pnet->l2_lambda;
1514+
real l1 = pnet->l1_lambda;
1515+
1516+
if (l2 <= (real)0.0 && l1 <= (real)0.0)
1517+
return;
1518+
1519+
for (int layer = 0; layer < pnet->layer_count - 1; layer++)
1520+
{
1521+
PTensor W = pnet->layers[layer].t_weights;
1522+
int size = W->rows * W->cols;
1523+
1524+
// L2 regularization: W = W * (1 - lr * l2)
1525+
if (l2 > (real)0.0)
1526+
{
1527+
real decay = (real)1.0 - lr * l2;
1528+
tensor_mul_scalar(W, decay);
1529+
}
1530+
1531+
// L1 regularization: W = W - lr * l1 * sign(W)
1532+
if (l1 > (real)0.0)
1533+
{
1534+
real *w = W->values;
1535+
real delta = lr * l1;
1536+
for (int i = 0; i < size; i++)
1537+
{
1538+
if (w[i] > (real)0.0)
1539+
w[i] -= delta;
1540+
else if (w[i] < (real)0.0)
1541+
w[i] += delta;
1542+
}
1543+
}
1544+
}
1545+
}
1546+
15041547
//--------------------------------------------------------
15051548
// Stochastic Gradient Descent (SGD)
15061549
//
@@ -1522,6 +1565,8 @@ static void optimize_sgd(PNetwork pnet)
15221565
// bias = bias + n * bias_grad
15231566
tensor_axpy(pnet->learning_rate, pnet->layers[layer].t_bias_grad, pnet->layers[layer].t_bias);
15241567
}
1568+
1569+
apply_regularization(pnet);
15251570
}
15261571

15271572
//-----------------------------------------------
@@ -1554,6 +1599,8 @@ static void optimize_momentum(PNetwork pnet)
15541599
// bias = bias + n * bias_m
15551600
tensor_axpy(pnet->learning_rate, pnet->layers[layer].t_bias_m, pnet->layers[layer].t_bias);
15561601
}
1602+
1603+
apply_regularization(pnet);
15571604
}
15581605

15591606
//-----------------------------------------------
@@ -1599,6 +1646,8 @@ static void optimize_adagrad(PNetwork pnet)
15991646
b->values[i] += pnet->learning_rate * grad / ((real)sqrt(bv->values[i]) + epsilon);
16001647
}
16011648
}
1649+
1650+
apply_regularization(pnet);
16021651
}
16031652

16041653
//-----------------------------------------------
@@ -1646,6 +1695,8 @@ static void optimize_rmsprop(PNetwork pnet)
16461695
b->values[i] += pnet->learning_rate * grad / ((real)sqrt(bv->values[i]) + epsilon);
16471696
}
16481697
}
1698+
1699+
apply_regularization(pnet);
16491700
}
16501701

16511702
//-----------------------------------------------
@@ -1721,6 +1772,8 @@ static void optimize_adam(PNetwork pnet)
17211772
b->values[i] += pnet->learning_rate * mhat / ((real)sqrt(vhat) + epsilon);
17221773
}
17231774
}
1775+
1776+
apply_regularization(pnet);
17241777
}
17251778

17261779
//[]---------------------------------------------[]
@@ -2043,6 +2096,8 @@ PNetwork ann_make_network(Optimizer_type opt, Loss_type loss_type)
20432096
pnet->base_learning_rate = (real)0.0; // set when training starts
20442097
pnet->default_dropout = (real)0.0; // dropout disabled by default
20452098
pnet->is_training = 0; // inference mode by default
2099+
pnet->l2_lambda = (real)0.0; // L2 regularization disabled
2100+
pnet->l1_lambda = (real)0.0; // L1 regularization disabled
20462101

20472102
// Training history
20482103
pnet->loss_history = NULL;
@@ -2430,6 +2485,28 @@ void ann_set_gradient_clip(PNetwork pnet, real max_grad)
24302485
pnet->max_gradient = max_grad;
24312486
}
24322487

2488+
//------------------------------
2489+
// set L2 regularization (weight decay)
2490+
//------------------------------
2491+
void ann_set_weight_decay(PNetwork pnet, real lambda)
2492+
{
2493+
if (!pnet)
2494+
return;
2495+
2496+
pnet->l2_lambda = lambda;
2497+
}
2498+
2499+
//------------------------------
2500+
// set L1 regularization (LASSO)
2501+
//------------------------------
2502+
void ann_set_l1_regularization(PNetwork pnet, real lambda)
2503+
{
2504+
if (!pnet)
2505+
return;
2506+
2507+
pnet->l1_lambda = lambda;
2508+
}
2509+
24332510
//------------------------------
24342511
// set weight initialization strategy
24352512
//------------------------------

ann.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,9 @@ struct Network
242242
real default_dropout; // default dropout rate for hidden layers (0 = disabled)
243243
int is_training; // 1 = training mode (apply dropout), 0 = inference mode
244244

245+
real l2_lambda; // L2 regularization (weight decay) coefficient (0 = disabled)
246+
real l1_lambda; // L1 regularization (LASSO) coefficient (0 = disabled)
247+
245248
Loss_func loss_func; // the error function
246249
Output_func print_func; // print output function
247250
Optimization_func optimize_func; // learning rate/weight optimizer
@@ -618,6 +621,36 @@ ANN_API void ann_set_convergence(PNetwork pnet, real limit);
618621
*/
619622
ANN_API void ann_set_gradient_clip(PNetwork pnet, real max_grad);
620623

624+
/**
625+
* Set L2 regularization (weight decay) coefficient.
626+
*
627+
* L2 regularization penalizes large weights; the decay step below corresponds to
* adding (lambda / 2) * ||W||^2 to the loss under plain SGD.
628+
* This encourages smaller, more distributed weights and helps prevent overfitting.
629+
* Applied as: W = W * (1 - lr * lambda) during weight updates.
630+
*
631+
* Default: 0 (disabled)
632+
* Common values: 1e-4 to 1e-2
633+
*
634+
* @param pnet Network to configure
635+
* @param lambda L2 regularization strength (0 = disabled)
636+
*/
637+
ANN_API void ann_set_weight_decay(PNetwork pnet, real lambda);
638+
639+
/**
640+
* Set L1 regularization (LASSO) coefficient.
641+
*
642+
* L1 regularization penalizes the absolute value of weights, encouraging sparsity.
643+
* Pushes small weights toward exactly zero, useful for feature selection.
644+
* Applied as: W = W - lr * lambda * sign(W) during weight updates.
645+
*
646+
* Default: 0 (disabled)
647+
* Common values: 1e-5 to 1e-3
648+
*
649+
* @param pnet Network to configure
650+
* @param lambda L1 regularization strength (0 = disabled)
651+
*/
652+
ANN_API void ann_set_l1_regularization(PNetwork pnet, real lambda);
653+
621654
/**
622655
* Set weight initialization strategy.
623656
*

test_optimizers.c

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,5 +313,61 @@ void test_main(int argc, char *argv[]) {
313313
ann_free_network(net_large_lr);
314314
}
315315

316+
// ========================================================================
317+
// L1/L2 REGULARIZATION
318+
// ========================================================================
319+
SUITE("Weight Regularization");
320+
COMMENT("Testing L1 and L2 regularization...");
321+
322+
// L2 regularization (weight decay)
323+
PNetwork net_l2 = create_xor_network(OPT_ADAM);
324+
if (net_l2) {
325+
ann_set_weight_decay(net_l2, 0.001f);
326+
TESTEX("L2 lambda set correctly", (fabs(net_l2->l2_lambda - 0.001f) < 1e-6));
327+
328+
real loss_l2 = train_xor(net_l2);
329+
TESTEX("Training with L2 regularization completed", (isfinite(loss_l2)));
330+
TESTEX("L2 regularization still allows learning (<0.3)", (loss_l2 < 0.3f));
331+
332+
// Verify weights are smaller with regularization (spot check)
333+
real max_weight = 0.0f;
334+
PTensor w = net_l2->layers[1].t_weights;
335+
for (int i = 0; i < w->rows * w->cols; i++) {
336+
if (fabs(w->values[i]) > max_weight)
337+
max_weight = (real)fabs(w->values[i]);
338+
}
339+
TESTEX("L2 regularization limits weight magnitude (<10)", (max_weight < 10.0f));
340+
341+
ann_free_network(net_l2);
342+
}
343+
344+
// L1 regularization (LASSO)
345+
PNetwork net_l1 = create_xor_network(OPT_ADAM);
346+
if (net_l1) {
347+
ann_set_l1_regularization(net_l1, 0.0001f);
348+
TESTEX("L1 lambda set correctly", (fabs(net_l1->l1_lambda - 0.0001f) < 1e-7));
349+
350+
real loss_l1 = train_xor(net_l1);
351+
TESTEX("Training with L1 regularization completed", (isfinite(loss_l1)));
352+
TESTEX("L1 regularization still allows learning (<0.3)", (loss_l1 < 0.3f));
353+
354+
ann_free_network(net_l1);
355+
}
356+
357+
// Combined L1 + L2 regularization
358+
PNetwork net_elastic = create_xor_network(OPT_ADAM);
359+
if (net_elastic) {
360+
ann_set_weight_decay(net_elastic, 0.001f);
361+
ann_set_l1_regularization(net_elastic, 0.0001f);
362+
TESTEX("Combined L1+L2 set correctly",
363+
(fabs(net_elastic->l2_lambda - 0.001f) < 1e-6 &&
364+
fabs(net_elastic->l1_lambda - 0.0001f) < 1e-7));
365+
366+
real loss_elastic = train_xor(net_elastic);
367+
TESTEX("Training with elastic net regularization completed", (isfinite(loss_elastic)));
368+
369+
ann_free_network(net_elastic);
370+
}
371+
316372
TESTEX("Optimizer tests completed", 1);
317373
}

0 commit comments

Comments
 (0)