update to support online training

mseminatore · mseminatore · commit 663fece0e863 · 2026-02-16T13:00:05.000-08:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -126,6 +126,10 @@ add_executable(test_training_convergence test_training_convergence.c testy/test_
 target_link_libraries(test_training_convergence libann ${BLAS_LIB} ${MATH_LIB})
 add_test(NAME test_training_convergence COMMAND test_training_convergence)
 
+add_executable(test_online_training test_online_training.c testy/test_main.c)
+target_link_libraries(test_online_training libann ${BLAS_LIB} ${MATH_LIB})
+add_test(NAME test_online_training COMMAND test_online_training)
+
 add_executable(test_json test_json.c testy/test_main.c)
 target_link_libraries(test_json libann ${BLAS_LIB} ${MATH_LIB})
 add_test(NAME test_json COMMAND test_json)
diff --git a/README.md b/README.md
@@ -115,6 +115,9 @@ ann_save_network | save a trained network (text)
 ann_load_network_binary | load a previously saved network (binary)
 ann_save_network_binary | save a trained network (binary)
 ann_train_network | train a network
+ann_train_begin | begin an online/incremental training session
+ann_train_step | train one mini-batch step (online training)
+ann_train_end | end an online/incremental training session
 ann_predict | predict an output using a previously trained network
 ann_set_convergence | set the convergence threshold (optional)
 ann_evaluate_accuracy | evaluate accuracy of trained network using test data
@@ -266,6 +269,37 @@ epoch,loss,learning_rate
 
 Plot with gnuplot, Python matplotlib, or Excel to diagnose training issues.
 
+## Online / Incremental Training
+
+For scenarios where data arrives incrementally (streaming, fine-tuning a loaded model, or user feedback), use the step-based training API:
+
+```c
+PNetwork net = ann_make_network(OPT_ADAM, LOSS_MSE);
+ann_add_layer(net, 784, LAYER_INPUT, ACTIVATION_NULL);
+ann_add_layer(net, 128, LAYER_HIDDEN, ACTIVATION_SIGMOID);
+ann_add_layer(net, 10, LAYER_OUTPUT, ACTIVATION_SOFTMAX);
+
+ann_train_begin(net);
+
+// Feed mini-batches one at a time
+for (int i = 0; i < num_batches; i++)
+{
+    real loss = ann_train_step(net, batch_inputs[i], batch_targets[i], batch_size);
+    printf("Step %d loss: %f\n", i, loss);
+
+    // Safe to predict mid-training (dropout is auto-disabled)
+    ann_predict(net, test_input, prediction);
+}
+
+ann_train_end(net);
+```
+
+Key differences from `ann_train_network()`:
+- **Does not reset optimizer state** — Adam momentum/variance are preserved across calls
+- **Does not reinitialize weights** — safe for fine-tuning loaded/pre-trained models
+- **Single-sample training** — pass `batch_size=1` to train on individual examples
+- **`ann_predict()` is safe mid-training** — dropout is automatically disabled during inference
+
 # Hyperparameter Tuning
 
 The `ann_hypertune` module provides automated hyperparameter search to find
diff --git a/ann.c b/ann.c
@@ -2281,6 +2281,114 @@ real ann_train_network(PNetwork pnet, PTensor inputs, PTensor outputs, int rows)
 	return loss;
 }
 
+//-----------------------------------------------
+// Begin an online/incremental training session
+//-----------------------------------------------
+int ann_train_begin(PNetwork pnet)
+{
+	if (!pnet)
+		return ERR_NULL_PTR;
+
+	if (pnet->layer_count <= 0 || !pnet->layers)
+		return ERR_INVALID;
+
+	// Enable training mode (for dropout); keep the old flag for the failure path
+	int was_training = pnet->is_training;
+	pnet->is_training = 1;
+
+	// Save base learning rate for schedulers; 0.0 is treated as "not saved yet"
+	if (pnet->base_learning_rate == (real)0.0)
+		pnet->base_learning_rate = pnet->learning_rate;
+
+	// NOTE(review): called unconditionally; presumably init_weights() skips weights already set (e.g. loaded model) - confirm
+	init_weights(pnet);
+
+	// Ensure batch tensors are allocated for the configured batch size
+	if (ensure_batch_tensors(pnet, pnet->batchSize) != ERR_OK)
+	{
+		pnet->is_training = was_training;	// no session was started: undo the flag
+		invoke_error_callback(ERR_ALLOC, "ann_train_begin");
+		return ERR_ALLOC;
+	}
+	return ERR_OK;
+}
+
+//-----------------------------------------------
+// Train one mini-batch step (online training); returns the batch loss, 0.0 on error
+//-----------------------------------------------
+real ann_train_step(PNetwork pnet, const real *inputs, const real *targets, int batch_size)
+{
+	if (!pnet || !inputs || !targets || batch_size <= 0)
+		return (real)0.0;
+
+	// Same structural check as ann_train_begin(): layers[] is indexed below
+	if (pnet->layer_count <= 0 || !pnet->layers)
+		return (real)0.0;
+
+	int input_node_count = pnet->layers[0].node_count;
+	int output_node_count = pnet->layers[pnet->layer_count - 1].node_count;
+
+	unsigned actual_batch_size = (unsigned)batch_size;
+
+	// Reallocate batch tensors if batch size changed
+	if (pnet->current_batch_size != actual_batch_size &&
+		ensure_batch_tensors(pnet, actual_batch_size) != ERR_OK)
+	{
+		invoke_error_callback(ERR_ALLOC, "ann_train_step");
+		return (real)0.0;
+	}
+
+	// Allocate temporary batch target tensor (freed before returning)
+	PTensor batch_targets = tensor_create(actual_batch_size, output_node_count);
+	if (!batch_targets)
+	{
+		invoke_error_callback(ERR_ALLOC, "ann_train_step");
+		return (real)0.0;
+	}
+
+	// Zero gradients
+	for (int layer = 0; layer < pnet->layer_count - 1; layer++)
+	{
+		tensor_fill(pnet->layers[layer].t_gradients, (real)0.0);
+		tensor_fill(pnet->layers[layer].t_bias_grad, (real)0.0);
+	}
+
+	// Copy inputs into batch input tensor; byte counts are computed in size_t so
+	// the element count cannot wrap in 32-bit unsigned arithmetic
+	PTensor batch_input = pnet->layers[0].t_batch_values;
+	memcpy(batch_input->values, inputs, (size_t)actual_batch_size * (size_t)input_node_count * sizeof(real));
+
+	// Copy targets into batch target tensor
+	memcpy(batch_targets->values, targets, (size_t)actual_batch_size * (size_t)output_node_count * sizeof(real));
+
+	// Forward pass
+	eval_network_batched(pnet, actual_batch_size);
+
+	// Backward pass (computes loss and gradients)
+	real loss = back_propagate_batched(pnet, actual_batch_size, batch_targets);
+
+	// Increment training iteration (for Adam bias correction)
+	pnet->train_iteration++;
+
+	// Update weights
+	pnet->optimize_func(pnet);
+
+	tensor_free(batch_targets);
+
+	return loss;
+}
+
+//-----------------------------------------------
+// Finish an online/incremental training session
+//-----------------------------------------------
+void ann_train_end(PNetwork pnet)
+{
+	// Clear the training flag (used for dropout); a NULL network is ignored
+	if (pnet)
+	{
+		pnet->is_training = 0;
+	}
+}
+
 //------------------------------
 // evaluate the accuracy 
 //------------------------------
@@ -2842,9 +2950,16 @@ int ann_predict(const PNetwork pnet, const real *inputs, real *outputs)
 		pnet->layers[0].t_values->values[node] = *inputs++;
 	}
 
+	// Temporarily disable training mode for inference (prevents dropout)
+	int was_training = pnet->is_training;
+	pnet->is_training = 0;
+
 	// evaluate network
 	eval_network(pnet);
 
+	// Restore training mode
+	pnet->is_training = was_training;
+
 	// get the outputs
 	node_count = pnet->layers[pnet->layer_count - 1].node_count;
 	for (int node = 0; node < node_count; node++)
diff --git a/ann.h b/ann.h
@@ -497,11 +497,67 @@ ANN_API PNetwork ann_load_network_binary(const char *filename);
  */
 ANN_API real ann_train_network(PNetwork pnet, PTensor inputs, PTensor outputs, int rows);
 
+/**
+ * Begin an online/incremental training session.
+ * 
+ * Prepares the network for step-by-step training without resetting
+ * optimizer state (e.g. Adam momentum). Weights are initialized only 
+ * if not already set (safe for loaded/pre-trained models).
+ * 
+ * Call ann_train_step() to train on individual mini-batches, then
+ * ann_train_end() when finished.
+ * 
+ * @param pnet Network to train (must have layers defined)
+ * @return ERR_OK on success
+ * @return ERR_NULL_PTR if pnet is NULL
+ * @return ERR_INVALID if network has no layers
+ * @return ERR_ALLOC if batch tensor allocation fails
+ * 
+ * @see ann_train_step() to train on a single mini-batch
+ * @see ann_train_end() to finish the training session
+ */
+ANN_API int ann_train_begin(PNetwork pnet);
+
+/**
+ * Train on a single mini-batch (online/incremental training).
+ * 
+ * Performs one forward pass, backward pass, and weight update on the
+ * provided mini-batch. Does not reset optimizer state between calls,
+ * enabling incremental learning on streaming data.
+ * 
+ * Must be called between ann_train_begin() and ann_train_end().
+ * 
+ * @param pnet Network being trained
+ * @param inputs Input data (batch_size consecutive input vectors, each of
+ *               size = first layer node_count)
+ * @param targets Target data (batch_size consecutive target vectors, each of
+ *                size = last layer node_count)
+ * @param batch_size Number of samples in this mini-batch
+ * @return Loss for this mini-batch, or 0.0 on error (e.g. NULL argument, batch_size <= 0, allocation failure)
+ * 
+ * @see ann_train_begin() to start a training session
+ * @see ann_train_end() to finish the training session
+ */
+ANN_API real ann_train_step(PNetwork pnet, const real *inputs, const real *targets, int batch_size);
+
+/**
+ * End an online/incremental training session.
+ * 
+ * Disables training mode (stops dropout from being applied).
+ * The network is ready for inference after this call.
+ * 
+ * @param pnet Network that was being trained
+ * 
+ * @see ann_train_begin() to start a training session
+ */
+ANN_API void ann_train_end(PNetwork pnet);
+
 /**
  * Run trained network on single input to produce output.
  * 
  * Forward-propagates input through all layers and returns the output
- * layer activations. Network must be trained before calling.
+ * layer activations. Safe to call during online training (between
+ * ann_train_begin/ann_train_end) — dropout is automatically disabled.
  * 
  * @param pnet Trained network (must not be NULL)
  * @param inputs Input feature vector (size = first layer node_count)
diff --git a/test_online_training.c b/test_online_training.c