ooples
diff --git a/‎docs/DistributedTrainingImplementations.md‎
Lines changed: 513 additions & 0 deletions b/‎docs/DistributedTrainingImplementations.md‎
Lines changed: 513 additions & 0 deletions
diff --git a/‎scripts/launch-distributed-training.ps1‎
Lines changed: 170 additions & 0 deletions b/‎scripts/launch-distributed-training.ps1‎
Lines changed: 170 additions & 0 deletions
diff --git a/‎scripts/launch-distributed-training.sh‎
Lines changed: 155 additions & 0 deletions b/‎scripts/launch-distributed-training.sh‎
Lines changed: 155 additions & 0 deletions
diff --git a/‎src/AutoML/AutoMLModelBase.cs‎
Lines changed: 36 additions & 0 deletions b/‎src/AutoML/AutoMLModelBase.cs‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎src/AutoML/NeuralArchitectureSearch.cs‎
Lines changed: 2 additions & 1 deletion b/‎src/AutoML/NeuralArchitectureSearch.cs‎
Lines changed: 2 additions & 1 deletion
@@ -0,0 +1,170 @@
+################################################################################
+# AiDotNet Distributed Training Launcher (PowerShell)
+#
+# This script launches distributed training using MPI across multiple processes.
+#
+# For Beginners:
+# MPI (Message Passing Interface) is a standard for running programs across
+# multiple computers or processors. Think of it like a coordinator that starts
+# your program on multiple machines at once and helps them communicate.
+#
+# Usage:
+#   .\launch-distributed-training.ps1 -NumProcesses <num> -Program <path> [additional args...]
+#
+# Examples:
+#   # Run on 4 GPUs locally
+#   .\launch-distributed-training.ps1 -NumProcesses 4 -Program ".\MyTrainingApp.exe"
+#
+#   # Run on 8 GPUs with additional arguments (note: args after Program are passed to the app)
+#   .\launch-distributed-training.ps1 -NumProcesses 8 -Program ".\MyTrainingApp.exe" -- --epochs 100 --lr 0.001
+#
+#   # Run with config file containing spaces in path (use -- separator)
+#   .\launch-distributed-training.ps1 -NumProcesses 8 -Program ".\MyTrainingApp.exe" -- --config "My Config.json"
+#
+#   # Run across 2 machines with 4 GPUs each
+#   .\launch-distributed-training.ps1 -NumProcesses 8 -Program ".\MyTrainingApp.exe" -Hosts "machine1,machine2"
+################################################################################
+
+param(
+    [Parameter(Mandatory=$true, HelpMessage="Number of processes to spawn (typically equals number of GPUs)")]
+    [int]$NumProcesses,
+
+    [Parameter(Mandatory=$true, HelpMessage="Path to your training program executable")]
+    [string]$Program,
+
+    [Parameter(Mandatory=$false, HelpMessage="Comma-separated list of host machines")]
+    [string]$Hosts = "",
+
+    [Parameter(
+        Mandatory = $false,
+        HelpMessage = "Additional arguments to pass to your program",
+        ValueFromRemainingArguments = $true)]
+    [string[]]$ProgramArgs = @()
+)
+
+# Display header
+Write-Host "======================================" -ForegroundColor Cyan
+Write-Host "AiDotNet Distributed Training Launcher" -ForegroundColor Cyan
+Write-Host "======================================" -ForegroundColor Cyan
+Write-Host ""
+
+# Display configuration
+Write-Host "Configuration:" -ForegroundColor Yellow
+Write-Host "  Number of processes: $NumProcesses"
+Write-Host "  Program: $Program"
+if ($ProgramArgs.Count -gt 0) {
+    Write-Host "  Program arguments: $($ProgramArgs -join ' ')"
+}
+if ($Hosts) {
+    Write-Host "  Hosts: $Hosts"
+}
+Write-Host ""
+
+# Check if mpiexec is available
+$mpiexec = Get-Command mpiexec -ErrorAction SilentlyContinue
+
+if (-not $mpiexec) {
+    Write-Host "Error: mpiexec not found in PATH" -ForegroundColor Red
+    Write-Host ""
+    Write-Host "For Beginners:" -ForegroundColor Yellow
+    Write-Host "  You need to install Microsoft MPI to run distributed training on Windows."
+    Write-Host "  Download from: https://docs.microsoft.com/en-us/message-passing-interface/microsoft-mpi"
+    Write-Host ""
+    Write-Host "  Installation steps:"
+    Write-Host "    1. Download MS-MPI installer"
+    Write-Host "    2. Install both the runtime (msmpisetup.exe) and SDK (msmpisdk.msi)"
+    Write-Host "    3. Restart your terminal/PowerShell"
+    exit 1
+}
+
+Write-Host "Using MPI command: $($mpiexec.Source)" -ForegroundColor Green
+Write-Host ""
+
+# Check if program exists
+if (-not (Test-Path $Program)) {
+    Write-Host "Error: Program '$Program' not found" -ForegroundColor Red
+    Write-Host ""
+    Write-Host "For Beginners:" -ForegroundColor Yellow
+    Write-Host "  Make sure you've built your training program and the path is correct."
+    Write-Host "  Example: dotnet publish -c Release -o .\publish"
+    Write-Host "  Then use: -Program '.\publish\MyTrainingApp.exe'"
+    exit 1
+}
+
+# Security: Validate that Program is an executable file
+$ProgramItem = Get-Item -Path $Program -ErrorAction Stop
+$allowedExtensions = @('.exe', '.dll')
+if ($ProgramItem.Extension -notin $allowedExtensions) {
+    Write-Host "Error: Program must be an executable (.exe) or .NET assembly (.dll)" -ForegroundColor Red
+    Write-Host "  Received: $($ProgramItem.Extension)" -ForegroundColor Red
+    Write-Host ""
+    Write-Host "Security Note:" -ForegroundColor Yellow
+    Write-Host "  Only executable files (.exe) and .NET assemblies (.dll) are allowed"
+    Write-Host "  to prevent execution of potentially malicious scripts or documents."
+    exit 1
+}
+
+# Security: Resolve to absolute path to prevent path traversal attacks
+$Program = $ProgramItem.FullName
+Write-Host "Resolved program path: $Program" -ForegroundColor Green
+Write-Host ""
+
+# Build mpiexec command
+$mpiCommand = "mpiexec"
+$mpiArgsList = @(
+    "-n", $NumProcesses.ToString()
+)
+
+# Add hosts if specified
+if ($Hosts) {
+    $mpiArgsList += @("-hosts", $Hosts)
+}
+
+# Add the program
+$mpiArgsList += $Program
+
+# Add program arguments if specified
+if ($ProgramArgs.Count -gt 0) {
+    $mpiArgsList += $ProgramArgs
+}
+
+# Display command
+Write-Host "Launching distributed training..." -ForegroundColor Yellow
+Write-Host "Command: $mpiCommand $($mpiArgsList -join ' ')" -ForegroundColor Gray
+Write-Host ""
+Write-Host "======================================" -ForegroundColor Cyan
+Write-Host ""
+
+# Launch distributed training
+try {
+    # Use Start-Process to capture output and wait for completion
+    $process = Start-Process -FilePath $mpiCommand -ArgumentList $mpiArgsList -NoNewWindow -Wait -PassThru
+    $exitCode = $process.ExitCode
+}
+catch {
+    Write-Host ""
+    Write-Host "======================================" -ForegroundColor Cyan
+    Write-Host "Error launching training: $_" -ForegroundColor Red
+    Write-Host "======================================" -ForegroundColor Cyan
+    exit 1
+}
+
+# Display results
+Write-Host ""
+Write-Host "======================================" -ForegroundColor Cyan
+if ($exitCode -eq 0) {
+    Write-Host "Training completed successfully!" -ForegroundColor Green
+}
+else {
+    Write-Host "Training failed with exit code: $exitCode" -ForegroundColor Red
+    Write-Host ""
+    Write-Host "Common issues:" -ForegroundColor Yellow
+    Write-Host "  - Make sure all nodes can communicate (check firewalls)"
+    Write-Host "  - Verify MS-MPI is installed on all machines"
+    Write-Host "  - Check that the program path is correct on all machines"
+    Write-Host "  - Ensure sufficient GPU memory is available"
+    Write-Host "  - Try running with fewer processes to check for memory issues"
+}
+Write-Host "======================================" -ForegroundColor Cyan
+
+exit $exitCode
@@ -0,0 +1,155 @@
+#!/bin/bash
+
+################################################################################
+# AiDotNet Distributed Training Launcher (Bash)
+#
+# This script launches distributed training using MPI across multiple processes.
+#
+# For Beginners:
+# MPI (Message Passing Interface) is a standard for running programs across
+# multiple computers or processors. Think of it like a coordinator that starts
+# your program on multiple machines at once and helps them communicate.
+#
+# Usage:
+#   ./launch-distributed-training.sh <num_processes> <program> [args...]
+#
+# Examples:
+#   # Run on 4 GPUs locally
+#   ./launch-distributed-training.sh 4 ./MyTrainingApp
+#
+#   # Run on 8 GPUs with additional arguments
+#   ./launch-distributed-training.sh 8 ./MyTrainingApp --epochs 100 --lr 0.001
+#
+#   # Run across 2 machines with 4 GPUs each
+#   ./launch-distributed-training.sh 8 ./MyTrainingApp --hosts machine1,machine2
+################################################################################
+
+# Check if enough arguments provided
+if [ "$#" -lt 2 ]; then
+    echo "Error: Insufficient arguments"
+    echo ""
+    echo "Usage: $0 <num_processes> <program> [args...]"
+    echo ""
+    echo "Arguments:"
+    echo "  num_processes  - Number of processes to spawn (typically equals number of GPUs)"
+    echo "  program        - Path to your training program executable"
+    echo "  args           - Any additional arguments to pass to your program"
+    echo ""
+    echo "Examples:"
+    echo "  $0 4 ./MyTrainingApp"
+    echo "  $0 8 ./MyTrainingApp --epochs 100"
+    exit 1
+fi
+
+# Parse arguments
+NUM_PROCESSES=$1
+PROGRAM=$2
+shift 2
+PROGRAM_ARGS=("$@")
+
+echo "======================================"
+echo "AiDotNet Distributed Training Launcher"
+echo "======================================"
+echo ""
+echo "Configuration:"
+echo "  Number of processes: $NUM_PROCESSES"
+echo "  Program: $PROGRAM"
+if [ "${#PROGRAM_ARGS[@]}" -gt 0 ]; then
+    echo "  Program arguments: ${PROGRAM_ARGS[*]}"
+else
+    echo "  Program arguments: (none)"
+fi
+echo ""
+
+# Check if mpiexec/mpirun is available
+if command -v mpiexec &> /dev/null; then
+    MPI_CMD="mpiexec"
+elif command -v mpirun &> /dev/null; then
+    MPI_CMD="mpirun"
+else
+    echo "Error: Neither mpiexec nor mpirun found in PATH"
+    echo ""
+    echo "For Beginners:"
+    echo "  You need to install MPI to run distributed training."
+    echo "  On Ubuntu/Debian: sudo apt-get install mpich"
+    echo "  On macOS: brew install mpich"
+    echo "  On Windows: Install Microsoft MPI from https://docs.microsoft.com/en-us/message-passing-interface/microsoft-mpi"
+    exit 1
+fi
+
+echo "Using MPI command: $MPI_CMD"
+echo ""
+
+# Check if program exists
+if [ ! -f "$PROGRAM" ]; then
+    echo "Error: Program '$PROGRAM' not found"
+    echo ""
+    echo "For Beginners:"
+    echo "  Make sure you've built your training program and the path is correct."
+    echo "  Example: dotnet publish -c Release -o ./publish"
+    echo "  Then use: $0 4 ./publish/MyTrainingApp"
+    exit 1
+fi
+
+# Check if program is executable
+if [ ! -x "$PROGRAM" ]; then
+    # Validate the file exists and is a regular file
+    if [ ! -f "$PROGRAM" ]; then
+        echo "Error: Program file does not exist or is not a regular file: $PROGRAM"
+        exit 1
+    fi
+
+    # Validate we can modify the file
+    if [ ! -w "$PROGRAM" ]; then
+        echo "Error: No write permission to make program executable: $PROGRAM"
+        echo "Run: chmod +x \"$PROGRAM\" manually with appropriate permissions"
+        exit 1
+    fi
+
+    echo "Warning: Program '$PROGRAM' is not executable."
+    read -p "Make it executable? (y/N): " -n 1 -r
+    echo
+    if [[ $REPLY =~ ^[Yy]$ ]]; then
+        chmod +x "$PROGRAM"
+        if [ $? -ne 0 ]; then
+            echo "Error: Failed to make program executable"
+            exit 1
+        fi
+        echo "Made executable."
+    else
+        echo "Error: Program must be executable to run."
+        exit 1
+    fi
+fi
+
+# Launch distributed training
+echo "Launching distributed training..."
+echo "Command: $MPI_CMD -n $NUM_PROCESSES $PROGRAM ${PROGRAM_ARGS[*]}"
+echo ""
+echo "======================================"
+echo ""
+
+# Execute MPI command
+# -n: Number of processes
+# The program and its arguments follow
+"$MPI_CMD" -n "$NUM_PROCESSES" "$PROGRAM" "${PROGRAM_ARGS[@]}"
+
+# Capture exit code
+EXIT_CODE=$?
+
+echo ""
+echo "======================================"
+if [ $EXIT_CODE -eq 0 ]; then
+    echo "Training completed successfully!"
+else
+    echo "Training failed with exit code: $EXIT_CODE"
+    echo ""
+    echo "Common issues:"
+    echo "  - Make sure all nodes can communicate (check firewalls)"
+    echo "  - Verify MPI is installed on all machines"
+    echo "  - Check that the program path is correct on all machines"
+    echo "  - Ensure sufficient GPU memory is available"
+fi
+echo "======================================"
+
+exit $EXIT_CODE
@@ -737,6 +737,42 @@ public virtual void SetModelsToTry(List<ModelType> modelTypes)
             SetCandidateModels(modelTypes);
         }
 
+        /// <summary>
+        /// Gets the default loss function for gradient computation.
+        /// </summary>
+        /// <remarks>
+        /// AutoML delegates to the best model found during search. If no best model exists yet,
+        /// returns Mean Squared Error as a sensible default.
+        /// </remarks>
+        public virtual ILossFunction<T> DefaultLossFunction =>
+            BestModel is not null && BestModel != null
+                ? BestModel.DefaultLossFunction
+                : new MeanSquaredErrorLoss<T>();
+
+        /// <summary>
+        /// Computes gradients by delegating to the best model.
+        /// </summary>
+        public virtual Vector<T> ComputeGradients(TInput input, TOutput target, ILossFunction<T>? lossFunction = null)
+        {
+            if (BestModel is null || BestModel == null)
+                throw new InvalidOperationException(
+                    "Cannot compute gradients before AutoML search has found a best model. Call Search() first.");
+
+            return BestModel.ComputeGradients(input, target, lossFunction);
+        }
+
+        /// <summary>
+        /// Applies gradients by delegating to the best model.
+        /// </summary>
+        public virtual void ApplyGradients(Vector<T> gradients, T learningRate)
+        {
+            if (BestModel is null || BestModel == null)
+                throw new InvalidOperationException(
+                    "Cannot apply gradients before AutoML search has found a best model. Call Search() first.");
+
+            BestModel.ApplyGradients(gradients, learningRate);
+        }
+
         #endregion
     }
 }
@@ -3,6 +3,7 @@
 using AiDotNet.Interfaces;
 using AiDotNet.LinearAlgebra;
 using AiDotNet.Models;
+using AiDotNet.NeuralNetworks;
 using AiDotNet.NumericOperations;
 using AiDotNet.Optimizers;
 using System;
@@ -155,7 +156,7 @@ private Architecture<T> RunGradientBasedSearch(
                 }
 
                 // Phase 2: Update network weights on training set
-                supernet.BackwardWeights(trainData, trainLabels);
+                supernet.BackwardWeights(trainData, trainLabels, supernet.DefaultLossFunction);
                 var weightParams = supernet.GetWeightParameters();
                 var weightGrads = supernet.GetWeightGradients();