ooples
diff --git a/‎src/AiDotNet.Serving/Batching/AdaptiveBatchingStrategy.cs‎
Lines changed: 98 additions & 0 deletions b/‎src/AiDotNet.Serving/Batching/AdaptiveBatchingStrategy.cs‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎src/AiDotNet.Serving/Batching/BucketBatchingStrategy.cs‎
Lines changed: 70 additions & 0 deletions b/‎src/AiDotNet.Serving/Batching/BucketBatchingStrategy.cs‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎src/AiDotNet.Serving/Batching/IBatchingStrategy.cs‎
Lines changed: 37 additions & 0 deletions b/‎src/AiDotNet.Serving/Batching/IBatchingStrategy.cs‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎src/AiDotNet.Serving/Batching/SizeBatchingStrategy.cs‎
Lines changed: 39 additions & 0 deletions b/‎src/AiDotNet.Serving/Batching/SizeBatchingStrategy.cs‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎src/AiDotNet.Serving/Batching/TimeoutBatchingStrategy.cs‎
Lines changed: 38 additions & 0 deletions b/‎src/AiDotNet.Serving/Batching/TimeoutBatchingStrategy.cs‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎src/AiDotNet.Serving/Configuration/ServingOptions.cs‎
Lines changed: 74 additions & 0 deletions b/‎src/AiDotNet.Serving/Configuration/ServingOptions.cs‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎src/AiDotNet.Serving/Controllers/InferenceController.cs‎
Lines changed: 14 additions & 0 deletions b/‎src/AiDotNet.Serving/Controllers/InferenceController.cs‎
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,98 @@
+namespace AiDotNet.Serving.Batching;
+
+/// <summary>
+/// Adaptive batching strategy that dynamically adjusts batch size based on latency and throughput.
+/// This strategy aims to maximize throughput while maintaining latency SLAs.
+/// </summary>
+public class AdaptiveBatchingStrategy : IBatchingStrategy
+{
+    private readonly int _minBatchSize;
+    private readonly int _maxBatchSize;
+    private readonly int _maxWaitMs;
+    private readonly double _targetLatencyMs;
+    private readonly double _latencyToleranceFactor;
+
+    private int _currentOptimalBatchSize;
+    private double _recentAverageLatency;
+    private readonly object _lock = new();
+
+    // Constants for batch size adaptation
+    private const double SmoothingFactor = 0.3;
+    private const int BatchSizeAdjustmentStep = 5;
+
+    /// <summary>
+    /// Initializes a new instance of the AdaptiveBatchingStrategy.
+    /// </summary>
+    /// <param name="minBatchSize">Minimum batch size</param>
+    /// <param name="maxBatchSize">Maximum batch size</param>
+    /// <param name="maxWaitMs">Maximum wait time before processing</param>
+    /// <param name="targetLatencyMs">Target latency in milliseconds</param>
+    /// <param name="latencyToleranceFactor">Tolerance factor for latency (e.g., 2.0 means p99 should be less than 2x p50)</param>
+    public AdaptiveBatchingStrategy(
+        int minBatchSize,
+        int maxBatchSize,
+        int maxWaitMs,
+        double targetLatencyMs,
+        double latencyToleranceFactor = 2.0)
+    {
+        _minBatchSize = minBatchSize;
+        _maxBatchSize = maxBatchSize;
+        _maxWaitMs = maxWaitMs;
+        _targetLatencyMs = targetLatencyMs;
+        _latencyToleranceFactor = latencyToleranceFactor;
+        _currentOptimalBatchSize = minBatchSize;
+        _recentAverageLatency = targetLatencyMs;
+    }
+
+    public string Name => "Adaptive";
+
+    public bool ShouldProcessBatch(int queuedRequests, double timeInQueueMs, double averageLatencyMs, int queueDepth)
+    {
+        if (queuedRequests == 0)
+            return false;
+
+        // Process if we have enough requests for optimal batch size
+        if (queuedRequests >= _currentOptimalBatchSize)
+            return true;
+
+        // Process if we're approaching max wait time
+        if (timeInQueueMs >= _maxWaitMs)
+            return true;
+
+        // Process if queue is building up (backpressure detection)
+        if (queueDepth > _currentOptimalBatchSize * 2)
+            return true;
+
+        return false;
+    }
+
+    public int GetOptimalBatchSize(int queuedRequests, double averageLatencyMs)
+    {
+        lock (_lock)
+        {
+            return Math.Min(Math.Min(queuedRequests, _currentOptimalBatchSize), _maxBatchSize);
+        }
+    }
+
+    public void UpdatePerformanceFeedback(int batchSize, double latencyMs)
+    {
+        lock (_lock)
+        {
+            // Exponential moving average of latency
+            _recentAverageLatency = SmoothingFactor * latencyMs + (1 - SmoothingFactor) * _recentAverageLatency;
+
+            // Adapt batch size based on latency
+            if (_recentAverageLatency < _targetLatencyMs)
+            {
+                // Latency is good, try increasing batch size
+                _currentOptimalBatchSize = Math.Min(_currentOptimalBatchSize + BatchSizeAdjustmentStep, _maxBatchSize);
+            }
+            else if (_recentAverageLatency > _targetLatencyMs * _latencyToleranceFactor)
+            {
+                // Latency is too high, decrease batch size
+                _currentOptimalBatchSize = Math.Max(_currentOptimalBatchSize - BatchSizeAdjustmentStep, _minBatchSize);
+            }
+            // Otherwise, keep current batch size
+        }
+    }
+}
@@ -0,0 +1,70 @@
+namespace AiDotNet.Serving.Batching;
+
+/// <summary>
+/// Bucket-based batching strategy that groups requests by input size into buckets.
+/// This minimizes padding overhead for variable-length sequences.
+/// </summary>
+public class BucketBatchingStrategy : IBatchingStrategy
+{
+    private readonly int[] _bucketBoundaries;
+    private readonly int _maxBatchSize;
+    private readonly int _maxWaitMs;
+
+    /// <summary>
+    /// Initializes a new instance of the BucketBatchingStrategy.
+    /// </summary>
+    /// <param name="bucketBoundaries">Array of bucket boundaries (e.g., [32, 64, 128, 256, 512])</param>
+    /// <param name="maxBatchSize">Maximum batch size per bucket</param>
+    /// <param name="maxWaitMs">Maximum wait time before processing</param>
+    public BucketBatchingStrategy(int[] bucketBoundaries, int maxBatchSize, int maxWaitMs)
+    {
+        _bucketBoundaries = bucketBoundaries ?? new[] { 32, 64, 128, 256, 512 };
+        Array.Sort(_bucketBoundaries);
+        _maxBatchSize = maxBatchSize;
+        _maxWaitMs = maxWaitMs;
+    }
+
+    public string Name => "Bucket";
+
+    /// <summary>
+    /// Gets the bucket index for a given input size.
+    /// </summary>
+    /// <param name="inputSize">The size of the input</param>
+    /// <returns>The bucket index</returns>
+    public int GetBucketIndex(int inputSize)
+    {
+        for (int i = 0; i < _bucketBoundaries.Length; i++)
+        {
+            if (inputSize <= _bucketBoundaries[i])
+                return i;
+        }
+        return _bucketBoundaries.Length; // Largest bucket
+    }
+
+    /// <summary>
+    /// Gets the padded size for a bucket.
+    /// </summary>
+    /// <param name="bucketIndex">The bucket index</param>
+    /// <returns>The padded size for the bucket</returns>
+    public int GetBucketSize(int bucketIndex)
+    {
+        if (bucketIndex < _bucketBoundaries.Length)
+            return _bucketBoundaries[bucketIndex];
+        return _bucketBoundaries[^1] * 2; // Double the largest bucket for overflow
+    }
+
+    public bool ShouldProcessBatch(int queuedRequests, double timeInQueueMs, double averageLatencyMs, int queueDepth)
+    {
+        return queuedRequests >= _maxBatchSize || (queuedRequests > 0 && timeInQueueMs >= _maxWaitMs);
+    }
+
+    public int GetOptimalBatchSize(int queuedRequests, double averageLatencyMs)
+    {
+        return Math.Min(queuedRequests, _maxBatchSize);
+    }
+
+    public void UpdatePerformanceFeedback(int batchSize, double latencyMs)
+    {
+        // Could be enhanced to adapt bucket boundaries based on request distribution
+    }
+}
@@ -0,0 +1,37 @@
+namespace AiDotNet.Serving.Batching;
+
+/// <summary>
+/// Interface for batching strategies that determine when to process accumulated requests.
+/// </summary>
+public interface IBatchingStrategy
+{
+    /// <summary>
+    /// Gets the name of the batching strategy.
+    /// </summary>
+    string Name { get; }
+
+    /// <summary>
+    /// Determines whether a batch should be processed based on the current state.
+    /// </summary>
+    /// <param name="queuedRequests">Number of requests currently queued</param>
+    /// <param name="timeInQueueMs">Time in milliseconds since the oldest request was queued</param>
+    /// <param name="averageLatencyMs">Average latency of recent batches in milliseconds</param>
+    /// <param name="queueDepth">Current queue depth</param>
+    /// <returns>True if the batch should be processed; otherwise, false</returns>
+    bool ShouldProcessBatch(int queuedRequests, double timeInQueueMs, double averageLatencyMs, int queueDepth);
+
+    /// <summary>
+    /// Determines the optimal batch size for the current state.
+    /// </summary>
+    /// <param name="queuedRequests">Number of requests currently queued</param>
+    /// <param name="averageLatencyMs">Average latency of recent batches in milliseconds</param>
+    /// <returns>The optimal batch size</returns>
+    int GetOptimalBatchSize(int queuedRequests, double averageLatencyMs);
+
+    /// <summary>
+    /// Updates the strategy with performance feedback.
+    /// </summary>
+    /// <param name="batchSize">Size of the batch that was processed</param>
+    /// <param name="latencyMs">Latency in milliseconds for processing the batch</param>
+    void UpdatePerformanceFeedback(int batchSize, double latencyMs);
+}
@@ -0,0 +1,39 @@
+namespace AiDotNet.Serving.Batching;
+
+/// <summary>
+/// Size-based batching strategy that processes batches when a size threshold is reached.
+/// </summary>
+public class SizeBatchingStrategy : IBatchingStrategy
+{
+    private readonly int _batchSize;
+    private readonly int _maxWaitMs;
+
+    /// <summary>
+    /// Initializes a new instance of the SizeBatchingStrategy.
+    /// </summary>
+    /// <param name="batchSize">Target batch size to trigger processing</param>
+    /// <param name="maxWaitMs">Maximum wait time before processing smaller batches</param>
+    public SizeBatchingStrategy(int batchSize, int maxWaitMs)
+    {
+        _batchSize = batchSize;
+        _maxWaitMs = maxWaitMs;
+    }
+
+    public string Name => "Size";
+
+    public bool ShouldProcessBatch(int queuedRequests, double timeInQueueMs, double averageLatencyMs, int queueDepth)
+    {
+        // Process if we have enough requests or waited too long
+        return queuedRequests >= _batchSize || (queuedRequests > 0 && timeInQueueMs >= _maxWaitMs);
+    }
+
+    public int GetOptimalBatchSize(int queuedRequests, double averageLatencyMs)
+    {
+        return Math.Min(queuedRequests, _batchSize);
+    }
+
+    public void UpdatePerformanceFeedback(int batchSize, double latencyMs)
+    {
+        // No adaptation for size strategy
+    }
+}
@@ -0,0 +1,38 @@
+namespace AiDotNet.Serving.Batching;
+
+/// <summary>
+/// Timeout-based batching strategy that processes batches when a time threshold is reached.
+/// </summary>
+public class TimeoutBatchingStrategy : IBatchingStrategy
+{
+    private readonly int _timeoutMs;
+    private readonly int _maxBatchSize;
+
+    /// <summary>
+    /// Initializes a new instance of the TimeoutBatchingStrategy.
+    /// </summary>
+    /// <param name="timeoutMs">Maximum time to wait before processing a batch</param>
+    /// <param name="maxBatchSize">Maximum batch size</param>
+    public TimeoutBatchingStrategy(int timeoutMs, int maxBatchSize)
+    {
+        _timeoutMs = timeoutMs;
+        _maxBatchSize = maxBatchSize;
+    }
+
+    public string Name => "Timeout";
+
+    public bool ShouldProcessBatch(int queuedRequests, double timeInQueueMs, double averageLatencyMs, int queueDepth)
+    {
+        return queuedRequests > 0 && timeInQueueMs >= _timeoutMs;
+    }
+
+    public int GetOptimalBatchSize(int queuedRequests, double averageLatencyMs)
+    {
+        return _maxBatchSize > 0 ? Math.Min(queuedRequests, _maxBatchSize) : queuedRequests;
+    }
+
+    public void UpdatePerformanceFeedback(int batchSize, double latencyMs)
+    {
+        // No adaptation for timeout strategy
+    }
+}
@@ -26,6 +26,80 @@ public class ServingOptions
     /// </summary>
     public int MaxBatchSize { get; set; } = 100;
 
+    /// <summary>
+    /// Gets or sets the minimum batch size for adaptive batching.
+    /// Default is 1.
+    /// </summary>
+    public int MinBatchSize { get; set; } = 1;
+
+    /// <summary>
+    /// Gets or sets the batching strategy to use.
+    /// Options: "Timeout", "Size", "Adaptive", "Bucket"
+    /// Default is "Adaptive".
+    /// </summary>
+    public string BatchingStrategy { get; set; } = "Adaptive";
+
+    /// <summary>
+    /// Gets or sets the target latency in milliseconds for adaptive batching.
+    /// The adaptive strategy will try to maintain this latency while maximizing throughput.
+    /// Default is 20 milliseconds.
+    /// </summary>
+    public double TargetLatencyMs { get; set; } = 20.0;
+
+    /// <summary>
+    /// Gets or sets the latency tolerance factor for adaptive batching.
+    /// This defines the acceptable ratio between p99 and p50 latency.
+    /// Default is 2.0 (p99 should be less than 2x p50).
+    /// </summary>
+    public double LatencyToleranceFactor { get; set; } = 2.0;
+
+    /// <summary>
+    /// Gets or sets the maximum queue size for backpressure handling.
+    /// When the queue is full, new requests will be rejected.
+    /// Set to 0 for unlimited queue size.
+    /// Default is 1000.
+    /// </summary>
+    public int MaxQueueSize { get; set; } = 1000;
+
+    /// <summary>
+    /// Gets or sets whether to enable priority-based request scheduling.
+    /// Default is false.
+    /// </summary>
+    public bool EnablePriorityScheduling { get; set; } = false;
+
+    /// <summary>
+    /// Gets or sets the padding strategy to use for variable-length sequences.
+    /// Options: "Minimal", "Bucket", "Fixed"
+    /// Default is "Minimal".
+    /// </summary>
+    public string PaddingStrategy { get; set; } = "Minimal";
+
+    /// <summary>
+    /// Gets or sets the bucket sizes for bucket-based batching and padding.
+    /// Default is [32, 64, 128, 256, 512].
+    /// </summary>
+    public int[] BucketSizes { get; set; } = new[] { 32, 64, 128, 256, 512 };
+
+    /// <summary>
+    /// Gets or sets the fixed size for fixed-size padding strategy.
+    /// Only used when PaddingStrategy is "Fixed".
+    /// Default is 512.
+    /// </summary>
+    public int FixedPaddingSize { get; set; } = 512;
+
+    /// <summary>
+    /// Gets or sets whether to enable detailed performance metrics collection.
+    /// This includes latency percentiles, throughput, and batch utilization.
+    /// Default is true.
+    /// </summary>
+    public bool EnablePerformanceMetrics { get; set; } = true;
+
+    /// <summary>
+    /// Gets or sets the maximum number of latency samples to keep for percentile calculation.
+    /// Default is 10000.
+    /// </summary>
+    public int MaxLatencySamples { get; set; } = 10000;
+
     /// <summary>
     /// Gets or sets the root directory where model files are stored.
     /// Model paths are restricted to this directory for security.
 
@@ -215,4 +215,18 @@ public ActionResult<Dictionary<string, object>> GetStatistics()
         var stats = _requestBatcher.GetStatistics();
         return Ok(stats);
     }
+
+    /// <summary>
+    /// Gets detailed performance metrics including latency percentiles, throughput,
+    /// batch utilization, and queue depth monitoring.
+    /// </summary>
+    /// <returns>Detailed performance metrics</returns>
+    /// <response code="200">Returns detailed performance metrics</response>
+    [HttpGet("metrics")]
+    [ProducesResponseType(typeof(Dictionary<string, object>), StatusCodes.Status200OK)]
+    public ActionResult<Dictionary<string, object>> GetPerformanceMetrics()
+    {
+        var metrics = _requestBatcher.GetPerformanceMetrics();
+        return Ok(metrics);
+    }
 }