-
-
Notifications
You must be signed in to change notification settings - Fork 8
perf: LayerWorkspace arena allocator for zero-allocation forward passes (#1014) #1083
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
d3637db
perf: add ForwardArena and LayerWorkspace for zero-allocation forward…
ooples d7847f7
perf: benchmark baseline — LayerWorkspace is 232x faster than raw all…
ooples fcf1854
perf: migrate RWKV7Block to LayerWorkspace (13 of 84 allocations elim…
ooples 674eb7d
perf: complete RWKV7Block forward-path migration to LayerWorkspace
ooples 2acbe01
perf: convert 224 per-forward tensor allocations to TensorAllocator.R…
ooples be4cd7c
perf: convert 1 Diffusion allocation + add Memory imports to 17 layer…
ooples 1e3a8bd
perf: eliminate 93 more allocations — ScalarMinusTensor + bracket pat…
ooples 554b23f
fix: address 25 PR review comments — zero-init safety, ShapeKey, vali…
ooples d8673da
fix: address 25 PR review comments — SiLU zero-alloc, arena safety, c…
ooples 3d19820
fix: ShapeKey zero-alloc lookup + revert 3 more state buffer Rent calls
ooples File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
175 changes: 175 additions & 0 deletions
175
AiDotNetBenchmarkTests/BenchmarkTests/ArenaAllocationBenchmarks.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,175 @@ | ||
| using AiDotNet.Memory; | ||
| using AiDotNet.Tensors; | ||
| using BenchmarkDotNet.Attributes; | ||
| using BenchmarkDotNet.Configs; | ||
| using BenchmarkDotNet.Toolchains.InProcess.Emit; | ||
|
|
||
| namespace AiDotNetBenchmarkTests.BenchmarkTests; | ||
|
|
||
/// <summary>
/// Benchmarks comparing tensor allocation strategies for RWKV7Block-like forward passes.
/// Measures: new Tensor (baseline) vs ForwardArena vs LayerWorkspace (target).
/// </summary>
[MemoryDiagnoser]
[Config(typeof(InProcessConfig))]
public class ArenaAllocationBenchmarks
{
    private class InProcessConfig : ManualConfig
    {
        public InProcessConfig()
        {
            // Run in-process so the benchmark needs no separate host executable.
            AddJob(BenchmarkDotNet.Jobs.Job.ShortRun
                .WithToolchain(InProcessEmitToolchain.Instance));
        }
    }

    private const int BatchSize = 1;
    private const int SeqLen = 32;
    private const int ModelDim = 64;
    private const int TimestepsPerForward = 32;
    private const int TensorsPerTimestep = 7;

    // Buffer indices (matching RWKV7Block pattern)
    private const int TsRInput = 0, TsKInput = 1, TsVInput = 2;
    private const int TsAInput = 3, TsBInput = 4, TsWkvOut = 5, TsYt = 6;
    private const int SqAllR = 0, SqAllK = 1, SqAllV = 2, SqAllA = 3;
    private const int SqAllB = 4, SqAllWkv = 5, SqAllWkvPre = 6, SqAllWkvGated = 7;

    private ForwardArena<float> _arena = null!;
    private int[] _timestepShape = null!;
    private int[] _sequenceShape = null!;
    private LayerWorkspace<float> _workspace = null!;

    [GlobalSetup]
    public void Setup()
    {
        _timestepShape = [BatchSize, ModelDim];
        _sequenceShape = [BatchSize, SeqLen, ModelDim];

        // Pre-warm the arena so benchmark iterations measure only rent cost.
        _arena = new ForwardArena<float>();
        _arena.EnsureCapacity(_timestepShape, TensorsPerTimestep);
        _arena.EnsureCapacity(_sequenceShape, 8);

        // Declare every workspace buffer up front; BeginForward sizes them once.
        _workspace = new LayerWorkspace<float>(timestepCount: 7, sequenceCount: 8);
        _workspace.DeclareTimestep(TsRInput, ModelDim);
        _workspace.DeclareTimestep(TsKInput, ModelDim);
        _workspace.DeclareTimestep(TsVInput, ModelDim);
        _workspace.DeclareTimestep(TsAInput, ModelDim);
        _workspace.DeclareTimestep(TsBInput, ModelDim);
        _workspace.DeclareTimestep(TsWkvOut, ModelDim);
        _workspace.DeclareTimestep(TsYt, ModelDim);
        _workspace.DeclareSequence(SqAllR, ModelDim);
        _workspace.DeclareSequence(SqAllK, ModelDim);
        _workspace.DeclareSequence(SqAllV, ModelDim);
        _workspace.DeclareSequence(SqAllA, ModelDim);
        _workspace.DeclareSequence(SqAllB, ModelDim);
        _workspace.DeclareSequence(SqAllWkv, ModelDim);
        _workspace.DeclareSequence(SqAllWkvPre, ModelDim);
        _workspace.DeclareSequence(SqAllWkvGated, ModelDim);
        _workspace.BeginForward(BatchSize, SeqLen);
    }

    /// <summary>
    /// Baseline: raw new Tensor allocation (current RWKV7Block pattern).
    /// Creates 8 sequence + 7×32 timestep tensors = 232 allocations per forward pass.
    /// </summary>
    [Benchmark(Baseline = true)]
    public int RawAllocation_RWKV7Pattern()
    {
        int count = 0;
        for (int i = 0; i < 8; i++)
        {
            var t = new Tensor<float>(_sequenceShape);
            count += t.Length;
        }
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            for (int i = 0; i < TensorsPerTimestep; i++)
            {
                var t = new Tensor<float>(_timestepShape);
                count += t.Length;
            }
        }
        return count;
    }

    /// <summary>
    /// Arena: bump-pointer allocation.
    /// </summary>
    [Benchmark]
    public int Arena_RWKV7Pattern()
    {
        int count = 0;
        _arena.Reset();
        for (int i = 0; i < 8; i++)
            count += _arena.Rent(_sequenceShape).Length;
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            // Reset per timestep so the same slab is recycled each iteration.
            _arena.Reset();
            for (int i = 0; i < TensorsPerTimestep; i++)
                count += _arena.Rent(_timestepShape).Length;
        }
        return count;
    }

    /// <summary>
    /// LayerWorkspace: index-based pre-allocated buffers (production target).
    /// Zero allocation — same tensors returned every call.
    /// </summary>
    [Benchmark]
    public int Workspace_RWKV7Pattern()
    {
        int count = 0;
        _workspace.BeginForward(BatchSize, SeqLen); // Include sizing check in measurement
        // Sequence buffers (pre-allocated, same tensor every call)
        count += _workspace.Sequence(SqAllR).Length;
        count += _workspace.Sequence(SqAllK).Length;
        count += _workspace.Sequence(SqAllV).Length;
        count += _workspace.Sequence(SqAllA).Length;
        count += _workspace.Sequence(SqAllB).Length;
        count += _workspace.Sequence(SqAllWkv).Length;
        count += _workspace.Sequence(SqAllWkvPre).Length;
        count += _workspace.Sequence(SqAllWkvGated).Length;

        // Timestep buffers (same tensor reused each iteration)
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            count += _workspace.Timestep(TsRInput).Length;
            count += _workspace.Timestep(TsKInput).Length;
            count += _workspace.Timestep(TsVInput).Length;
            count += _workspace.Timestep(TsAInput).Length;
            count += _workspace.Timestep(TsBInput).Length;
            count += _workspace.Timestep(TsWkvOut).Length;
            count += _workspace.Timestep(TsYt).Length;
        }
        return count;
    }

    /// <summary>
    /// Micro-benchmark: single new Tensor cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_NewTensor()
    {
        return new Tensor<float>(_timestepShape);
    }

    /// <summary>
    /// Micro-benchmark: single arena rent cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_ArenaRent()
    {
        _arena.Reset();
        return _arena.Rent(_timestepShape);
    }

    /// <summary>
    /// Micro-benchmark: single workspace lookup cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_WorkspaceLookup()
    {
        return _workspace.Timestep(TsRInput);
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
| using System.Runtime.CompilerServices; | ||
|
|
||
| namespace AiDotNet.Memory; | ||
|
|
||
/// <summary>
/// Bump-pointer arena allocator for zero-allocation forward passes.
/// Pre-allocates Tensor objects grouped by shape, dishes them out via array index
/// increment (O(1), zero syscalls), and resets all cursors at end of forward pass.
///
/// This beats PyTorch's per-tensor malloc on CPU by eliminating all system calls
/// and GC pressure during the forward pass. Tensors are pre-created during warmup
/// and recycled across calls.
/// </summary>
internal sealed class ForwardArena<T>
{
    // One slab (array of pre-built tensors) per distinct shape, plus a cursor
    // tracking how many tensors of that shape are currently handed out.
    private readonly Dictionary<ShapeKey, Tensor<T>[]> _slabs = new();
    private readonly Dictionary<ShapeKey, int> _cursors = new();
    private ShapeKey[]? _cursorKeysCache;
    private const int DefaultSlabSize = 4;
    private const int GrowthFactor = 2;

    /// <summary>
    /// Rent a tensor with the given shape. O(1) — single array index + increment.
    /// Zero system calls, zero GC pressure. Data is zero-initialized.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public Tensor<T> Rent(int[] shape) => RentCore(shape, clear: true);

    /// <summary>
    /// Rent a tensor without clearing its data. Use when the tensor will be
    /// completely overwritten before any reads (e.g., output of MatMul).
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public Tensor<T> RentUninitialized(int[] shape) => RentCore(shape, clear: false);

    /// <summary>
    /// Shared fast path for <see cref="Rent"/> and <see cref="RentUninitialized"/>:
    /// look up the slab for this shape, bump the cursor, optionally clear.
    /// Falls back to <see cref="GrowAndRent"/> when the slab is missing or exhausted.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private Tensor<T> RentCore(int[] shape, bool clear)
    {
        var key = new ShapeKey(shape);

        if (!_slabs.TryGetValue(key, out var slab))
            return GrowAndRent(key, shape, clear);

        if (!_cursors.TryGetValue(key, out var cursor))
            cursor = 0;

        if (cursor >= slab.Length)
            return GrowAndRent(key, shape, clear);

        _cursors[key] = cursor + 1;
        var tensor = slab[cursor];
        if (clear)
            tensor.Data.Span.Clear();
        return tensor;
    }

    /// <summary>
    /// Reset all cursors to 0. Called at end of Forward pass.
    /// No deallocation — tensors stay pre-allocated for next call.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Reset()
    {
        // Use cached keys array to avoid allocation during reset
        // (mutating dictionary values while enumerating Keys is not safe on all TFMs).
        if (_cursorKeysCache is null || _cursorKeysCache.Length != _cursors.Count)
            _cursorKeysCache = new ShapeKey[_cursors.Count];
        _cursors.Keys.CopyTo(_cursorKeysCache, 0);
        for (int i = 0; i < _cursorKeysCache.Length; i++)
            _cursors[_cursorKeysCache[i]] = 0;
    }

    /// <summary>
    /// Pre-allocate capacity for a given shape. Call during layer construction
    /// or at start of forward pass when shapes change.
    /// </summary>
    public void EnsureCapacity(int[] shape, int count)
    {
        // Defensive copy: this key is stored long-term, so it must not alias a
        // caller-owned array that may be mutated later.
        var key = new ShapeKey(shape, defensiveCopy: true);
        if (_slabs.TryGetValue(key, out var existing) && existing.Length >= count)
            return;

        var newSlab = new Tensor<T>[count];
        int copyFrom = 0;
        if (existing is not null)
        {
            // Keep already-built tensors; only allocate the shortfall.
            Array.Copy(existing, newSlab, existing.Length);
            copyFrom = existing.Length;
        }
        for (int i = copyFrom; i < count; i++)
            newSlab[i] = new Tensor<T>(shape);

        _slabs[key] = newSlab;
        if (!_cursors.ContainsKey(key))
            _cursors[key] = 0;
    }

    /// <summary>
    /// Gets the total number of pre-allocated tensors across all shapes.
    /// </summary>
    public int TotalPreAllocated => _slabs.Values.Sum(s => s.Length);

    /// <summary>
    /// Gets the number of tensors currently rented in this forward pass
    /// (sum of all cursors; returns 0 immediately after <see cref="Reset"/>).
    /// </summary>
    public int CurrentRented => _cursors.Values.Sum();

    /// <summary>
    /// Slow path: grow (or create) the slab for <paramref name="key"/> and rent
    /// the next tensor from it. Doubles capacity to amortize growth cost.
    /// </summary>
    private Tensor<T> GrowAndRent(ShapeKey key, int[] shape, bool clear = true)
    {
        int currentSize = _slabs.TryGetValue(key, out var existing) ? existing.Length : 0;
        int newSize = Math.Max(currentSize * GrowthFactor, DefaultSlabSize);
        EnsureCapacity(shape, newSize);

        var cursor = _cursors.TryGetValue(key, out var c) ? c : 0;
        _cursors[key] = cursor + 1;
        var tensor = _slabs[key][cursor];
        if (clear) tensor.Data.Span.Clear();
        return tensor;
    }
}
|
|
||
/// <summary>
/// Value-type shape key for arena dictionary lookups. Pre-computes hash
/// to avoid per-lookup allocation. Matches TensorPool.GetTensorPoolKey pattern.
/// A <c>default(ShapeKey)</c> (null dims) is valid: it equals only another
/// default key and hashes to 0.
/// </summary>
public readonly struct ShapeKey : IEquatable<ShapeKey>
{
    private readonly int _hash;
    private readonly int[] _dims;

    /// <summary>
    /// Builds a key over <paramref name="shape"/>. By default the array is
    /// aliased (zero-alloc hot path); pass <paramref name="defensiveCopy"/> =
    /// true when the key outlives the caller's array.
    /// </summary>
    public ShapeKey(int[] shape, bool defensiveCopy = false)
    {
        _dims = defensiveCopy ? (int[])shape.Clone() : shape;
        unchecked
        {
            // FNV-1a over rank then each dimension.
            int hash = (int)2166136261;
            hash = (hash ^ _dims.Length) * 16777619;
            for (int i = 0; i < _dims.Length; i++)
                hash = (hash ^ _dims[i]) * 16777619;
            _hash = hash;
        }
    }

    public override int GetHashCode() => _hash;

    public override bool Equals(object? obj) => obj is ShapeKey other && Equals(other);

    public bool Equals(ShapeKey other)
    {
        // default(ShapeKey) carries a null _dims; guard so comparisons never NRE.
        if (_dims is null || other._dims is null)
            return _dims is null && other._dims is null;
        if (_dims.Length != other._dims.Length) return false;
        for (int i = 0; i < _dims.Length; i++)
            if (_dims[i] != other._dims[i]) return false;
        return true;
    }
}
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.