Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 175 additions & 0 deletions AiDotNetBenchmarkTests/BenchmarkTests/ArenaAllocationBenchmarks.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
using AiDotNet.Memory;
using AiDotNet.Tensors;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Toolchains.InProcess.Emit;

namespace AiDotNetBenchmarkTests.BenchmarkTests;

/// <summary>
/// Benchmarks comparing tensor allocation strategies for RWKV7Block-like forward passes.
/// Measures: new Tensor (baseline) vs ForwardArena vs LayerWorkspace (target).
/// </summary>
[MemoryDiagnoser]
[Config(typeof(InProcessConfig))]
public class ArenaAllocationBenchmarks
{
    private class InProcessConfig : ManualConfig
    {
        public InProcessConfig()
        {
            // Run in-process (no separate host executable) so the suite works
            // from a plain test runner; ShortRun keeps iteration count low.
            AddJob(BenchmarkDotNet.Jobs.Job.ShortRun
                .WithToolchain(InProcessEmitToolchain.Instance));
        }
    }

    private const int BatchSize = 1;
    private const int SeqLen = 32;
    private const int ModelDim = 64;
    private const int TimestepsPerForward = 32;
    private const int TensorsPerTimestep = 7;
    // Number of per-sequence buffers each forward pass allocates (previously a magic 8).
    private const int SequenceBufferCount = 8;

    // Buffer indices (matching RWKV7Block pattern)
    private const int TsRInput = 0, TsKInput = 1, TsVInput = 2;
    private const int TsAInput = 3, TsBInput = 4, TsWkvOut = 5, TsYt = 6;
    private const int SqAllR = 0, SqAllK = 1, SqAllV = 2, SqAllA = 3;
    private const int SqAllB = 4, SqAllWkv = 5, SqAllWkvPre = 6, SqAllWkvGated = 7;

    private ForwardArena<float> _arena = null!;
    private int[] _timestepShape = null!;
    private int[] _sequenceShape = null!;
    private LayerWorkspace<float> _workspace = null!;

    [GlobalSetup]
    public void Setup()
    {
        _timestepShape = [BatchSize, ModelDim];
        _sequenceShape = [BatchSize, SeqLen, ModelDim];

        // Warm the arena so benchmarked Rent() calls hit the pre-allocated path.
        _arena = new ForwardArena<float>();
        _arena.EnsureCapacity(_timestepShape, TensorsPerTimestep);
        _arena.EnsureCapacity(_sequenceShape, SequenceBufferCount);

        _workspace = new LayerWorkspace<float>(
            timestepCount: TensorsPerTimestep,
            sequenceCount: SequenceBufferCount);
        _workspace.DeclareTimestep(TsRInput, ModelDim);
        _workspace.DeclareTimestep(TsKInput, ModelDim);
        _workspace.DeclareTimestep(TsVInput, ModelDim);
        _workspace.DeclareTimestep(TsAInput, ModelDim);
        _workspace.DeclareTimestep(TsBInput, ModelDim);
        _workspace.DeclareTimestep(TsWkvOut, ModelDim);
        _workspace.DeclareTimestep(TsYt, ModelDim);
        _workspace.DeclareSequence(SqAllR, ModelDim);
        _workspace.DeclareSequence(SqAllK, ModelDim);
        _workspace.DeclareSequence(SqAllV, ModelDim);
        _workspace.DeclareSequence(SqAllA, ModelDim);
        _workspace.DeclareSequence(SqAllB, ModelDim);
        _workspace.DeclareSequence(SqAllWkv, ModelDim);
        _workspace.DeclareSequence(SqAllWkvPre, ModelDim);
        _workspace.DeclareSequence(SqAllWkvGated, ModelDim);
        _workspace.BeginForward(BatchSize, SeqLen);
    }

    /// <summary>
    /// Baseline: raw new Tensor allocation (current RWKV7Block pattern).
    /// Creates SequenceBufferCount sequence + TensorsPerTimestep × TimestepsPerForward
    /// timestep tensors (8 + 7×32 = 232 allocations) per forward pass.
    /// </summary>
    [Benchmark(Baseline = true)]
    public int RawAllocation_RWKV7Pattern()
    {
        // Summing lengths prevents the JIT from eliding the allocations.
        int count = 0;
        for (int i = 0; i < SequenceBufferCount; i++)
        {
            var t = new Tensor<float>(_sequenceShape);
            count += t.Length;
        }
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            for (int i = 0; i < TensorsPerTimestep; i++)
            {
                var t = new Tensor<float>(_timestepShape);
                count += t.Length;
            }
        }
        return count;
    }

    /// <summary>
    /// Arena: bump-pointer allocation.
    /// </summary>
    [Benchmark]
    public int Arena_RWKV7Pattern()
    {
        int count = 0;
        _arena.Reset();
        for (int i = 0; i < SequenceBufferCount; i++)
            count += _arena.Rent(_sequenceShape).Length;
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            // Reset per timestep so the same 7 timestep tensors are recycled.
            _arena.Reset();
            for (int i = 0; i < TensorsPerTimestep; i++)
                count += _arena.Rent(_timestepShape).Length;
        }
        return count;
    }

    /// <summary>
    /// LayerWorkspace: index-based pre-allocated buffers (production target).
    /// Zero allocation — same tensors returned every call.
    /// </summary>
    [Benchmark]
    public int Workspace_RWKV7Pattern()
    {
        int count = 0;
        _workspace.BeginForward(BatchSize, SeqLen); // Include sizing check in measurement
        // Sequence buffers (pre-allocated, same tensor every call)
        count += _workspace.Sequence(SqAllR).Length;
        count += _workspace.Sequence(SqAllK).Length;
        count += _workspace.Sequence(SqAllV).Length;
        count += _workspace.Sequence(SqAllA).Length;
        count += _workspace.Sequence(SqAllB).Length;
        count += _workspace.Sequence(SqAllWkv).Length;
        count += _workspace.Sequence(SqAllWkvPre).Length;
        count += _workspace.Sequence(SqAllWkvGated).Length;

        // Timestep buffers (same tensor reused each iteration)
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            count += _workspace.Timestep(TsRInput).Length;
            count += _workspace.Timestep(TsKInput).Length;
            count += _workspace.Timestep(TsVInput).Length;
            count += _workspace.Timestep(TsAInput).Length;
            count += _workspace.Timestep(TsBInput).Length;
            count += _workspace.Timestep(TsWkvOut).Length;
            count += _workspace.Timestep(TsYt).Length;
        }
        return count;
    }

    /// <summary>
    /// Micro-benchmark: single new Tensor cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_NewTensor()
    {
        return new Tensor<float>(_timestepShape);
    }

    /// <summary>
    /// Micro-benchmark: single arena rent cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_ArenaRent()
    {
        _arena.Reset();
        return _arena.Rent(_timestepShape);
    }

    /// <summary>
    /// Micro-benchmark: single workspace lookup cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_WorkspaceLookup()
    {
        return _workspace.Timestep(TsRInput);
    }
}
4 changes: 2 additions & 2 deletions src/Diffusion/Audio/ShortTimeFourierTransform.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using AiDotNet.Interfaces;
using AiDotNet.Interfaces;
using AiDotNet.LinearAlgebra;
using AiDotNet.Tensors.Engines;
using AiDotNet.WindowFunctions;
Expand Down Expand Up @@ -420,7 +420,7 @@ private Tensor<T> InverseBatched(Tensor<Complex<T>> spectrograms, int? targetLen
}
outputLength = targetLength ?? outputLength;

var output = new Tensor<T>(new[] { batchSize, outputLength });
var output = TensorAllocator.Rent<T>(new[] { batchSize, outputLength });

for (int b = 0; b < batchSize; b++)
{
Expand Down
165 changes: 165 additions & 0 deletions src/Memory/ForwardArena.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
using System.Runtime.CompilerServices;

namespace AiDotNet.Memory;

/// <summary>
/// Bump-pointer arena allocator for zero-allocation forward passes.
/// Pre-allocates Tensor objects grouped by shape, dishes them out via array index
/// increment (O(1), zero syscalls), and resets all cursors at end of forward pass.
///
/// This beats PyTorch's per-tensor malloc on CPU by eliminating all system calls
/// and GC pressure during the forward pass. Tensors are pre-created during warmup
/// and recycled across calls.
/// NOTE(review): not synchronized — appears intended for single-threaded forward
/// passes; confirm before sharing an instance across threads.
/// </summary>
internal sealed class ForwardArena<T>
{
    // Tensors grouped by shape; _cursors holds the next free slab index per shape.
    private readonly Dictionary<ShapeKey, Tensor<T>[]> _slabs = new();
    private readonly Dictionary<ShapeKey, int> _cursors = new();
    // Reused key array so Reset() itself allocates nothing on the steady state.
    private ShapeKey[]? _cursorKeysCache;
    private const int DefaultSlabSize = 4;
    private const int GrowthFactor = 2;

    /// <summary>
    /// Rent a tensor with the given shape, cleared to default(T).
    /// O(1) — single array index + increment. Zero system calls, zero GC pressure.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public Tensor<T> Rent(int[] shape)
    {
        // Identical bump-pointer path as RentUninitialized, plus a clear so
        // callers never observe stale data from a previous forward pass.
        var tensor = RentUninitialized(shape);
        tensor.Data.Span.Clear();
        return tensor;
    }

    /// <summary>
    /// Rent a tensor without clearing its data. Use when the tensor will be
    /// completely overwritten before any reads (e.g., output of MatMul).
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public Tensor<T> RentUninitialized(int[] shape)
    {
        // No defensive copy here: the key is only used for lookup, never stored.
        var key = new ShapeKey(shape);

        if (!_slabs.TryGetValue(key, out var slab))
            return GrowAndRent(key, shape, clear: false);

        // Defensive fallback; EnsureCapacity seeds a cursor for every slab.
        if (!_cursors.TryGetValue(key, out var cursor))
            cursor = 0;

        if (cursor >= slab.Length)
            return GrowAndRent(key, shape, clear: false);

        _cursors[key] = cursor + 1;
        return slab[cursor];
    }

    /// <summary>
    /// Reset all cursors to 0. Called at end of Forward pass.
    /// No deallocation — tensors stay pre-allocated for next call.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Reset()
    {
        // Use cached keys array to avoid allocation during reset
        // (cannot mutate the dictionary while enumerating its Keys directly).
        if (_cursorKeysCache is null || _cursorKeysCache.Length != _cursors.Count)
            _cursorKeysCache = new ShapeKey[_cursors.Count];
        _cursors.Keys.CopyTo(_cursorKeysCache, 0);
        for (int i = 0; i < _cursorKeysCache.Length; i++)
            _cursors[_cursorKeysCache[i]] = 0;
    }

    /// <summary>
    /// Pre-allocate capacity for a given shape. Call during layer construction
    /// or at start of forward pass when shapes change. Existing tensors for the
    /// shape are kept; only the shortfall is newly allocated.
    /// </summary>
    public void EnsureCapacity(int[] shape, int count)
    {
        // Defensive copy: this key IS stored in the dictionaries, so later
        // caller mutation of `shape` must not corrupt it.
        var key = new ShapeKey(shape, defensiveCopy: true);
        if (_slabs.TryGetValue(key, out var existing) && existing.Length >= count)
            return;

        var newSlab = new Tensor<T>[count];
        int copyFrom = 0;
        if (existing is not null)
        {
            Array.Copy(existing, newSlab, existing.Length);
            copyFrom = existing.Length;
        }
        for (int i = copyFrom; i < count; i++)
            newSlab[i] = new Tensor<T>(shape);

        _slabs[key] = newSlab;
        if (!_cursors.ContainsKey(key))
            _cursors[key] = 0;
    }

    /// <summary>
    /// Gets the total number of pre-allocated tensors across all shapes.
    /// </summary>
    public int TotalPreAllocated => _slabs.Values.Sum(s => s.Length);

    /// <summary>
    /// Gets the number of tensors currently rented (since the last Reset) in the
    /// current forward pass. This is a live count, not a high-water mark.
    /// </summary>
    public int CurrentRented => _cursors.Values.Sum();

    /// <summary>
    /// Slow path: grow (or create) the slab for <paramref name="key"/>, then rent
    /// the next tensor from it, optionally clearing its data.
    /// </summary>
    private Tensor<T> GrowAndRent(ShapeKey key, int[] shape, bool clear = true)
    {
        int currentSize = _slabs.TryGetValue(key, out var existing) ? existing.Length : 0;
        int newSize = Math.Max(currentSize * GrowthFactor, DefaultSlabSize);
        EnsureCapacity(shape, newSize);

        var cursor = _cursors.TryGetValue(key, out var c) ? c : 0;
        _cursors[key] = cursor + 1;
        var tensor = _slabs[key][cursor];
        if (clear) tensor.Data.Span.Clear();
        return tensor;
    }
}

/// <summary>
/// Value-type shape key for arena dictionary lookups. Pre-computes an FNV-1a
/// style hash at construction so each lookup is a single field read, with no
/// per-lookup allocation. Matches TensorPool.GetTensorPoolKey pattern.
/// </summary>
public readonly struct ShapeKey : IEquatable<ShapeKey>
{
    private readonly int _hash;
    // Null only for default(ShapeKey); every constructed key holds a dims array.
    private readonly int[] _dims;

    /// <summary>
    /// Creates a key for <paramref name="shape"/>.
    /// </summary>
    /// <param name="shape">Tensor dimensions. Must not be null.</param>
    /// <param name="defensiveCopy">
    /// When true, clones the array so later caller mutation cannot corrupt the key.
    /// Pass true whenever the key will be stored (e.g., as a dictionary key).
    /// </param>
    public ShapeKey(int[] shape, bool defensiveCopy = false)
    {
        _dims = defensiveCopy ? (int[])shape.Clone() : shape;
        unchecked
        {
            // FNV-1a over the rank, then each dimension.
            int hash = (int)2166136261;
            hash = (hash ^ _dims.Length) * 16777619;
            for (int i = 0; i < _dims.Length; i++)
                hash = (hash ^ _dims[i]) * 16777619;
            _hash = hash;
        }
    }

    public override int GetHashCode() => _hash;

    public override bool Equals(object? obj) => obj is ShapeKey other && Equals(other);

    public bool Equals(ShapeKey other)
    {
        // Equal dims always produce equal pre-computed hashes, so a hash
        // mismatch is a cheap, always-correct rejection.
        if (_hash != other._hash) return false;

        var a = _dims;
        var b = other._dims;
        // Covers the same-array case and two default(ShapeKey) instances
        // (both null). The original dereferenced _dims unconditionally and
        // threw NullReferenceException for default instances.
        if (ReferenceEquals(a, b)) return true;
        if (a is null || b is null) return false;

        if (a.Length != b.Length) return false;
        for (int i = 0; i < a.Length; i++)
            if (a[i] != b[i]) return false;
        return true;
    }
}
Loading
Loading