-
-
Notifications
You must be signed in to change notification settings - Fork 8
perf: LayerWorkspace arena allocator for zero-allocation forward passes (#1014) #1083
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
d3637db
perf: add ForwardArena and LayerWorkspace for zero-allocation forward…
ooples d7847f7
perf: benchmark baseline — LayerWorkspace is 232x faster than raw all…
ooples fcf1854
perf: migrate RWKV7Block to LayerWorkspace (13 of 84 allocations elim…
ooples 674eb7d
perf: complete RWKV7Block forward-path migration to LayerWorkspace
ooples 2acbe01
perf: convert 224 per-forward tensor allocations to TensorAllocator.R…
ooples be4cd7c
perf: convert 1 Diffusion allocation + add Memory imports to 17 layer…
ooples 1e3a8bd
perf: eliminate 93 more allocations — ScalarMinusTensor + bracket pat…
ooples 554b23f
fix: address 25 PR review comments — zero-init safety, ShapeKey, vali…
ooples d8673da
fix: address 25 PR review comments — SiLU zero-alloc, arena safety, c…
ooples 3d19820
fix: ShapeKey zero-alloc lookup + revert 3 more state buffer Rent calls
ooples File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
175 changes: 175 additions & 0 deletions
175
AiDotNetBenchmarkTests/BenchmarkTests/ArenaAllocationBenchmarks.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,175 @@ | ||
| using AiDotNet.Memory; | ||
| using AiDotNet.Tensors; | ||
| using BenchmarkDotNet.Attributes; | ||
| using BenchmarkDotNet.Configs; | ||
| using BenchmarkDotNet.Toolchains.InProcess.Emit; | ||
|
|
||
| namespace AiDotNetBenchmarkTests.BenchmarkTests; | ||
|
|
||
/// <summary>
/// Benchmarks comparing tensor allocation strategies for RWKV7Block-like forward passes.
/// Measures: new Tensor (baseline) vs ForwardArena vs LayerWorkspace (target).
/// </summary>
[MemoryDiagnoser]
[Config(typeof(InProcessConfig))]
public class ArenaAllocationBenchmarks
{
    private class InProcessConfig : ManualConfig
    {
        public InProcessConfig()
        {
            // Run in-process so the benchmark needs no separate host executable.
            AddJob(BenchmarkDotNet.Jobs.Job.ShortRun
                .WithToolchain(InProcessEmitToolchain.Instance));
        }
    }

    private const int BatchSize = 1;
    private const int SeqLen = 32;
    private const int ModelDim = 64;
    private const int TimestepsPerForward = 32;
    private const int TensorsPerTimestep = 7;

    // Buffer indices (matching RWKV7Block pattern)
    private const int TsRInput = 0, TsKInput = 1, TsVInput = 2;
    private const int TsAInput = 3, TsBInput = 4, TsWkvOut = 5, TsYt = 6;
    private const int SqAllR = 0, SqAllK = 1, SqAllV = 2, SqAllA = 3;
    private const int SqAllB = 4, SqAllWkv = 5, SqAllWkvPre = 6, SqAllWkvGated = 7;

    private ForwardArena<float> _arena = null!;
    private int[] _timestepShape = null!;
    private int[] _sequenceShape = null!;
    private LayerWorkspace<float> _workspace = null!;

    [GlobalSetup]
    public void Setup()
    {
        _timestepShape = [BatchSize, ModelDim];
        _sequenceShape = [BatchSize, SeqLen, ModelDim];

        // Pre-warm the arena so benchmark iterations measure only rent cost.
        _arena = new ForwardArena<float>();
        _arena.EnsureCapacity(_timestepShape, TensorsPerTimestep);
        _arena.EnsureCapacity(_sequenceShape, 8);

        // Declare every workspace buffer up front; BeginForward sizes them once.
        _workspace = new LayerWorkspace<float>(timestepCount: 7, sequenceCount: 8);
        _workspace.DeclareTimestep(TsRInput, ModelDim);
        _workspace.DeclareTimestep(TsKInput, ModelDim);
        _workspace.DeclareTimestep(TsVInput, ModelDim);
        _workspace.DeclareTimestep(TsAInput, ModelDim);
        _workspace.DeclareTimestep(TsBInput, ModelDim);
        _workspace.DeclareTimestep(TsWkvOut, ModelDim);
        _workspace.DeclareTimestep(TsYt, ModelDim);
        _workspace.DeclareSequence(SqAllR, ModelDim);
        _workspace.DeclareSequence(SqAllK, ModelDim);
        _workspace.DeclareSequence(SqAllV, ModelDim);
        _workspace.DeclareSequence(SqAllA, ModelDim);
        _workspace.DeclareSequence(SqAllB, ModelDim);
        _workspace.DeclareSequence(SqAllWkv, ModelDim);
        _workspace.DeclareSequence(SqAllWkvPre, ModelDim);
        _workspace.DeclareSequence(SqAllWkvGated, ModelDim);
        _workspace.BeginForward(BatchSize, SeqLen);
    }

    /// <summary>
    /// Baseline: raw new Tensor allocation (current RWKV7Block pattern).
    /// Creates 8 sequence + 7×32 timestep tensors = 232 allocations per forward pass.
    /// </summary>
    [Benchmark(Baseline = true)]
    public int RawAllocation_RWKV7Pattern()
    {
        int count = 0;
        for (int i = 0; i < 8; i++)
        {
            var t = new Tensor<float>(_sequenceShape);
            count += t.Length;
        }
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            for (int i = 0; i < TensorsPerTimestep; i++)
            {
                var t = new Tensor<float>(_timestepShape);
                count += t.Length;
            }
        }
        return count;
    }

    /// <summary>
    /// Arena: bump-pointer allocation.
    /// </summary>
    [Benchmark]
    public int Arena_RWKV7Pattern()
    {
        int count = 0;
        _arena.Reset();
        for (int i = 0; i < 8; i++)
            count += _arena.Rent(_sequenceShape).Length;
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            // Reset per timestep so the same slab is recycled each iteration.
            _arena.Reset();
            for (int i = 0; i < TensorsPerTimestep; i++)
                count += _arena.Rent(_timestepShape).Length;
        }
        return count;
    }

    /// <summary>
    /// LayerWorkspace: index-based pre-allocated buffers (production target).
    /// Zero allocation — same tensors returned every call.
    /// </summary>
    [Benchmark]
    public int Workspace_RWKV7Pattern()
    {
        int count = 0;
        _workspace.BeginForward(BatchSize, SeqLen); // Include sizing check in measurement
        // Sequence buffers (pre-allocated, same tensor every call)
        count += _workspace.Sequence(SqAllR).Length;
        count += _workspace.Sequence(SqAllK).Length;
        count += _workspace.Sequence(SqAllV).Length;
        count += _workspace.Sequence(SqAllA).Length;
        count += _workspace.Sequence(SqAllB).Length;
        count += _workspace.Sequence(SqAllWkv).Length;
        count += _workspace.Sequence(SqAllWkvPre).Length;
        count += _workspace.Sequence(SqAllWkvGated).Length;

        // Timestep buffers (same tensor reused each iteration)
        for (int step = 0; step < TimestepsPerForward; step++)
        {
            count += _workspace.Timestep(TsRInput).Length;
            count += _workspace.Timestep(TsKInput).Length;
            count += _workspace.Timestep(TsVInput).Length;
            count += _workspace.Timestep(TsAInput).Length;
            count += _workspace.Timestep(TsBInput).Length;
            count += _workspace.Timestep(TsWkvOut).Length;
            count += _workspace.Timestep(TsYt).Length;
        }
        return count;
    }

    /// <summary>
    /// Micro-benchmark: single new Tensor cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_NewTensor()
    {
        return new Tensor<float>(_timestepShape);
    }

    /// <summary>
    /// Micro-benchmark: single arena rent cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_ArenaRent()
    {
        _arena.Reset();
        return _arena.Rent(_timestepShape);
    }

    /// <summary>
    /// Micro-benchmark: single workspace lookup cost.
    /// </summary>
    [Benchmark]
    public Tensor<float> Single_WorkspaceLookup()
    {
        return _workspace.Timestep(TsRInput);
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
| using System.Runtime.CompilerServices; | ||
|
|
||
| namespace AiDotNet.Memory; | ||
|
|
||
/// <summary>
/// Bump-pointer arena allocator for zero-allocation forward passes.
/// Pre-allocates Tensor objects grouped by shape, dishes them out via array index
/// increment (O(1), zero syscalls), and resets all cursors at end of forward pass.
///
/// This beats PyTorch's per-tensor malloc on CPU by eliminating all system calls
/// and GC pressure during the forward pass. Tensors are pre-created during warmup
/// and recycled across calls.
/// </summary>
internal sealed class ForwardArena<T>
{
    // One slab (array of pre-built tensors) per distinct shape, plus a cursor
    // tracking how many tensors of that shape are currently handed out.
    private readonly Dictionary<ShapeKey, Tensor<T>[]> _slabs = new();
    private readonly Dictionary<ShapeKey, int> _cursors = new();
    private ShapeKey[]? _cursorKeysCache;
    private const int DefaultSlabSize = 4;
    private const int GrowthFactor = 2;

    /// <summary>
    /// Rent a tensor with the given shape. O(1) — single array index + increment.
    /// Zero system calls, zero GC pressure. Data is zero-initialized.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public Tensor<T> Rent(int[] shape) => RentCore(shape, clear: true);

    /// <summary>
    /// Rent a tensor without clearing its data. Use when the tensor will be
    /// completely overwritten before any reads (e.g., output of MatMul).
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public Tensor<T> RentUninitialized(int[] shape) => RentCore(shape, clear: false);

    /// <summary>
    /// Shared fast path for <see cref="Rent"/> and <see cref="RentUninitialized"/>:
    /// look up the slab for this shape, bump the cursor, optionally clear.
    /// Falls back to <see cref="GrowAndRent"/> when the slab is missing or exhausted.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private Tensor<T> RentCore(int[] shape, bool clear)
    {
        var key = new ShapeKey(shape);

        if (!_slabs.TryGetValue(key, out var slab))
            return GrowAndRent(key, shape, clear);

        if (!_cursors.TryGetValue(key, out var cursor))
            cursor = 0;

        if (cursor >= slab.Length)
            return GrowAndRent(key, shape, clear);

        _cursors[key] = cursor + 1;
        var tensor = slab[cursor];
        if (clear)
            tensor.Data.Span.Clear();
        return tensor;
    }

    /// <summary>
    /// Reset all cursors to 0. Called at end of Forward pass.
    /// No deallocation — tensors stay pre-allocated for next call.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Reset()
    {
        // Use cached keys array to avoid allocation during reset
        // (mutating dictionary values while enumerating Keys is not safe on all TFMs).
        if (_cursorKeysCache is null || _cursorKeysCache.Length != _cursors.Count)
            _cursorKeysCache = new ShapeKey[_cursors.Count];
        _cursors.Keys.CopyTo(_cursorKeysCache, 0);
        for (int i = 0; i < _cursorKeysCache.Length; i++)
            _cursors[_cursorKeysCache[i]] = 0;
    }

    /// <summary>
    /// Pre-allocate capacity for a given shape. Call during layer construction
    /// or at start of forward pass when shapes change.
    /// </summary>
    public void EnsureCapacity(int[] shape, int count)
    {
        // Defensive copy: this key is stored long-term, so it must not alias a
        // caller-owned array that may be mutated later.
        var key = new ShapeKey(shape, defensiveCopy: true);
        if (_slabs.TryGetValue(key, out var existing) && existing.Length >= count)
            return;

        var newSlab = new Tensor<T>[count];
        int copyFrom = 0;
        if (existing is not null)
        {
            // Keep already-built tensors; only allocate the shortfall.
            Array.Copy(existing, newSlab, existing.Length);
            copyFrom = existing.Length;
        }
        for (int i = copyFrom; i < count; i++)
            newSlab[i] = new Tensor<T>(shape);

        _slabs[key] = newSlab;
        if (!_cursors.ContainsKey(key))
            _cursors[key] = 0;
    }

    /// <summary>
    /// Gets the total number of pre-allocated tensors across all shapes.
    /// </summary>
    public int TotalPreAllocated => _slabs.Values.Sum(s => s.Length);

    /// <summary>
    /// Gets the number of tensors currently rented in this forward pass
    /// (sum of all cursors; returns 0 immediately after <see cref="Reset"/>).
    /// </summary>
    public int CurrentRented => _cursors.Values.Sum();

    /// <summary>
    /// Slow path: grow (or create) the slab for <paramref name="key"/> and rent
    /// the next tensor from it. Doubles capacity to amortize growth cost.
    /// </summary>
    private Tensor<T> GrowAndRent(ShapeKey key, int[] shape, bool clear = true)
    {
        int currentSize = _slabs.TryGetValue(key, out var existing) ? existing.Length : 0;
        int newSize = Math.Max(currentSize * GrowthFactor, DefaultSlabSize);
        EnsureCapacity(shape, newSize);

        var cursor = _cursors.TryGetValue(key, out var c) ? c : 0;
        _cursors[key] = cursor + 1;
        var tensor = _slabs[key][cursor];
        if (clear) tensor.Data.Span.Clear();
        return tensor;
    }
}
|
|
||
/// <summary>
/// Value-type shape key for arena dictionary lookups. Pre-computes hash
/// to avoid per-lookup allocation. Matches TensorPool.GetTensorPoolKey pattern.
/// A <c>default(ShapeKey)</c> (null dims) is valid: it equals only another
/// default key and hashes to 0.
/// </summary>
public readonly struct ShapeKey : IEquatable<ShapeKey>
{
    private readonly int _hash;
    private readonly int[] _dims;

    /// <summary>
    /// Builds a key over <paramref name="shape"/>. By default the array is
    /// aliased (zero-alloc hot path); pass <paramref name="defensiveCopy"/> =
    /// true when the key outlives the caller's array.
    /// </summary>
    public ShapeKey(int[] shape, bool defensiveCopy = false)
    {
        _dims = defensiveCopy ? (int[])shape.Clone() : shape;
        unchecked
        {
            // FNV-1a over rank then each dimension.
            int hash = (int)2166136261;
            hash = (hash ^ _dims.Length) * 16777619;
            for (int i = 0; i < _dims.Length; i++)
                hash = (hash ^ _dims[i]) * 16777619;
            _hash = hash;
        }
    }

    public override int GetHashCode() => _hash;

    public override bool Equals(object? obj) => obj is ShapeKey other && Equals(other);

    public bool Equals(ShapeKey other)
    {
        // default(ShapeKey) carries a null _dims; guard so comparisons never NRE.
        if (_dims is null || other._dims is null)
            return _dims is null && other._dims is null;
        if (_dims.Length != other._dims.Length) return false;
        for (int i = 0; i < _dims.Length; i++)
            if (_dims[i] != other._dims[i]) return false;
        return true;
    }
}
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.