Skip to content

Commit c156103

Browse files
authored
Merge pull request #471 from dadhi/wip-issue468-v2
@wip issue468 v2
2 parents e68a7b3 + 81b864a commit c156103

File tree

8 files changed

+793
-29
lines changed

8 files changed

+793
-29
lines changed

src/FastExpressionCompiler.LightExpression/Expression.cs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,6 @@ public static ConstantExpression ConstantNull(Type type = null) =>
221221
public static ConstantExpression ConstantOf<T>(T value) =>
222222
value == null ? ConstantNull<T>() : new ValueConstantExpression<T>(value);
223223

224-
[MethodImpl((MethodImplOptions)256)]
225-
public static int TryGetIntConstantValue(Expression e) => ((IntConstantExpression)e).IntValue;
226-
227224
[RequiresUnreferencedCode(Trimming.Message)]
228225
public static NewExpression New(Type type)
229226
{
@@ -3914,9 +3911,8 @@ public sealed class TypedValueConstantExpression : ConstantExpression
39143911
public sealed class IntConstantExpression : ConstantExpression
39153912
{
39163913
public override Type Type => typeof(int);
3917-
public override object Value => IntValue;
3918-
public readonly int IntValue;
3919-
internal IntConstantExpression(int value) => IntValue = value;
3914+
public override object Value { get; }
3915+
internal IntConstantExpression(int value) => Value = value;
39203916
}
39213917

39223918
public class NewExpression : Expression, IArgumentProvider

src/FastExpressionCompiler/FastExpressionCompiler.cs

Lines changed: 444 additions & 19 deletions
Large diffs are not rendered by default.

src/FastExpressionCompiler/TestTools.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ public static class TestTools
2828
public static bool AllowPrintIL = false;
2929
public static bool AllowPrintCS = false;
3030
public static bool AllowPrintExpression = false;
31+
public static bool DisableAssertOpCodes = false;
3132

3233
static TestTools()
3334
{
@@ -43,6 +44,8 @@ public static void AssertOpCodes(this Delegate @delegate, params OpCode[] expect
4344

4445
public static void AssertOpCodes(this MethodInfo method, params OpCode[] expectedCodes)
4546
{
47+
if (DisableAssertOpCodes) return;
48+
4649
var ilReader = ILReaderFactory.Create(method);
4750
if (ilReader is null)
4851
{
@@ -952,6 +955,8 @@ public sealed class TestRun
952955
public SmallList<TestStats> Stats;
953956
public SmallList<TestFailure> Failures;
954957

958+
// todo: @wip put the output under the feature flag
959+
/// <summary>Will output the failures while running</summary>
955960
public void Run<T>(T test, TestTracking tracking = TestTracking.TrackFailedTestsOnly) where T : ITestX
956961
{
957962
var totalTestCount = TotalTestCount;

test/FastExpressionCompiler.Benchmarks/FastExpressionCompiler.Benchmarks.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22

33
<PropertyGroup>
4-
<TargetFrameworks>$(LatestSupportedNet)</TargetFrameworks>
4+
<TargetFrameworks>$(LatestSupportedNet);net8.0</TargetFrameworks>
55

66
<OutputType>Exe</OutputType>
77
<IsTestProject>false</IsTestProject>
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
using System;
2+
using System.Linq.Expressions;
3+
using BenchmarkDotNet.Attributes;
4+
using BenchmarkDotNet.Diagnosers;
5+
using BenchmarkDotNet.Jobs;
6+
7+
namespace FastExpressionCompiler.Benchmarks;
8+
9+
/*
10+
## Baseline with the static method. It seems to be the wrong idea for the improvement, because the closure-bound method is faster, as I discovered a long time ago.
11+
12+
BenchmarkDotNet v0.14.0, Windows 11 (10.0.26100.3775)
13+
Intel Core i9-8950HK CPU 2.90GHz (Coffee Lake), 1 CPU, 12 logical and 6 physical cores
14+
.NET SDK 9.0.203
15+
[Host] : .NET 9.0.4 (9.0.425.16305), X64 RyuJIT AVX2
16+
.NET 8.0 : .NET 8.0.15 (8.0.1525.16413), X64 RyuJIT AVX2
17+
.NET 9.0 : .NET 9.0.4 (9.0.425.16305), X64 RyuJIT AVX2
18+
19+
20+
| Method | Job | Runtime | Mean | Error | StdDev | Ratio | RatioSD | Rank | BranchInstructions/Op | CacheMisses/Op | BranchMispredictions/Op | Allocated | Alloc Ratio |
21+
|------------------- |--------- |--------- |----------:|----------:|----------:|------:|--------:|-----:|----------------------:|---------------:|------------------------:|----------:|------------:|
22+
| InvokeCompiled | .NET 8.0 | .NET 8.0 | 0.4365 ns | 0.0246 ns | 0.0192 ns | 1.00 | 0.06 | 1 | 1 | -0 | -0 | - | NA |
23+
| InvokeCompiledFast | .NET 8.0 | .NET 8.0 | 1.0837 ns | 0.0557 ns | 0.0991 ns | 2.49 | 0.25 | 2 | 2 | 0 | 0 | - | NA |
24+
| | | | | | | | | | | | | | |
25+
| InvokeCompiled | .NET 9.0 | .NET 9.0 | 0.5547 ns | 0.0447 ns | 0.0871 ns | 1.02 | 0.22 | 1 | 1 | -0 | -0 | - | NA |
26+
| InvokeCompiledFast | .NET 9.0 | .NET 9.0 | 1.1920 ns | 0.0508 ns | 0.0450 ns | 2.20 | 0.34 | 2 | 2 | 0 | -0 | - | NA |
27+
28+
29+
## Sealing the closure type does not help
30+
31+
| Method | Job | Runtime | Mean | Error | StdDev | Median | Ratio | RatioSD | Rank | BranchInstructions/Op | BranchMispredictions/Op | CacheMisses/Op | Allocated | Alloc Ratio |
32+
|------------------- |--------- |--------- |----------:|----------:|----------:|----------:|------:|--------:|-----:|----------------------:|------------------------:|---------------:|----------:|------------:|
33+
| InvokeCompiledFast | .NET 8.0 | .NET 8.0 | 1.0066 ns | 0.0209 ns | 0.0233 ns | 0.9973 ns | 1.00 | 0.03 | 2 | 2 | 0 | 0 | - | NA |
34+
| InvokeCompiled | .NET 8.0 | .NET 8.0 | 0.5040 ns | 0.0217 ns | 0.0169 ns | 0.5016 ns | 0.50 | 0.02 | 1 | 1 | -0 | -0 | - | NA |
35+
| | | | | | | | | | | | | | | |
36+
| InvokeCompiledFast | .NET 9.0 | .NET 9.0 | 1.0640 ns | 0.0539 ns | 0.0929 ns | 1.0106 ns | 1.01 | 0.12 | 2 | 2 | 0 | 0 | - | NA |
37+
| InvokeCompiled | .NET 9.0 | .NET 9.0 | 0.5897 ns | 0.0451 ns | 0.0858 ns | 0.6156 ns | 0.56 | 0.09 | 1 | 1 | -0 | -0 | - | NA |
38+
39+
40+
## Still the same speed with the minimal IL of 2 instructions
41+
42+
Job=.NET 8.0 Runtime=.NET 8.0
43+
44+
| Method | Mean | Error | StdDev | Ratio | RatioSD | Rank | Allocated | Alloc Ratio |
45+
|------------------- |----------:|----------:|----------:|------:|--------:|-----:|----------:|------------:|
46+
| InvokeCompiled | 0.4647 ns | 0.0321 ns | 0.0268 ns | 1.00 | 0.08 | 1 | - | NA |
47+
| InvokeCompiledFast | 0.9739 ns | 0.0433 ns | 0.0481 ns | 2.10 | 0.15 | 2 | - | NA |
48+
49+
50+
## But the Func speed is faster, hmm
51+
52+
Job=.NET 8.0 Runtime=.NET 8.0
53+
54+
| Method | Mean | Error | StdDev | Ratio | RatioSD | Rank | Allocated | Alloc Ratio |
55+
|--------------- |----------:|----------:|----------:|------:|--------:|-----:|----------:|------------:|
56+
| InvokeCompiled | 0.2685 ns | 0.0210 ns | 0.0186 ns | 1.00 | 0.09 | 2 | - | NA |
57+
| JustFunc | 0.1711 ns | 0.0310 ns | 0.0305 ns | 0.64 | 0.12 | 1 | - | NA |
58+
59+
60+
## HERE IS THE REASON:
61+
62+
FEC creates the DynamicMethod with the `owner` parameter, but the System compiler uses a different overload without the owner and internally with `transparentMethod: true`.
63+
Using the latter (System) overload drastically slows down the compilation but removes the additional branch instruction in the invocation, making super simple delegates faster.
64+
But for delegates doing actual/more work, having an additional branch instruction is negligible and usually does not show in the invocation performance.
65+
66+
2x slow: `var method = new DynamicMethod(string.Empty, returnType, closurePlusParamTypes, typeof(ArrayClosure), true);`
67+
^^^^^^^^^^^^^^^^^^^^
68+
parity: `var method = new DynamicMethod(string.Empty, returnType, closurePlusParamTypes, true);`
69+
70+
Job=.NET 8.0 Runtime=.NET 8.0
71+
72+
| Method | Mean | Error | StdDev | Ratio | RatioSD | Rank | BranchInstructions/Op | Allocated | Alloc Ratio |
73+
|------------------- |----------:|----------:|----------:|------:|--------:|-----:|----------------------:|----------:|------------:|
74+
| InvokeCompiled | 0.5075 ns | 0.0153 ns | 0.0143 ns | 1.00 | 0.04 | 1 | 1 | - | NA |
75+
| InvokeCompiledFast | 0.5814 ns | 0.0433 ns | 0.0699 ns | 1.15 | 0.14 | 1 | 1 | - | NA |
76+
77+
78+
## Now with full eval before Compile, the results are funny in a good way
79+
80+
Job=.NET 8.0 Runtime=.NET 8.0
81+
82+
| Method | Mean | Error | StdDev | Ratio | RatioSD | Rank | BranchInstructions/Op | Allocated | Alloc Ratio |
83+
|------------------------------- |----------:|----------:|----------:|------:|--------:|-----:|----------------------:|----------:|------------:|
84+
| InvokeCompiled | 0.5071 ns | 0.0289 ns | 0.0242 ns | 1.00 | 0.06 | 2 | 1 | - | NA |
85+
| InvokeCompiledFastWithEvalFlag | 0.0804 ns | 0.0341 ns | 0.0351 ns | 0.16 | 0.07 | 1 | 1 | - | NA |
86+
87+
88+
## Fastest so far
89+
90+
DefaultJob : .NET 9.0.4 (9.0.425.16305), X64 RyuJIT AVX2
91+
92+
| Method | Mean | Error | StdDev | Median | Ratio | RatioSD | Rank | BranchInstructions/Op | Allocated | Alloc Ratio |
93+
|-------------------------------------- |----------:|----------:|----------:|----------:|------:|--------:|-----:|----------------------:|----------:|------------:|
94+
| InvokeCompiled | 0.5088 ns | 0.0399 ns | 0.0842 ns | 0.4707 ns | 1.02 | 0.22 | 2 | 1 | - | NA |
95+
| InvokeCompiledFast | 0.1105 ns | 0.0360 ns | 0.0799 ns | 0.0689 ns | 0.22 | 0.16 | 1 | 1 | - | NA |
96+
| InvokeCompiledFast_DisableInterpreter | 1.0607 ns | 0.0540 ns | 0.0887 ns | 1.0301 ns | 2.13 | 0.34 | 3 | 2 | - | NA |
97+
98+
*/
99+
[MemoryDiagnoser, RankColumn]
100+
[HardwareCounters(HardwareCounter.BranchInstructions)]
101+
// [SimpleJob(RuntimeMoniker.Net90)]
102+
// [SimpleJob(RuntimeMoniker.Net80)]
103+
public class Issue468_InvokeCompiled_vs_InvokeCompiledFast
104+
{
105+
Func<bool> _compiled, _compiledFast, _compiledFast_DisableInterpreter, _justFunc = static () => true;
106+
107+
[GlobalSetup]
108+
public void Setup()
109+
{
110+
var expr = IssueTests.Issue468_Optimize_the_delegate_access_to_the_Closure_object_for_the_modern_NET.CreateExpression();
111+
_compiled = expr.CompileSys();
112+
_compiledFast = expr.CompileFast();
113+
_compiledFast_DisableInterpreter = expr.CompileFast(flags: CompilerFlags.DisableInterpreter);
114+
}
115+
116+
[Benchmark(Baseline = true)]
117+
public bool InvokeCompiled()
118+
{
119+
return _compiled();
120+
}
121+
122+
[Benchmark]
123+
public bool InvokeCompiledFast()
124+
{
125+
return _compiledFast();
126+
}
127+
128+
[Benchmark]
129+
public bool InvokeCompiledFast_DisableInterpreter()
130+
{
131+
return _compiledFast_DisableInterpreter();
132+
}
133+
134+
// [Benchmark]
135+
public bool JustFunc()
136+
{
137+
return _justFunc();
138+
}
139+
}
140+
141+
/*
142+
## Baseline. Does not look good. There is actually a regression I need to find and fix.
143+
144+
| Method | Job | Runtime | Mean | Error | StdDev | Ratio | RatioSD | Rank | Gen0 | Gen1 | Allocated | Alloc Ratio |
145+
|------------- |--------- |--------- |---------:|---------:|---------:|------:|--------:|-----:|-------:|-------:|----------:|------------:|
146+
| Compiled | .NET 8.0 | .NET 8.0 | 23.51 us | 0.468 us | 0.715 us | 1.00 | 0.04 | 2 | 0.6714 | 0.6409 | 4.13 KB | 1.00 |
147+
| CompiledFast | .NET 8.0 | .NET 8.0 | 17.63 us | 0.156 us | 0.146 us | 0.75 | 0.02 | 1 | 0.1831 | 0.1526 | 1.16 KB | 0.28 |
148+
| | | | | | | | | | | | | |
149+
| Compiled | .NET 9.0 | .NET 9.0 | 21.27 us | 0.114 us | 0.106 us | 1.00 | 0.01 | 2 | 0.6714 | 0.6409 | 4.13 KB | 1.00 |
150+
| CompiledFast | .NET 9.0 | .NET 9.0 | 16.82 us | 0.199 us | 0.186 us | 0.79 | 0.01 | 1 | 0.1831 | 0.1526 | 1.16 KB | 0.28 |
151+
152+
153+
## After reverting the regression
154+
155+
| Method | Job | Runtime | Mean | Error | StdDev | Ratio | RatioSD | Rank | Gen0 | Gen1 | Allocated | Alloc Ratio |
156+
|-------------------------- |--------- |--------- |----------:|----------:|----------:|------:|--------:|-----:|-------:|-------:|----------:|------------:|
157+
| Compiled | .NET 8.0 | .NET 8.0 | 25.093 us | 0.4979 us | 1.1034 us | 1.00 | 0.06 | 2 | 0.6714 | 0.6104 | 4.13 KB | 1.00 |
158+
| CompiledFast | .NET 8.0 | .NET 8.0 | 3.433 us | 0.0680 us | 0.0603 us | 0.14 | 0.01 | 1 | 0.1678 | 0.1526 | 1.12 KB | 0.27 |
159+
| CompiledFast_WithEvalFlag | .NET 8.0 | .NET 8.0 | 3.419 us | 0.0675 us | 0.1409 us | 0.14 | 0.01 | 1 | 0.2365 | 0.2289 | 1.48 KB | 0.36 |
160+
| | | | | | | | | | | | | |
161+
| Compiled | .NET 9.0 | .NET 9.0 | 25.491 us | 0.4667 us | 0.4137 us | 1.00 | 0.02 | 2 | 0.6714 | 0.6104 | 4.13 KB | 1.00 |
162+
| CompiledFast | .NET 9.0 | .NET 9.0 | 3.337 us | 0.0634 us | 0.0593 us | 0.13 | 0.00 | 1 | 0.1793 | 0.1755 | 1.12 KB | 0.27 |
163+
| CompiledFast_WithEvalFlag | .NET 9.0 | .NET 9.0 | 3.198 us | 0.0628 us | 0.0588 us | 0.13 | 0.00 | 1 | 0.2365 | 0.2289 | 1.48 KB | 0.36 |
164+
165+
166+
## Funny results after adding eval before compile
167+
168+
Job=.NET 8.0 Runtime=.NET 8.0
169+
170+
| Method | Mean | Error | StdDev | Median | Ratio | RatioSD | Rank | Gen0 | Gen1 | Allocated | Alloc Ratio |
171+
|-------------------------- |------------:|----------:|----------:|------------:|-------:|--------:|-----:|-------:|-------:|----------:|------------:|
172+
| Compiled | 22,507.0 ns | 435.99 ns | 652.57 ns | 22,519.1 ns | 131.40 | 8.03 | 3 | 0.6714 | 0.6409 | 4232 B | 11.02 |
173+
| CompiledFast | 3,051.9 ns | 59.71 ns | 55.86 ns | 3,036.6 ns | 17.82 | 1.01 | 2 | 0.1755 | 0.1678 | 1143 B | 2.98 |
174+
| CompiledFast_WithEvalFlag | 171.8 ns | 3.49 ns | 9.44 ns | 167.6 ns | 1.00 | 0.08 | 1 | 0.0610 | - | 384 B | 1.00 |
175+
176+
177+
## Now we're talking (after small interpreter optimizations)
178+
179+
DefaultJob : .NET 9.0.4 (9.0.425.16305), X64 RyuJIT AVX2
180+
181+
| Method | Mean | Error | StdDev | Median | Ratio | RatioSD | Rank | Gen0 | Gen1 | Allocated | Alloc Ratio |
182+
|-------------------------------- |-------------:|-----------:|-----------:|-------------:|-------:|--------:|-----:|-------:|-------:|----------:|------------:|
183+
| Compiled | 22,937.50 ns | 447.883 ns | 784.432 ns | 22,947.67 ns | 230.86 | 14.14 | 3 | 0.6714 | 0.6409 | 4232 B | 88.17 |
184+
| CompiledFast | 99.62 ns | 2.044 ns | 5.275 ns | 97.03 ns | 1.00 | 0.07 | 1 | 0.0076 | - | 48 B | 1.00 |
185+
| CompiledFast_DisableInterpreter | 3,010.37 ns | 60.174 ns | 91.893 ns | 3,010.03 ns | 30.30 | 1.80 | 2 | 0.1755 | 0.1678 | 1143 B | 23.81 |
186+
*/
187+
[MemoryDiagnoser, RankColumn]
188+
// [SimpleJob(RuntimeMoniker.Net90)]
189+
// [SimpleJob(RuntimeMoniker.Net80)]
190+
public class Issue468_Compile_vs_FastCompile
191+
{
192+
Expression<Func<bool>> _expr;
193+
194+
[GlobalSetup]
195+
public void Setup()
196+
{
197+
_expr = IssueTests.Issue468_Optimize_the_delegate_access_to_the_Closure_object_for_the_modern_NET.CreateExpression();
198+
}
199+
200+
[Benchmark]
201+
public object Compiled()
202+
{
203+
return _expr.Compile();
204+
}
205+
206+
[Benchmark(Baseline = true)]
207+
public object CompiledFast()
208+
{
209+
return _expr.CompileFast();
210+
}
211+
212+
[Benchmark]
213+
public object CompiledFast_DisableInterpreter()
214+
{
215+
return _expr.CompileFast(flags: CompilerFlags.DisableInterpreter);
216+
}
217+
}
218+
219+
[MemoryDiagnoser, RankColumn]
220+
// [SimpleJob(RuntimeMoniker.Net90)]
221+
// [SimpleJob(RuntimeMoniker.Net80)]
222+
public class Issue468_Eval_Optimization
223+
{
224+
Expression<Func<bool>> _expr;
225+
226+
[GlobalSetup]
227+
public void Setup()
228+
{
229+
_expr = IssueTests.Issue468_Optimize_the_delegate_access_to_the_Closure_object_for_the_modern_NET.CreateExpression();
230+
}
231+
232+
// [Benchmark(Baseline = true)]
233+
// public object Baseline()
234+
// {
235+
// return ExpressionCompiler.Interpreter.TryEvalPrimitive_OLD(out var result, _expr) ? result : null;
236+
// }
237+
238+
[Benchmark]
239+
public object Optimized()
240+
{
241+
return ExpressionCompiler.Interpreter.TryInterpretPrimitive(out var result, _expr) ? result : null;
242+
}
243+
}

test/FastExpressionCompiler.Benchmarks/Program.cs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,14 @@ public static void Main()
2020
// BenchmarkRunner.Run<ManuallyComposedLambdaBenchmark.Create>(); // not included in README.md, maybe it needs to be
2121
// BenchmarkRunner.Run<ManuallyComposedLambdaBenchmark.Create_and_Compile>(); // not included in README.md, maybe it needs to be
2222

23-
BenchmarkRunner.Run<LightExprVsExpr_Create_ComplexExpr>();
24-
BenchmarkRunner.Run<LightExprVsExpr_CreateAndCompile_ComplexExpr>();
23+
// BenchmarkRunner.Run<LightExprVsExpr_Create_ComplexExpr>();
24+
// BenchmarkRunner.Run<LightExprVsExpr_CreateAndCompile_ComplexExpr>();
2525

2626
//--------------------------------------------
2727

28+
// BenchmarkRunner.Run<Issue468_Compile_vs_FastCompile>();
29+
BenchmarkRunner.Run<Issue468_InvokeCompiled_vs_InvokeCompiledFast>();
30+
// BenchmarkRunner.Run<Issue468_Eval_Optimization>();
2831

2932
// BenchmarkRunner.Run<AccessByRef_vs_ByIGetRefStructImpl>();
3033

0 commit comments

Comments
 (0)