@@ -74,8 +74,9 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
7474 if ( Fma . IsSupported )
7575 {
7676 float * bufferStart = this . bufferPtr ;
77- float * bufferEnd = bufferStart + ( this . Length & ~ 1 ) ;
78- Vector256 < float > result256 = Vector256 < float > . Zero ;
77+ float * bufferEnd = bufferStart + ( this . Length & ~ 3 ) ;
78+ Vector256 < float > result256_0 = Vector256 < float > . Zero ;
79+ Vector256 < float > result256_1 = Vector256 < float > . Zero ;
7980 var mask = Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 ) ;
8081
8182 while ( bufferStart < bufferEnd )
@@ -87,19 +88,36 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
8788 //
8889 // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
8990 // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
90- // vfmadd231ps ymm0, ymm2, [r8] ; result256 = FMA(pixels, factors) + result256
91+ // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
9192 //
9293 // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
93- result256 = Fma . MultiplyAdd (
94+ // Additionally, we're also unrolling two computations per loop iteration to leverage the
95+ // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
96+ result256_0 = Fma . MultiplyAdd (
9497 Unsafe . As < Vector4 , Vector256 < float > > ( ref rowStartRef ) ,
9598 Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) bufferStart ) . AsSingle ( ) , mask ) ,
96- result256 ) ;
99+ result256_0 ) ;
97100
98- bufferStart += 2 ;
99- rowStartRef = ref Unsafe . Add ( ref rowStartRef , 2 ) ;
101+ result256_1 = Fma . MultiplyAdd (
102+ Unsafe . As < Vector4 , Vector256 < float > > ( ref Unsafe . Add ( ref rowStartRef , 2 ) ) ,
103+ Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) ( bufferStart + 2 ) ) . AsSingle ( ) , mask ) ,
104+ result256_1 ) ;
105+
106+ bufferStart += 4 ;
107+ rowStartRef = ref Unsafe . Add ( ref rowStartRef , 4 ) ;
108+ }
109+
110+ result256_0 = Avx . Add ( result256_0 , result256_1 ) ;
111+
112+ if ( ( this . Length & 3 ) >= 2 )
113+ {
114+ result256_0 = Fma . MultiplyAdd (
115+ Unsafe . As < Vector4 , Vector256 < float > > ( ref rowStartRef ) ,
116+ Avx2 . PermuteVar8x32 ( Vector256 . CreateScalarUnsafe ( * ( double * ) bufferStart ) . AsSingle ( ) , mask ) ,
117+ result256_0 ) ;
100118 }
101119
102- Vector128 < float > result128 = Sse . Add ( result256 . GetLower ( ) , result256 . GetUpper ( ) ) ;
120+ Vector128 < float > result128 = Sse . Add ( result256_0 . GetLower ( ) , result256_0 . GetUpper ( ) ) ;
103121
104122 if ( ( this . Length & 1 ) != 0 )
105123 {
0 commit comments