Merge pull request #2355 from SixLabors/bp/fixCalcModeScore

JimBobSquarePants · web-flow · commit 5de22f407135 · 2023-02-12T21:37:05.000+10:00
Fix mode score calculation for SSE2/AVX2 version
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -14,7 +14,7 @@ internal static class LossyUtils
 {
     // Note: method name in libwebp reference implementation is called VP8SSE16x16.
     [MethodImpl(InliningOptions.ShortMethod)]
-    public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)
+    public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b)
     {
         if (Avx2.IsSupported)
         {
@@ -31,7 +31,7 @@ public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)
 
     // Note: method name in libwebp reference implementation is called VP8SSE16x8.
     [MethodImpl(InliningOptions.ShortMethod)]
-    public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b)
+    public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)
     {
         if (Avx2.IsSupported)
         {
@@ -48,7 +48,7 @@ public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b)
 
     // Note: method name in libwebp reference implementation is called VP8SSE4x4.
     [MethodImpl(InliningOptions.ShortMethod)]
-    public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
+    public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b)
     {
         if (Avx2.IsSupported)
         {
@@ -77,8 +77,8 @@ public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
             Vector256<byte> b01s = Avx2.UnpackLow(b01.AsByte(), Vector256<byte>.Zero);
 
             // subtract, square and accumulate.
-            Vector256<byte> d0 = Avx2.SubtractSaturate(a01s, b01s);
-            Vector256<int> e0 = Avx2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16());
+            Vector256<short> d0 = Avx2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+            Vector256<int> e0 = Avx2.MultiplyAddAdjacent(d0, d0);
 
             return Numerics.ReduceSum(e0);
         }
@@ -110,10 +110,10 @@ public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
             Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);
 
             // subtract, square and accumulate.
-            Vector128<byte> d0 = Sse2.SubtractSaturate(a01s, b01s);
-            Vector128<byte> d1 = Sse2.SubtractSaturate(a23s, b23s);
-            Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16());
-            Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1.AsInt16(), d1.AsInt16());
+            Vector128<short> d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+            Vector128<short> d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
+            Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
+            Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
             Vector128<int> sum = Sse2.Add(e0, e1);
 
             return Numerics.ReduceSum(sum);
@@ -126,18 +126,16 @@ public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
     public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
     {
         int count = 0;
-        int aOffset = 0;
-        int bOffset = 0;
+        int offset = 0;
         for (int y = 0; y < h; y++)
         {
             for (int x = 0; x < w; x++)
             {
-                int diff = a[aOffset + x] - b[bOffset + x];
+                int diff = a[offset + x] - b[offset + x];
                 count += diff * diff;
             }
 
-            aOffset += WebpConstants.Bps;
-            bOffset += WebpConstants.Bps;
+            offset += WebpConstants.Bps;
         }
 
         return count;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -53,7 +53,7 @@ public static void PickBestIntra16(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Se
             rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode);
 
             // Measure RD-score.
-            rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst);
+            rdCur.D = LossyUtils.Vp8_Sse16x16(src, tmpDst);
             rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0;
             rdCur.H = WebpConstants.Vp8FixedCostsI16[mode];
             rdCur.R = it.GetCostLuma16(rdCur, proba, res);
@@ -145,7 +145,7 @@ public static bool PickBestIntra4(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Seg
                 rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode);
 
                 // Compute RD-score.
-                rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst);
+                rdTmp.D = LossyUtils.Vp8_Sse4x4(src, tmpDst);
                 rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0;
                 rdTmp.H = modeCosts[mode];
 
@@ -235,7 +235,7 @@ public static void PickBestUv(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Segment
             rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode);
 
             // Compute RD-score
-            rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst);
+            rdUv.D = LossyUtils.Vp8_Sse16x8(src, tmpDst);
             rdUv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
             rdUv.H = WebpConstants.Vp8FixedCostsUv[mode];
             rdUv.R = it.GetCostUv(rdUv, proba, res);
@@ -389,7 +389,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
             for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
             {
                 Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]);
-                long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
+                long score = (LossyUtils.Vp8_Sse16x16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
 
                 if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit)
                 {
@@ -436,7 +436,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
                 for (mode = 0; mode < WebpConstants.NumBModes; ++mode)
                 {
                     Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
-                    long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
+                    long score = (LossyUtils.Vp8_Sse4x4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
                     if (score < bestI4Score)
                     {
                         bestI4Mode = mode;
@@ -485,7 +485,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
             for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
             {
                 Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]);
-                long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
+                long score = (LossyUtils.Vp8_Sse16x8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
                 if (score < bestUvScore)
                 {
                     bestMode = mode;
diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ internal static class LossyUtils`
`14`	`14`	`{`
`15`	`15`	`// Note: method name in libwebp reference implementation is called VP8SSE16x16.`
`16`	`16`	`[MethodImpl(InliningOptions.ShortMethod)]`
`17`		`- public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)`
	`17`	`+ public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b)`
`18`	`18`	`{`
`19`	`19`	`if (Avx2.IsSupported)`
`20`	`20`	`{`
`@@ -31,7 +31,7 @@ public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)`
`31`	`31`
`32`	`32`	`// Note: method name in libwebp reference implementation is called VP8SSE16x8.`
`33`	`33`	`[MethodImpl(InliningOptions.ShortMethod)]`
`34`		`- public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b)`
	`34`	`+ public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)`
`35`	`35`	`{`
`36`	`36`	`if (Avx2.IsSupported)`
`37`	`37`	`{`
`@@ -48,7 +48,7 @@ public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b)`
`48`	`48`
`49`	`49`	`// Note: method name in libwebp reference implementation is called VP8SSE4x4.`
`50`	`50`	`[MethodImpl(InliningOptions.ShortMethod)]`
`51`		`- public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)`
	`51`	`+ public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b)`
`52`	`52`	`{`
`53`	`53`	`if (Avx2.IsSupported)`
`54`	`54`	`{`
`@@ -77,8 +77,8 @@ public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)`
`77`	`77`	`Vector256<byte> b01s = Avx2.UnpackLow(b01.AsByte(), Vector256<byte>.Zero);`
`78`	`78`
`79`	`79`	`// subtract, square and accumulate.`
`80`		`- Vector256<byte> d0 = Avx2.SubtractSaturate(a01s, b01s);`
`81`		`- Vector256<int> e0 = Avx2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16());`
	`80`	`+ Vector256<short> d0 = Avx2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());`
	`81`	`+ Vector256<int> e0 = Avx2.MultiplyAddAdjacent(d0, d0);`
`82`	`82`
`83`	`83`	`return Numerics.ReduceSum(e0);`
`84`	`84`	`}`
`@@ -110,10 +110,10 @@ public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)`
`110`	`110`	`Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);`
`111`	`111`
`112`	`112`	`// subtract, square and accumulate.`
`113`		`- Vector128<byte> d0 = Sse2.SubtractSaturate(a01s, b01s);`
`114`		`- Vector128<byte> d1 = Sse2.SubtractSaturate(a23s, b23s);`
`115`		`- Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16());`
`116`		`- Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1.AsInt16(), d1.AsInt16());`
	`113`	`+ Vector128<short> d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());`
	`114`	`+ Vector128<short> d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());`
	`115`	`+ Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);`
	`116`	`+ Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);`
`117`	`117`	`Vector128<int> sum = Sse2.Add(e0, e1);`
`118`	`118`
`119`	`119`	`return Numerics.ReduceSum(sum);`
`@@ -126,18 +126,16 @@ public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)`
`126`	`126`	`public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)`
`127`	`127`	`{`
`128`	`128`	`int count = 0;`
`129`		`- int aOffset = 0;`
`130`		`- int bOffset = 0;`
	`129`	`+ int offset = 0;`
`131`	`130`	`for (int y = 0; y < h; y++)`
`132`	`131`	`{`
`133`	`132`	`for (int x = 0; x < w; x++)`
`134`	`133`	`{`
`135`		`- int diff = a[aOffset + x] - b[bOffset + x];`
	`134`	`+ int diff = a[offset + x] - b[offset + x];`
`136`	`135`	`count += diff * diff;`
`137`	`136`	`}`
`138`	`137`
`139`		`- aOffset += WebpConstants.Bps;`
`140`		`- bOffset += WebpConstants.Bps;`
	`138`	`+ offset += WebpConstants.Bps;`
`141`	`139`	`}`
`142`	`140`
`143`	`141`	`return count;`