[loong64] Fix missing rounding in loong64 scaled_mult implementation

OriginCode · OriginCode · commit 554ea2cc5122 · 2025-03-24T23:50:08.000-07:00
The reference semantics of scaled_mult include rounding, but the original implementation did not round. This triggers an SkASSERT in the unit test case FilterResult_raster_RescaleWithTransform, from constrained_add's debug checks. The fixed implementation bumps the cost of each scaled_mult from 2 to 5 instructions (from 5 to 8 clock cycles on the LA464 and LA664 micro-architectures), due to the unavailability of rounding multiply operations in the current LoongArch spec. However, the computation now matches the reference scalar semantics, and proper testing of debug builds is now possible. Change-Id: I45e43a7a7e6d50b4c32c5e69a6d1d7de341eccf1 [Kaiyang Wu: backport to webkit2gtk] Link: https://skia-review.googlesource.com/c/skia/+/908136 Signed-off-by: Kaiyang Wu <origincode@aosc.io>
diff --git a/Source/ThirdParty/skia/src/opts/SkRasterPipeline_opts.h b/Source/ThirdParty/skia/src/opts/SkRasterPipeline_opts.h
@@ -5532,11 +5532,19 @@ SI I16 scaled_mult(I16 a, I16 b) {
 #elif defined(SKRP_CPU_NEON)
     return vqrdmulhq_s16(a, b);
 #elif defined(SKRP_CPU_LASX)
-    I16 res = __lasx_xvmuh_h(a, b);
-    return __lasx_xvslli_h(res, 1);
+    Vec<8, int32_t> even = (Vec<8, int32_t>)__lasx_xvmulwev_w_h((__m256i)a, (__m256i)b);
+    Vec<8, int32_t> odd = (Vec<8, int32_t>)__lasx_xvmulwod_w_h((__m256i)a, (__m256i)b);
+    Vec<8, int32_t> roundingTerm = (Vec<8, int32_t>)__lasx_xvldi(-0xec0);  // v8i32(0x40 << 8)
+    even = (even + roundingTerm) >> 15;
+    odd = (odd + roundingTerm) >> 15;
+    return (I16)__lasx_xvpackev_h((__m256i)odd, (__m256i)even);
 #elif defined(SKRP_CPU_LSX)
-    I16 res = __lsx_vmuh_h(a, b);
-    return __lsx_vslli_h(res, 1);
+    Vec<4, int32_t> even = (Vec<4, int32_t>)__lsx_vmulwev_w_h((__m128i)a, (__m128i)b);
+    Vec<4, int32_t> odd = (Vec<4, int32_t>)__lsx_vmulwod_w_h((__m128i)a, (__m128i)b);
+    Vec<4, int32_t> roundingTerm = (Vec<4, int32_t>)__lsx_vldi(-0xec0);  // v4i32(0x40 << 8)
+    even = (even + roundingTerm) >> 15;
+    odd = (odd + roundingTerm) >> 15;
+    return (I16)__lsx_vpackev_h((__m128i)odd, (__m128i)even);
 #else
     const I32 roundingTerm = I32_(1 << 14);
     return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15);