improvements

awxkee · awxkee · commit ce9a6a81467f · 2023-10-25T16:47:04.000+08:00
diff --git a/JxlCoder.podspec b/JxlCoder.podspec
@@ -1,6 +1,6 @@
 Pod::Spec.new do |s|
     s.name             = 'JxlCoder'
-    s.version          = '1.2.6'
+    s.version          = '1.2.7'
     s.summary          = 'JXL coder for iOS and MacOS'
     s.description      = 'Provides support for JXL files in iOS and MacOS'
     s.homepage         = 'https://github.com/awxkee/jxl-coder-swift'
diff --git a/Sources/jxlc/ScaleInterpolator.h b/Sources/jxlc/ScaleInterpolator.h
@@ -105,7 +105,7 @@ static inline float32x4_t LanczosWindow(const float32x4_t v, const float a) {
     uint32x4_t mask = vcltq_f32(vabsq_f32(v), fullLength);
     float32x4_t rv = vmulq_n_f32(v, M_PI);
     float32x4_t x = vmulq_f32(Sinc(rv), Sinc(vmulq_f32(v, invLength)));
-    x = vbslq_f32(mask, zeros, x);
+    x = vbslq_f32(mask, x, zeros);
     return x;
 }
 
diff --git a/Sources/jxlc/XScaler.mm b/Sources/jxlc/XScaler.mm
@@ -22,6 +22,7 @@
 typedef float (*KernelSample4Func)(float, float, float, float, float);
 #if __arm64__
 typedef float32x4_t (*KernelSample4NEONFunc)(const float32x4_t, const float32x4_t, const float32x4_t,const float32x4_t,const float32x4_t);
+typedef float32x4_t (*KernelWindowNEONFunc)(const float32x4_t, const float);
 #endif
 typedef float (*KernelWindow2Func)(float, const float);
 
@@ -31,6 +32,65 @@
     return result;
 }
 
+static void SetRowF16(int components, int inputWidth, float *rgb, const uint16_t *row, bool useNEONIfAvailable, float weight, int xi) {
+#if __arm64__
+    if (useNEONIfAvailable) {
+        auto row16 = reinterpret_cast<const float16_t*>(&row[clamp(xi, 0, inputWidth - 1)*components]);
+        if (components == 3) {
+            float16x4_t vc = { row16[0], row16[1], row16[2], 0.0f };
+            float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight);
+            rgb[0] += vgetq_lane_f32(x, 0);
+            rgb[1] += vgetq_lane_f32(x, 1);
+            rgb[2] += vgetq_lane_f32(x, 2);
+        } else if (components == 4) {
+            float16x4_t vc = vld1_f16(row16);
+            float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight);
+            float32x4_t m = vld1q_f32(rgb);
+            vst1q_f32(rgb, vaddq_f32(m, x));
+        }
+    }
+#endif
+
+    if (!useNEONIfAvailable) {
+        for (int c = 0; c < components; ++c) {
+            half clrf = castU16(row[clamp(xi, 0, inputWidth - 1)*components + c]);
+            float clr = (float)clrf * weight;
+            rgb[c] += clr;
+        }
+    }
+}
+
+#if __arm64__
+__attribute__((always_inline))
+inline void NeonSampleF16Row(const float32x4_t aHigh, const float32x4_t aLow,
+                             int components, float dyWeight, int inputHeight, int inputWidth,
+                             float kx1, float lanczosFA, KernelWindowNEONFunc neonSampler,
+                             float *rgb, const uint8_t *src8, int srcStride, float srcX,
+                             bool useNEONIfAvailable, float &weightSum, int yj) {
+    const float32x4_t vkx = vdupq_n_f32(kx1);
+    const float32x4_t vSrcX = vdupq_n_f32(srcX);
+    const float32x4_t xjLow = vaddq_f32(vkx, aLow);
+    const float32x4_t xjHigh = vaddq_f32(vkx, aHigh);
+    float32x4_t vdxLow = vsubq_f32(vSrcX, xjLow);
+    float32x4_t vdxHigh = vsubq_f32(vSrcX, xjHigh);
+    float32x4_t lowWeight = vmulq_n_f32(neonSampler(vdxLow, lanczosFA), dyWeight);
+    float32x4_t highWeight = vmulq_n_f32(neonSampler(vdxHigh, lanczosFA), dyWeight);
+
+    weightSum += vaddvq_f32(lowWeight);
+    highWeight = vsetq_lane_f32(0, highWeight, 2);
+    highWeight = vsetq_lane_f32(0, highWeight, 3);
+    weightSum += vaddvq_f32(highWeight);
+
+    auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
+    SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 0), vgetq_lane_f32(xjLow, 0));
+    SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 1), vgetq_lane_f32(xjLow, 1));
+    SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 2), vgetq_lane_f32(xjLow, 2));
+    SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 3), vgetq_lane_f32(xjLow, 3));
+    SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(highWeight, 0), vgetq_lane_f32(xjHigh, 0));
+    SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(highWeight, 1), vgetq_lane_f32(xjHigh, 1));
+}
+#endif
+
 static void scaleRowF16(int components, int dstStride, int inputHeight, int inputWidth, XSampler option, uint16_t *output, int outputWidth, const uint8_t *src8, int srcStride, bool useNEONIfAvailable, float xScale, int y, float yScale) {
     auto dst8 = reinterpret_cast<uint8_t*>(output) + y * dstStride;
     auto dst16 = reinterpret_cast<uint16_t*>(dst8);
@@ -208,65 +268,86 @@ static void scaleRowF16(int components, int dstStride, int inputHeight, int inpu
             }
         } else if (option == lanczos || option == hann) {
             KernelWindow2Func sampler;
+#if __arm64__
+            KernelWindowNEONFunc neonSampler;
+#endif
             switch (option) {
                 case hann:
                     sampler = HannWindow<float>;
+#if __arm64__
+                    neonSampler = HannWindow;
+#endif
                     break;
                 default:
                     sampler = LanczosWindow<float>;
+#if __arm64__
+                    neonSampler = LanczosWindow;
+#endif
             }
-
             float rgb[components];
             fill(rgb, rgb + components, 0.0f);
 
-            int a = 3;
+            constexpr int a = 3;
             constexpr float lanczosFA = float(3.0f);
 
             float kx1 = floor(srcX);
             float ky1 = floor(srcY);
 
             float weightSum(0.0f);
 
+#if __arm64__
+            const float32x4_t aLow = { -2, -1, 0, 1 };
+            const float32x4_t aHigh = { 2, 3, 0, 0 };
+            const float32x4_t vky = vdupq_n_f32(ky1);
+            const float32x4_t vSrcY = vdupq_n_f32(srcY);
+            const float32x4_t yjLow = vaddq_f32(vky, aLow);
+            const float32x4_t yjHigh = vaddq_f32(vky, aHigh);
+            float32x4_t vdyLow = vsubq_f32(vSrcY, yjLow);
+            float32x4_t vdyHigh = vsubq_f32(vSrcY, yjHigh);
+
+            float32x4_t wLow = neonSampler(vdyLow, lanczosFA);
+            float32x4_t wHigh = neonSampler(vdyLow, lanczosFA);
+
+            NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 0), inputHeight, inputWidth, kx1,
+                             lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                             vgetq_lane_f32(yjLow, 0));
+
+            NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 1), inputHeight, inputWidth, kx1,
+                             lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                             vgetq_lane_f32(yjLow, 1));
+
+            NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 2), inputHeight, inputWidth, kx1,
+                             lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                             vgetq_lane_f32(yjLow, 2));
+
+            NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 3), inputHeight, inputWidth, kx1,
+                             lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                             vgetq_lane_f32(yjLow, 3));
+
+            NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 0), inputHeight, inputWidth, kx1,
+                             lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                             vgetq_lane_f32(yjHigh, 0));
+
+            NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 1), inputHeight, inputWidth, kx1,
+                             lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                             vgetq_lane_f32(yjHigh, 1));
+#else
             for (int j = -a + 1; j <= a; j++) {
+                int yj = ky1 + j;
+                float dy = float(srcY) - (float(ky1) + (float)j);
+                float dyWeight = sampler(dy, lanczosFA);
                 for (int i = -a + 1; i <= a; i++) {
                     int xi = kx1 + i;
-                    int yj = ky1 + j;
                     float dx = float(srcX) - (float(kx1) + (float)i);
-                    float dy = float(srcY) - (float(ky1) + (float)j);
-                    float weight = sampler(dx, lanczosFA) * sampler(dy, lanczosFA);
+                    float weight = sampler(dx, lanczosFA) * dyWeight;
                     weightSum += weight;
 
                     auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
 
-#if __arm64__
-                    if (useNEONIfAvailable) {
-                        auto row16 = reinterpret_cast<const float16_t*>(&row[clamp(xi, 0, inputWidth - 1)*components]);
-                        if (components == 3) {
-                            float16x4_t vc = { row16[0],
-                                row16[1],
-                                row16[2], 0.0f };
-                            float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight);
-                            rgb[0] += vgetq_lane_f32(x, 0);
-                            rgb[1] += vgetq_lane_f32(x, 1);
-                            rgb[2] += vgetq_lane_f32(x, 2);
-                        } else if (components == 4) {
-                            float16x4_t vc = vld1_f16(row16);
-                            float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight);
-                            float32x4_t m = vld1q_f32(rgb);
-                            vst1q_f32(rgb, vaddq_f32(m, x));
-                        }
-                    }
-#endif
-
-                    if (!useNEONIfAvailable) {
-                        for (int c = 0; c < components; ++c) {
-                            half clrf = castU16(row[clamp(xi, 0, inputWidth - 1)*components + c]);
-                            float clr = (float)clrf * weight;
-                            rgb[c] += clr;
-                        }
-                    }
+                    SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, weight, xi);
                 }
             }
+#endif
             bool useNeonAccumulator = components == 4 || components == 3;
 #if __arm64__
             if (useNEONIfAvailable && useNeonAccumulator) {
@@ -350,7 +431,7 @@ void scaleImageFloat16(uint16_t* input,
                               option, output, outputWidth, src8, srcStride, useNEONIfAvailable,
                               xScale, yScale]() {
             for (int y = start; y < end; ++y) {
-                scaleRowF16(components, dstStride, inputHeight, inputWidth, option, 
+                scaleRowF16(components, dstStride, inputHeight, inputWidth, option,
                             output, outputWidth, src8, srcStride, useNEONIfAvailable, xScale, y, yScale);
             }
         });
@@ -467,21 +548,21 @@ void scaleImageU16(uint16_t* input,
                 fill(rgb, rgb + components, 0.0f);
 
                 constexpr float lanczosFA = float(3.0f);
-
-                int a = 3;
+                constexpr int a = 3;
 
                 float kx1 = floor(srcX);
                 float ky1 = floor(srcY);
 
                 float weightSum(0.0f);
 
                 for (int j = -a + 1; j <= a; j++) {
+                    int yj = ky1 + j;
+                    float dy = float(srcY) - (float(ky1) + (float)j);
+                    float dyWeight = sampler(dy, (float)lanczosFA);
                     for (int i = -a + 1; i <= a; i++) {
                         int xi = kx1 + i;
-                        int yj = ky1 + j;
                         float dx = float(srcX) - (float(kx1) + (float)i);
-                        float dy = float(srcY) - (float(ky1) + (float)j);
-                        float weight = sampler(dx, (float)lanczosFA) * sampler(dy, (float)lanczosFA);
+                        float weight = sampler(dx, (float)lanczosFA) * dyWeight;
                         weightSum += weight;
 
                         auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
@@ -557,6 +638,37 @@ static void SetRowU8(int components, int inputWidth, float *rgb, const uint8_t *
     }
 }
 
+#if __arm64
+__attribute__((always_inline))
+inline void NeonSampleU8Row(const float32x4_t aHigh, const float32x4_t aLow,
+                            int components, float dyWeight, int inputHeight, int inputWidth,
+                            float kx1, float lanczosFA, KernelWindowNEONFunc neonSampler,
+                            float *rgb, const uint8_t *src8, int srcStride, float srcX,
+                            bool useNEONIfAvailable, float &weightSum, int yj) {
+    const float32x4_t vkx = vdupq_n_f32(kx1);
+    const float32x4_t vSrcX = vdupq_n_f32(srcX);
+    const float32x4_t xjLow = vaddq_f32(vkx, aLow);
+    const float32x4_t xjHigh = vaddq_f32(vkx, aHigh);
+    float32x4_t vdxLow = vsubq_f32(vSrcX, xjLow);
+    float32x4_t vdxHigh = vsubq_f32(vSrcX, xjHigh);
+    float32x4_t lowWeight = vmulq_n_f32(neonSampler(vdxLow, lanczosFA), dyWeight);
+    float32x4_t highWeight = vmulq_n_f32(neonSampler(vdxHigh, lanczosFA), dyWeight);
+
+    weightSum += vaddvq_f32(lowWeight);
+    highWeight = vsetq_lane_f32(0, highWeight, 2);
+    highWeight = vsetq_lane_f32(0, highWeight, 3);
+    weightSum += vaddvq_f32(highWeight);
+
+    auto row = reinterpret_cast<const uint8_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
+    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 0), vgetq_lane_f32(xjLow, 0));
+    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 1), vgetq_lane_f32(xjLow, 1));
+    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 2), vgetq_lane_f32(xjLow, 2));
+    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 3), vgetq_lane_f32(xjLow, 3));
+    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(highWeight, 0), vgetq_lane_f32(xjHigh, 0));
+    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(highWeight, 1), vgetq_lane_f32(xjHigh, 1));
+}
+#endif
+
 static void scaleRowU8(int components, int dstStride, int inputHeight, int inputWidth, float maxColors, XSampler option, uint8_t *output, int outputWidth, const uint8_t *src8, int srcStride, bool useNEONIfAvailable, float xScale, size_t y, float yScale) {
     auto dst8 = reinterpret_cast<uint8_t*>(output + y * dstStride);
     auto dst = reinterpret_cast<uint8_t*>(dst8);
@@ -738,42 +850,89 @@ static void scaleRowU8(int components, int dstStride, int inputHeight, int input
             }
         } else if (option == lanczos || option == hann) {
             KernelWindow2Func sampler;
+#if __arm64__
+            KernelWindowNEONFunc neonSampler;
+#endif
             switch (option) {
                 case hann:
                     sampler = HannWindow<float>;
+#if __arm64__
+                    neonSampler = HannWindow;
+#endif
                     break;
                 default:
                     sampler = LanczosWindow<float>;
+#if __arm64__
+                    neonSampler = LanczosWindow;
+#endif
             }
             float rgb[4];
             fill(rgb, rgb + 4, 0.0f);
 
             constexpr float lanczosFA = float(3.0f);
 
-            int a = 3;
+            constexpr int a = 3;
 
             float kx1 = floor(srcX);
             float ky1 = floor(srcY);
 
             float weightSum(0.0f);
 
+            const bool useNeonAccumulator = components == 4 || components == 3;
+
+#if __arm64__
+            const float32x4_t aLow = { -2, -1, 0, 1 };
+            const float32x4_t aHigh = { 2, 3, 0, 0 };
+            const float32x4_t vky = vdupq_n_f32(ky1);
+            const float32x4_t vSrcY = vdupq_n_f32(srcY);
+            const float32x4_t yjLow = vaddq_f32(vky, aLow);
+            const float32x4_t yjHigh = vaddq_f32(vky, aHigh);
+            float32x4_t vdyLow = vsubq_f32(vSrcY, yjLow);
+            float32x4_t vdyHigh = vsubq_f32(vSrcY, yjHigh);
+
+            float32x4_t wLow = neonSampler(vdyLow, lanczosFA);
+            float32x4_t wHigh = neonSampler(vdyLow, lanczosFA);
+
+            NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 0), inputHeight, inputWidth, kx1,
+                            lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                            vgetq_lane_f32(yjLow, 0));
+
+            NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 1), inputHeight, inputWidth, kx1,
+                            lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                            vgetq_lane_f32(yjLow, 1));
+
+            NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 2), inputHeight, inputWidth, kx1,
+                            lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                            vgetq_lane_f32(yjLow, 2));
+
+            NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 3), inputHeight, inputWidth, kx1,
+                            lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                            vgetq_lane_f32(yjLow, 3));
+
+            NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 0), inputHeight, inputWidth, kx1,
+                            lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                            vgetq_lane_f32(yjHigh, 0));
+
+            NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 1), inputHeight, inputWidth, kx1,
+                            lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
+                            vgetq_lane_f32(yjHigh, 1));
+#else
             for (int j = -a + 1; j <= a; j++) {
+                int yj = ky1 + j;
+                float dy = float(srcY) - (float(ky1) + (float)j);
+                float dyWeight = sampler(dy, (float)lanczosFA);
                 for (int i = -a + 1; i <= a; i++) {
                     int xi = kx1 + i;
-                    int yj = ky1 + j;
                     float dx = float(srcX) - (float(kx1) + (float)i);
-                    float dy = float(srcY) - (float(ky1) + (float)j);
-                    float weight = sampler(dx, (float)lanczosFA) * sampler(dy, (float)lanczosFA);
+                    float weight = sampler(dx, (float)lanczosFA) * dyWeight;
                     weightSum += weight;
 
-                    bool useNeonAccumulator = components == 4 || components == 3;
-
                     auto row = reinterpret_cast<const uint8_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
                     SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, weight, xi);
                 }
             }
+#endif
 
-            bool useNeonAccumulator = components == 4 || components == 3;
 #if __arm64__
             if (useNEONIfAvailable && useNeonAccumulator) {
                 if (components == 4) {

Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ static inline float32x4_t LanczosWindow(const float32x4_t v, const float a) {`
`105`	`105`	`uint32x4_t mask = vcltq_f32(vabsq_f32(v), fullLength);`
`106`	`106`	`float32x4_t rv = vmulq_n_f32(v, M_PI);`
`107`	`107`	`float32x4_t x = vmulq_f32(Sinc(rv), Sinc(vmulq_f32(v, invLength)));`
`108`		`- x = vbslq_f32(mask, zeros, x);`
	`108`	`+ x = vbslq_f32(mask, x, zeros);`
`109`	`109`	`return x;`
`110`	`110`	`}`
`111`	`111`