|
// Function-pointer types for the resampling kernels.

// Scalar kernel taking five float arguments and returning one float.
typedef float (*KernelSample4Func)(float, float, float, float, float);
#if __arm64__
// NEON kernel taking five four-lane vectors and returning a four-lane vector.
typedef float32x4_t (*KernelSample4NEONFunc)(const float32x4_t, const float32x4_t, const float32x4_t, const float32x4_t, const float32x4_t);
// NEON window function: called with a vector of four tap distances and the
// scalar window parameter; the returned lanes are used as tap weights.
typedef float32x4_t (*KernelWindowNEONFunc)(const float32x4_t distances, const float a);
#endif
// Scalar window function: called with one tap distance and the window
// parameter; the result is used as the tap weight.
typedef float (*KernelWindow2Func)(float distance, const float a);
27 | 28 |
|
|
31 | 32 | return result; |
32 | 33 | } |
33 | 34 |
|
| 35 | +static void SetRowF16(int components, int inputWidth, float *rgb, const uint16_t *row, bool useNEONIfAvailable, float weight, int xi) { |
| 36 | +#if __arm64__ |
| 37 | + if (useNEONIfAvailable) { |
| 38 | + auto row16 = reinterpret_cast<const float16_t*>(&row[clamp(xi, 0, inputWidth - 1)*components]); |
| 39 | + if (components == 3) { |
| 40 | + float16x4_t vc = { row16[0], row16[1], row16[2], 0.0f }; |
| 41 | + float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight); |
| 42 | + rgb[0] += vgetq_lane_f32(x, 0); |
| 43 | + rgb[1] += vgetq_lane_f32(x, 1); |
| 44 | + rgb[2] += vgetq_lane_f32(x, 2); |
| 45 | + } else if (components == 4) { |
| 46 | + float16x4_t vc = vld1_f16(row16); |
| 47 | + float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight); |
| 48 | + float32x4_t m = vld1q_f32(rgb); |
| 49 | + vst1q_f32(rgb, vaddq_f32(m, x)); |
| 50 | + } |
| 51 | + } |
| 52 | +#endif |
| 53 | + |
| 54 | + if (!useNEONIfAvailable) { |
| 55 | + for (int c = 0; c < components; ++c) { |
| 56 | + half clrf = castU16(row[clamp(xi, 0, inputWidth - 1)*components + c]); |
| 57 | + float clr = (float)clrf * weight; |
| 58 | + rgb[c] += clr; |
| 59 | + } |
| 60 | + } |
| 61 | +} |
| 62 | + |
#if __arm64__
/**
 * Accumulates the six horizontal filter taps of one half-float source row.
 *
 * Tap columns are `kx1 + aLow` (four lanes) and `kx1 + aHigh` (lanes 0 and 1
 * only). Each tap weight is `neonSampler(srcX - column, lanczosFA) * dyWeight`.
 * The weights are added to `weightSum` and the weighted pixels to `rgb` via
 * SetRowF16. The row index `yj` is clamped to [0, inputHeight - 1].
 */
__attribute__((always_inline))
inline void NeonSampleF16Row(const float32x4_t aHigh, const float32x4_t aLow,
                             int components, float dyWeight, int inputHeight, int inputWidth,
                             float kx1, float lanczosFA, KernelWindowNEONFunc neonSampler,
                             float *rgb, const uint8_t *src8, int srcStride, float srcX,
                             bool useNEONIfAvailable, float &weightSum, int yj) {
    const float32x4_t baseColumn = vdupq_n_f32(kx1);
    const float32x4_t samplePos = vdupq_n_f32(srcX);

    // Tap columns, kept as floats: four in the low vector, two in the high one.
    const float32x4_t columnsLow = vaddq_f32(baseColumn, aLow);
    const float32x4_t columnsHigh = vaddq_f32(baseColumn, aHigh);

    const float32x4_t distLow = vsubq_f32(samplePos, columnsLow);
    const float32x4_t distHigh = vsubq_f32(samplePos, columnsHigh);
    float32x4_t weightsLow = vmulq_n_f32(neonSampler(distLow, lanczosFA), dyWeight);
    float32x4_t weightsHigh = vmulq_n_f32(neonSampler(distHigh, lanczosFA), dyWeight);

    // Lanes 2 and 3 of the high vector are padding; zero them so they do not
    // leak into the weight sum.
    weightsHigh = vsetq_lane_f32(0, weightsHigh, 2);
    weightsHigh = vsetq_lane_f32(0, weightsHigh, 3);

    weightSum += vaddvq_f32(weightsLow);
    weightSum += vaddvq_f32(weightsHigh);

    // Spill the lanes so the taps can be walked with a plain loop.
    float tapWeights[8];
    float tapColumns[8];
    vst1q_f32(tapWeights, weightsLow);
    vst1q_f32(tapWeights + 4, weightsHigh);
    vst1q_f32(tapColumns, columnsLow);
    vst1q_f32(tapColumns + 4, columnsHigh);

    auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
    for (int tap = 0; tap < 6; ++tap) {
        SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable,
                  tapWeights[tap], static_cast<int>(tapColumns[tap]));
    }
}
#endif
| 93 | + |
34 | 94 | static void scaleRowF16(int components, int dstStride, int inputHeight, int inputWidth, XSampler option, uint16_t *output, int outputWidth, const uint8_t *src8, int srcStride, bool useNEONIfAvailable, float xScale, int y, float yScale) { |
35 | 95 | auto dst8 = reinterpret_cast<uint8_t*>(output) + y * dstStride; |
36 | 96 | auto dst16 = reinterpret_cast<uint16_t*>(dst8); |
@@ -208,65 +268,86 @@ static void scaleRowF16(int components, int dstStride, int inputHeight, int inpu |
208 | 268 | } |
209 | 269 | } else if (option == lanczos || option == hann) { |
210 | 270 | KernelWindow2Func sampler; |
| 271 | +#if __arm64__ |
| 272 | + KernelWindowNEONFunc neonSampler; |
| 273 | +#endif |
211 | 274 | switch (option) { |
212 | 275 | case hann: |
213 | 276 | sampler = HannWindow<float>; |
| 277 | +#if __arm64__ |
| 278 | + neonSampler = HannWindow; |
| 279 | +#endif |
214 | 280 | break; |
215 | 281 | default: |
216 | 282 | sampler = LanczosWindow<float>; |
| 283 | +#if __arm64__ |
| 284 | + neonSampler = LanczosWindow; |
| 285 | +#endif |
217 | 286 | } |
218 | | - |
219 | 287 | float rgb[components]; |
220 | 288 | fill(rgb, rgb + components, 0.0f); |
221 | 289 |
|
222 | | - int a = 3; |
| 290 | + constexpr int a = 3; |
223 | 291 | constexpr float lanczosFA = float(3.0f); |
224 | 292 |
|
225 | 293 | float kx1 = floor(srcX); |
226 | 294 | float ky1 = floor(srcY); |
227 | 295 |
|
228 | 296 | float weightSum(0.0f); |
229 | 297 |
|
| 298 | +#if __arm64__ |
| 299 | + const float32x4_t aLow = { -2, -1, 0, 1 }; |
| 300 | + const float32x4_t aHigh = { 2, 3, 0, 0 }; |
| 301 | + const float32x4_t vky = vdupq_n_f32(ky1); |
| 302 | + const float32x4_t vSrcY = vdupq_n_f32(srcY); |
| 303 | + const float32x4_t yjLow = vaddq_f32(vky, aLow); |
| 304 | + const float32x4_t yjHigh = vaddq_f32(vky, aHigh); |
| 305 | + float32x4_t vdyLow = vsubq_f32(vSrcY, yjLow); |
| 306 | + float32x4_t vdyHigh = vsubq_f32(vSrcY, yjHigh); |
| 307 | + |
| 308 | + float32x4_t wLow = neonSampler(vdyLow, lanczosFA); |
| 309 | + float32x4_t wHigh = neonSampler(vdyLow, lanczosFA); |
| 310 | + |
| 311 | + NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 0), inputHeight, inputWidth, kx1, |
| 312 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 313 | + vgetq_lane_f32(yjLow, 0)); |
| 314 | + |
| 315 | + NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 1), inputHeight, inputWidth, kx1, |
| 316 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 317 | + vgetq_lane_f32(yjLow, 1)); |
| 318 | + |
| 319 | + NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 2), inputHeight, inputWidth, kx1, |
| 320 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 321 | + vgetq_lane_f32(yjLow, 2)); |
| 322 | + |
| 323 | + NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 3), inputHeight, inputWidth, kx1, |
| 324 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 325 | + vgetq_lane_f32(yjLow, 3)); |
| 326 | + |
| 327 | + NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 0), inputHeight, inputWidth, kx1, |
| 328 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 329 | + vgetq_lane_f32(yjHigh, 0)); |
| 330 | + |
| 331 | + NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 1), inputHeight, inputWidth, kx1, |
| 332 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 333 | + vgetq_lane_f32(yjHigh, 1)); |
| 334 | +#else |
230 | 335 | for (int j = -a + 1; j <= a; j++) { |
| 336 | + int yj = ky1 + j; |
| 337 | + float dy = float(srcY) - (float(ky1) + (float)j); |
| 338 | + float dyWeight = sampler(dy, lanczosFA); |
231 | 339 | for (int i = -a + 1; i <= a; i++) { |
232 | 340 | int xi = kx1 + i; |
233 | | - int yj = ky1 + j; |
234 | 341 | float dx = float(srcX) - (float(kx1) + (float)i); |
235 | | - float dy = float(srcY) - (float(ky1) + (float)j); |
236 | | - float weight = sampler(dx, lanczosFA) * sampler(dy, lanczosFA); |
| 342 | + float weight = sampler(dx, lanczosFA) * dyWeight; |
237 | 343 | weightSum += weight; |
238 | 344 |
|
239 | 345 | auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride); |
240 | 346 |
|
241 | | -#if __arm64__ |
242 | | - if (useNEONIfAvailable) { |
243 | | - auto row16 = reinterpret_cast<const float16_t*>(&row[clamp(xi, 0, inputWidth - 1)*components]); |
244 | | - if (components == 3) { |
245 | | - float16x4_t vc = { row16[0], |
246 | | - row16[1], |
247 | | - row16[2], 0.0f }; |
248 | | - float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight); |
249 | | - rgb[0] += vgetq_lane_f32(x, 0); |
250 | | - rgb[1] += vgetq_lane_f32(x, 1); |
251 | | - rgb[2] += vgetq_lane_f32(x, 2); |
252 | | - } else if (components == 4) { |
253 | | - float16x4_t vc = vld1_f16(row16); |
254 | | - float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight); |
255 | | - float32x4_t m = vld1q_f32(rgb); |
256 | | - vst1q_f32(rgb, vaddq_f32(m, x)); |
257 | | - } |
258 | | - } |
259 | | -#endif |
260 | | - |
261 | | - if (!useNEONIfAvailable) { |
262 | | - for (int c = 0; c < components; ++c) { |
263 | | - half clrf = castU16(row[clamp(xi, 0, inputWidth - 1)*components + c]); |
264 | | - float clr = (float)clrf * weight; |
265 | | - rgb[c] += clr; |
266 | | - } |
267 | | - } |
| 347 | + SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, weight, xi); |
268 | 348 | } |
269 | 349 | } |
| 350 | +#endif |
270 | 351 | bool useNeonAccumulator = components == 4 || components == 3; |
271 | 352 | #if __arm64__ |
272 | 353 | if (useNEONIfAvailable && useNeonAccumulator) { |
@@ -350,7 +431,7 @@ void scaleImageFloat16(uint16_t* input, |
350 | 431 | option, output, outputWidth, src8, srcStride, useNEONIfAvailable, |
351 | 432 | xScale, yScale]() { |
352 | 433 | for (int y = start; y < end; ++y) { |
353 | | - scaleRowF16(components, dstStride, inputHeight, inputWidth, option, |
| 434 | + scaleRowF16(components, dstStride, inputHeight, inputWidth, option, |
354 | 435 | output, outputWidth, src8, srcStride, useNEONIfAvailable, xScale, y, yScale); |
355 | 436 | } |
356 | 437 | }); |
@@ -467,21 +548,21 @@ void scaleImageU16(uint16_t* input, |
467 | 548 | fill(rgb, rgb + components, 0.0f); |
468 | 549 |
|
469 | 550 | constexpr float lanczosFA = float(3.0f); |
470 | | - |
471 | | - int a = 3; |
| 551 | + constexpr int a = 3; |
472 | 552 |
|
473 | 553 | float kx1 = floor(srcX); |
474 | 554 | float ky1 = floor(srcY); |
475 | 555 |
|
476 | 556 | float weightSum(0.0f); |
477 | 557 |
|
478 | 558 | for (int j = -a + 1; j <= a; j++) { |
| 559 | + int yj = ky1 + j; |
| 560 | + float dy = float(srcY) - (float(ky1) + (float)j); |
| 561 | + float dyWeight = sampler(dy, (float)lanczosFA); |
479 | 562 | for (int i = -a + 1; i <= a; i++) { |
480 | 563 | int xi = kx1 + i; |
481 | | - int yj = ky1 + j; |
482 | 564 | float dx = float(srcX) - (float(kx1) + (float)i); |
483 | | - float dy = float(srcY) - (float(ky1) + (float)j); |
484 | | - float weight = sampler(dx, (float)lanczosFA) * sampler(dy, (float)lanczosFA); |
| 565 | + float weight = sampler(dx, (float)lanczosFA) * dyWeight; |
485 | 566 | weightSum += weight; |
486 | 567 |
|
487 | 568 | auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride); |
@@ -557,6 +638,37 @@ static void SetRowU8(int components, int inputWidth, float *rgb, const uint8_t * |
557 | 638 | } |
558 | 639 | } |
559 | 640 |
|
// Guarded by __arm64__, the same macro that guards the call sites and the
// KernelWindowNEONFunc typedef, so the definition and its uses are always
// compiled together.
#if __arm64__
/**
 * Accumulates the six horizontal filter taps of one 8-bit source row.
 *
 * Tap columns are `kx1 + aLow` (four lanes) and `kx1 + aHigh` (lanes 0 and 1
 * only). Each tap weight is `neonSampler(srcX - column, lanczosFA) * dyWeight`.
 * The weights are added to `weightSum` and the weighted pixels to `rgb` via
 * SetRowU8. The row index `yj` is clamped to [0, inputHeight - 1].
 */
__attribute__((always_inline))
inline void NeonSampleU8Row(const float32x4_t aHigh, const float32x4_t aLow,
                            int components, float dyWeight, int inputHeight, int inputWidth,
                            float kx1, float lanczosFA, KernelWindowNEONFunc neonSampler,
                            float *rgb, const uint8_t *src8, int srcStride, float srcX,
                            bool useNEONIfAvailable, float &weightSum, int yj) {
    const float32x4_t vkx = vdupq_n_f32(kx1);
    const float32x4_t vSrcX = vdupq_n_f32(srcX);
    const float32x4_t xjLow = vaddq_f32(vkx, aLow);
    const float32x4_t xjHigh = vaddq_f32(vkx, aHigh);
    float32x4_t vdxLow = vsubq_f32(vSrcX, xjLow);
    float32x4_t vdxHigh = vsubq_f32(vSrcX, xjHigh);
    float32x4_t lowWeight = vmulq_n_f32(neonSampler(vdxLow, lanczosFA), dyWeight);
    float32x4_t highWeight = vmulq_n_f32(neonSampler(vdxHigh, lanczosFA), dyWeight);

    weightSum += vaddvq_f32(lowWeight);
    // Lanes 2 and 3 of the high vector are padding, not real taps; zero them
    // before they are summed.
    highWeight = vsetq_lane_f32(0, highWeight, 2);
    highWeight = vsetq_lane_f32(0, highWeight, 3);
    weightSum += vaddvq_f32(highWeight);

    // src8 is already a byte pointer, so no cast is needed to address the row.
    const uint8_t *row = src8 + clamp(yj, 0, inputHeight - 1) * srcStride;
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 0), vgetq_lane_f32(xjLow, 0));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 1), vgetq_lane_f32(xjLow, 1));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 2), vgetq_lane_f32(xjLow, 2));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 3), vgetq_lane_f32(xjLow, 3));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(highWeight, 0), vgetq_lane_f32(xjHigh, 0));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(highWeight, 1), vgetq_lane_f32(xjHigh, 1));
}
#endif
| 671 | + |
560 | 672 | static void scaleRowU8(int components, int dstStride, int inputHeight, int inputWidth, float maxColors, XSampler option, uint8_t *output, int outputWidth, const uint8_t *src8, int srcStride, bool useNEONIfAvailable, float xScale, size_t y, float yScale) { |
561 | 673 | auto dst8 = reinterpret_cast<uint8_t*>(output + y * dstStride); |
562 | 674 | auto dst = reinterpret_cast<uint8_t*>(dst8); |
@@ -738,42 +850,89 @@ static void scaleRowU8(int components, int dstStride, int inputHeight, int input |
738 | 850 | } |
739 | 851 | } else if (option == lanczos || option == hann) { |
740 | 852 | KernelWindow2Func sampler; |
| 853 | +#if __arm64__ |
| 854 | + KernelWindowNEONFunc neonSampler; |
| 855 | +#endif |
741 | 856 | switch (option) { |
742 | 857 | case hann: |
743 | 858 | sampler = HannWindow<float>; |
| 859 | +#if __arm64__ |
| 860 | + neonSampler = HannWindow; |
| 861 | +#endif |
744 | 862 | break; |
745 | 863 | default: |
746 | 864 | sampler = LanczosWindow<float>; |
| 865 | +#if __arm64__ |
| 866 | + neonSampler = LanczosWindow; |
| 867 | +#endif |
747 | 868 | } |
748 | 869 | float rgb[4]; |
749 | 870 | fill(rgb, rgb + 4, 0.0f); |
750 | 871 |
|
751 | 872 | constexpr float lanczosFA = float(3.0f); |
752 | 873 |
|
753 | | - int a = 3; |
| 874 | + constexpr int a = 3; |
754 | 875 |
|
755 | 876 | float kx1 = floor(srcX); |
756 | 877 | float ky1 = floor(srcY); |
757 | 878 |
|
758 | 879 | float weightSum(0.0f); |
759 | 880 |
|
| 881 | + const bool useNeonAccumulator = components == 4 || components == 3; |
| 882 | + |
| 883 | +#if __arm64__ |
| 884 | + const float32x4_t aLow = { -2, -1, 0, 1 }; |
| 885 | + const float32x4_t aHigh = { 2, 3, 0, 0 }; |
| 886 | + const float32x4_t vky = vdupq_n_f32(ky1); |
| 887 | + const float32x4_t vSrcY = vdupq_n_f32(srcY); |
| 888 | + const float32x4_t yjLow = vaddq_f32(vky, aLow); |
| 889 | + const float32x4_t yjHigh = vaddq_f32(vky, aHigh); |
| 890 | + float32x4_t vdyLow = vsubq_f32(vSrcY, yjLow); |
| 891 | + float32x4_t vdyHigh = vsubq_f32(vSrcY, yjHigh); |
| 892 | + |
| 893 | + float32x4_t wLow = neonSampler(vdyLow, lanczosFA); |
| 894 | + float32x4_t wHigh = neonSampler(vdyLow, lanczosFA); |
| 895 | + |
| 896 | + NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 0), inputHeight, inputWidth, kx1, |
| 897 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 898 | + vgetq_lane_f32(yjLow, 0)); |
| 899 | + |
| 900 | + NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 1), inputHeight, inputWidth, kx1, |
| 901 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 902 | + vgetq_lane_f32(yjLow, 1)); |
| 903 | + |
| 904 | + NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 2), inputHeight, inputWidth, kx1, |
| 905 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 906 | + vgetq_lane_f32(yjLow, 2)); |
| 907 | + |
| 908 | + NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 3), inputHeight, inputWidth, kx1, |
| 909 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 910 | + vgetq_lane_f32(yjLow, 3)); |
| 911 | + |
| 912 | + NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 0), inputHeight, inputWidth, kx1, |
| 913 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 914 | + vgetq_lane_f32(yjHigh, 0)); |
| 915 | + |
| 916 | + NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 1), inputHeight, inputWidth, kx1, |
| 917 | + lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum, |
| 918 | + vgetq_lane_f32(yjHigh, 1)); |
| 919 | +#else |
760 | 920 | for (int j = -a + 1; j <= a; j++) { |
| 921 | + int yj = ky1 + j; |
| 922 | + float dy = float(srcY) - (float(ky1) + (float)j); |
| 923 | + float dyWeight = sampler(dy, (float)lanczosFA); |
761 | 924 | for (int i = -a + 1; i <= a; i++) { |
762 | 925 | int xi = kx1 + i; |
763 | | - int yj = ky1 + j; |
764 | 926 | float dx = float(srcX) - (float(kx1) + (float)i); |
765 | | - float dy = float(srcY) - (float(ky1) + (float)j); |
766 | | - float weight = sampler(dx, (float)lanczosFA) * sampler(dy, (float)lanczosFA); |
| 927 | + float weight = sampler(dx, (float)lanczosFA) * dyWeight; |
767 | 928 | weightSum += weight; |
768 | 929 |
|
769 | | - bool useNeonAccumulator = components == 4 || components == 3; |
770 | | - |
771 | 930 | auto row = reinterpret_cast<const uint8_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride); |
772 | 931 | SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, weight, xi); |
773 | 932 | } |
774 | 933 | } |
| 934 | +#endif |
775 | 935 |
|
776 | | - bool useNeonAccumulator = components == 4 || components == 3; |
777 | 936 | #if __arm64__ |
778 | 937 | if (useNEONIfAvailable && useNeonAccumulator) { |
779 | 938 | if (components == 4) { |
|
0 commit comments