Skip to content

Commit ce9a6a8

Browse files
committed
improvements
1 parent 1629010 commit ce9a6a8

File tree

3 files changed

+206
-47
lines changed

3 files changed

+206
-47
lines changed

JxlCoder.podspec

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Pod::Spec.new do |s|
22
s.name = 'JxlCoder'
3-
s.version = '1.2.6'
3+
s.version = '1.2.7'
44
s.summary = 'JXL coder for iOS and MacOS'
55
s.description = 'Provides support for JXL files in iOS and MacOS'
66
s.homepage = 'https://github.com/awxkee/jxl-coder-swift'

Sources/jxlc/ScaleInterpolator.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ static inline float32x4_t LanczosWindow(const float32x4_t v, const float a) {
105105
uint32x4_t mask = vcltq_f32(vabsq_f32(v), fullLength);
106106
float32x4_t rv = vmulq_n_f32(v, M_PI);
107107
float32x4_t x = vmulq_f32(Sinc(rv), Sinc(vmulq_f32(v, invLength)));
108-
x = vbslq_f32(mask, zeros, x);
108+
x = vbslq_f32(mask, x, zeros);
109109
return x;
110110
}
111111

Sources/jxlc/XScaler.mm

Lines changed: 204 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
typedef float (*KernelSample4Func)(float, float, float, float, float);
2323
#if __arm64__
2424
typedef float32x4_t (*KernelSample4NEONFunc)(const float32x4_t, const float32x4_t, const float32x4_t,const float32x4_t,const float32x4_t);
25+
typedef float32x4_t (*KernelWindowNEONFunc)(const float32x4_t, const float);
2526
#endif
2627
typedef float (*KernelWindow2Func)(float, const float);
2728

@@ -31,6 +32,65 @@
3132
return result;
3233
}
3334

35+
static void SetRowF16(int components, int inputWidth, float *rgb, const uint16_t *row, bool useNEONIfAvailable, float weight, int xi) {
36+
#if __arm64__
37+
if (useNEONIfAvailable) {
38+
auto row16 = reinterpret_cast<const float16_t*>(&row[clamp(xi, 0, inputWidth - 1)*components]);
39+
if (components == 3) {
40+
float16x4_t vc = { row16[0], row16[1], row16[2], 0.0f };
41+
float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight);
42+
rgb[0] += vgetq_lane_f32(x, 0);
43+
rgb[1] += vgetq_lane_f32(x, 1);
44+
rgb[2] += vgetq_lane_f32(x, 2);
45+
} else if (components == 4) {
46+
float16x4_t vc = vld1_f16(row16);
47+
float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight);
48+
float32x4_t m = vld1q_f32(rgb);
49+
vst1q_f32(rgb, vaddq_f32(m, x));
50+
}
51+
}
52+
#endif
53+
54+
if (!useNEONIfAvailable) {
55+
for (int c = 0; c < components; ++c) {
56+
half clrf = castU16(row[clamp(xi, 0, inputWidth - 1)*components + c]);
57+
float clr = (float)clrf * weight;
58+
rgb[c] += clr;
59+
}
60+
}
61+
}
62+
63+
#if __arm64__
/**
 * Accumulates the contribution of one half-float source row to the pixel being resampled.
 *
 * Six horizontal taps are evaluated: the four lanes of `aLow` and lanes 0-1 of
 * `aHigh` are column offsets relative to `kx1`. Lanes 2-3 of `aHigh` are ignored
 * (their weights are forced to zero before summation).
 *
 * @param aHigh, aLow         Column offsets added to `kx1` (see above).
 * @param components          Channels per pixel, forwarded to SetRowF16.
 * @param dyWeight            Vertical kernel weight of this row; scales every tap.
 * @param inputHeight         Source height; `yj` is clamped to [0, inputHeight - 1].
 * @param inputWidth          Source width, forwarded to SetRowF16 for column clamping.
 * @param kx1                 Base source column the offsets are added to.
 * @param lanczosFA           Second argument passed to `neonSampler`.
 * @param neonSampler         Vector window function evaluated at (srcX - column).
 * @param rgb                 Float accumulator updated in place.
 * @param src8, srcStride     Source image base pointer and row stride in bytes.
 * @param srcX                Horizontal sample position in source coordinates.
 * @param useNEONIfAvailable  Forwarded to SetRowF16.
 * @param weightSum           Running sum of all applied weights; updated in place.
 * @param yj                  Source row index of this row; may be out of range.
 */
__attribute__((always_inline))
inline void NeonSampleF16Row(const float32x4_t aHigh, const float32x4_t aLow,
                             int components, float dyWeight, int inputHeight, int inputWidth,
                             float kx1, float lanczosFA, KernelWindowNEONFunc neonSampler,
                             float *rgb, const uint8_t *src8, int srcStride, float srcX,
                             bool useNEONIfAvailable, float &weightSum, int yj) {
    const float32x4_t baseColumn = vdupq_n_f32(kx1);
    const float32x4_t samplePos = vdupq_n_f32(srcX);
    const float32x4_t columnsLow = vaddq_f32(baseColumn, aLow);
    const float32x4_t columnsHigh = vaddq_f32(baseColumn, aHigh);

    // Horizontal window weights, pre-multiplied by this row's vertical weight.
    const float32x4_t weightsLow =
        vmulq_n_f32(neonSampler(vsubq_f32(samplePos, columnsLow), lanczosFA), dyWeight);
    float32x4_t weightsHigh =
        vmulq_n_f32(neonSampler(vsubq_f32(samplePos, columnsHigh), lanczosFA), dyWeight);
    // Only two lanes of the high vector are real taps; zero the padding lanes
    // so they do not leak into the weight sum.
    weightsHigh = vsetq_lane_f32(0, weightsHigh, 2);
    weightsHigh = vsetq_lane_f32(0, weightsHigh, 3);

    weightSum += vaddvq_f32(weightsLow);
    weightSum += vaddvq_f32(weightsHigh);

    // Spill lanes to memory so the taps can be walked with an ordinary loop.
    float tapWeights[8];
    float tapColumns[8];
    vst1q_f32(tapWeights, weightsLow);
    vst1q_f32(tapWeights + 4, weightsHigh);
    vst1q_f32(tapColumns, columnsLow);
    vst1q_f32(tapColumns + 4, columnsHigh);

    auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
    for (int tap = 0; tap < 6; ++tap) {
        SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable,
                  tapWeights[tap], static_cast<int>(tapColumns[tap]));
    }
}
#endif
93+
3494
static void scaleRowF16(int components, int dstStride, int inputHeight, int inputWidth, XSampler option, uint16_t *output, int outputWidth, const uint8_t *src8, int srcStride, bool useNEONIfAvailable, float xScale, int y, float yScale) {
3595
auto dst8 = reinterpret_cast<uint8_t*>(output) + y * dstStride;
3696
auto dst16 = reinterpret_cast<uint16_t*>(dst8);
@@ -208,65 +268,86 @@ static void scaleRowF16(int components, int dstStride, int inputHeight, int inpu
208268
}
209269
} else if (option == lanczos || option == hann) {
210270
KernelWindow2Func sampler;
271+
#if __arm64__
272+
KernelWindowNEONFunc neonSampler;
273+
#endif
211274
switch (option) {
212275
case hann:
213276
sampler = HannWindow<float>;
277+
#if __arm64__
278+
neonSampler = HannWindow;
279+
#endif
214280
break;
215281
default:
216282
sampler = LanczosWindow<float>;
283+
#if __arm64__
284+
neonSampler = LanczosWindow;
285+
#endif
217286
}
218-
219287
float rgb[components];
220288
fill(rgb, rgb + components, 0.0f);
221289

222-
int a = 3;
290+
constexpr int a = 3;
223291
constexpr float lanczosFA = float(3.0f);
224292

225293
float kx1 = floor(srcX);
226294
float ky1 = floor(srcY);
227295

228296
float weightSum(0.0f);
229297

298+
#if __arm64__
299+
const float32x4_t aLow = { -2, -1, 0, 1 };
300+
const float32x4_t aHigh = { 2, 3, 0, 0 };
301+
const float32x4_t vky = vdupq_n_f32(ky1);
302+
const float32x4_t vSrcY = vdupq_n_f32(srcY);
303+
const float32x4_t yjLow = vaddq_f32(vky, aLow);
304+
const float32x4_t yjHigh = vaddq_f32(vky, aHigh);
305+
// Vertical distances from the sample position to each candidate source row.
float32x4_t vdyLow = vsubq_f32(vSrcY, yjLow);
float32x4_t vdyHigh = vsubq_f32(vSrcY, yjHigh);

// Vertical window weights. wHigh must be sampled from vdyHigh: its lanes 0-1 are
// paired with rows yjHigh[0..1] below, and sampling vdyLow would hand those rows
// the weights that belong to rows yjLow[0..1].
float32x4_t wLow = neonSampler(vdyLow, lanczosFA);
float32x4_t wHigh = neonSampler(vdyHigh, lanczosFA);
310+
311+
NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 0), inputHeight, inputWidth, kx1,
312+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
313+
vgetq_lane_f32(yjLow, 0));
314+
315+
NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 1), inputHeight, inputWidth, kx1,
316+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
317+
vgetq_lane_f32(yjLow, 1));
318+
319+
NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 2), inputHeight, inputWidth, kx1,
320+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
321+
vgetq_lane_f32(yjLow, 2));
322+
323+
NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 3), inputHeight, inputWidth, kx1,
324+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
325+
vgetq_lane_f32(yjLow, 3));
326+
327+
NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 0), inputHeight, inputWidth, kx1,
328+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
329+
vgetq_lane_f32(yjHigh, 0));
330+
331+
NeonSampleF16Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 1), inputHeight, inputWidth, kx1,
332+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
333+
vgetq_lane_f32(yjHigh, 1));
334+
#else
230335
for (int j = -a + 1; j <= a; j++) {
336+
int yj = ky1 + j;
337+
float dy = float(srcY) - (float(ky1) + (float)j);
338+
float dyWeight = sampler(dy, lanczosFA);
231339
for (int i = -a + 1; i <= a; i++) {
232340
int xi = kx1 + i;
233-
int yj = ky1 + j;
234341
float dx = float(srcX) - (float(kx1) + (float)i);
235-
float dy = float(srcY) - (float(ky1) + (float)j);
236-
float weight = sampler(dx, lanczosFA) * sampler(dy, lanczosFA);
342+
float weight = sampler(dx, lanczosFA) * dyWeight;
237343
weightSum += weight;
238344

239345
auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
240346

241-
#if __arm64__
242-
if (useNEONIfAvailable) {
243-
auto row16 = reinterpret_cast<const float16_t*>(&row[clamp(xi, 0, inputWidth - 1)*components]);
244-
if (components == 3) {
245-
float16x4_t vc = { row16[0],
246-
row16[1],
247-
row16[2], 0.0f };
248-
float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight);
249-
rgb[0] += vgetq_lane_f32(x, 0);
250-
rgb[1] += vgetq_lane_f32(x, 1);
251-
rgb[2] += vgetq_lane_f32(x, 2);
252-
} else if (components == 4) {
253-
float16x4_t vc = vld1_f16(row16);
254-
float32x4_t x = vmulq_n_f32(vcvt_f32_f16(vc), weight);
255-
float32x4_t m = vld1q_f32(rgb);
256-
vst1q_f32(rgb, vaddq_f32(m, x));
257-
}
258-
}
259-
#endif
260-
261-
if (!useNEONIfAvailable) {
262-
for (int c = 0; c < components; ++c) {
263-
half clrf = castU16(row[clamp(xi, 0, inputWidth - 1)*components + c]);
264-
float clr = (float)clrf * weight;
265-
rgb[c] += clr;
266-
}
267-
}
347+
SetRowF16(components, inputWidth, rgb, row, useNEONIfAvailable, weight, xi);
268348
}
269349
}
350+
#endif
270351
bool useNeonAccumulator = components == 4 || components == 3;
271352
#if __arm64__
272353
if (useNEONIfAvailable && useNeonAccumulator) {
@@ -350,7 +431,7 @@ void scaleImageFloat16(uint16_t* input,
350431
option, output, outputWidth, src8, srcStride, useNEONIfAvailable,
351432
xScale, yScale]() {
352433
for (int y = start; y < end; ++y) {
353-
scaleRowF16(components, dstStride, inputHeight, inputWidth, option,
434+
scaleRowF16(components, dstStride, inputHeight, inputWidth, option,
354435
output, outputWidth, src8, srcStride, useNEONIfAvailable, xScale, y, yScale);
355436
}
356437
});
@@ -467,21 +548,21 @@ void scaleImageU16(uint16_t* input,
467548
fill(rgb, rgb + components, 0.0f);
468549

469550
constexpr float lanczosFA = float(3.0f);
470-
471-
int a = 3;
551+
constexpr int a = 3;
472552

473553
float kx1 = floor(srcX);
474554
float ky1 = floor(srcY);
475555

476556
float weightSum(0.0f);
477557

478558
for (int j = -a + 1; j <= a; j++) {
559+
int yj = ky1 + j;
560+
float dy = float(srcY) - (float(ky1) + (float)j);
561+
float dyWeight = sampler(dy, (float)lanczosFA);
479562
for (int i = -a + 1; i <= a; i++) {
480563
int xi = kx1 + i;
481-
int yj = ky1 + j;
482564
float dx = float(srcX) - (float(kx1) + (float)i);
483-
float dy = float(srcY) - (float(ky1) + (float)j);
484-
float weight = sampler(dx, (float)lanczosFA) * sampler(dy, (float)lanczosFA);
565+
float weight = sampler(dx, (float)lanczosFA) * dyWeight;
485566
weightSum += weight;
486567

487568
auto row = reinterpret_cast<const uint16_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
@@ -557,6 +638,37 @@ static void SetRowU8(int components, int inputWidth, float *rgb, const uint8_t *
557638
}
558639
}
559640

641+
// Guard spelled `__arm64__` to match every other NEON guard in this file,
// including the ones around this function's call sites.
#if __arm64__
/**
 * Accumulates the contribution of one 8-bit source row to the pixel being resampled.
 *
 * Six horizontal taps are evaluated: the four lanes of `aLow` and lanes 0-1 of
 * `aHigh` are column offsets relative to `kx1`. Lanes 2-3 of `aHigh` are ignored
 * (their weights are forced to zero before summation).
 *
 * @param aHigh, aLow         Column offsets added to `kx1` (see above).
 * @param components          Channels per pixel, forwarded to SetRowU8.
 * @param dyWeight            Vertical kernel weight of this row; scales every tap.
 * @param inputHeight         Source height; `yj` is clamped to [0, inputHeight - 1].
 * @param inputWidth          Source width, forwarded to SetRowU8 for column clamping.
 * @param kx1                 Base source column the offsets are added to.
 * @param lanczosFA           Second argument passed to `neonSampler`.
 * @param neonSampler         Vector window function evaluated at (srcX - column).
 * @param rgb                 Float accumulator updated in place.
 * @param src8, srcStride     Source image base pointer and row stride in bytes.
 * @param srcX                Horizontal sample position in source coordinates.
 * @param useNEONIfAvailable  Forwarded to SetRowU8.
 * @param weightSum           Running sum of all applied weights; updated in place.
 * @param yj                  Source row index of this row; may be out of range.
 */
__attribute__((always_inline))
inline void NeonSampleU8Row(const float32x4_t aHigh, const float32x4_t aLow,
                            int components, float dyWeight, int inputHeight, int inputWidth,
                            float kx1, float lanczosFA, KernelWindowNEONFunc neonSampler,
                            float *rgb, const uint8_t *src8, int srcStride, float srcX,
                            bool useNEONIfAvailable, float &weightSum, int yj) {
    const float32x4_t vkx = vdupq_n_f32(kx1);
    const float32x4_t vSrcX = vdupq_n_f32(srcX);
    const float32x4_t xjLow = vaddq_f32(vkx, aLow);
    const float32x4_t xjHigh = vaddq_f32(vkx, aHigh);
    const float32x4_t vdxLow = vsubq_f32(vSrcX, xjLow);
    const float32x4_t vdxHigh = vsubq_f32(vSrcX, xjHigh);

    // Horizontal window weights, pre-multiplied by this row's vertical weight.
    const float32x4_t lowWeight = vmulq_n_f32(neonSampler(vdxLow, lanczosFA), dyWeight);
    float32x4_t highWeight = vmulq_n_f32(neonSampler(vdxHigh, lanczosFA), dyWeight);

    weightSum += vaddvq_f32(lowWeight);
    // Only two lanes of the high vector are real taps; zero the padding lanes
    // so they do not leak into the weight sum.
    highWeight = vsetq_lane_f32(0, highWeight, 2);
    highWeight = vsetq_lane_f32(0, highWeight, 3);
    weightSum += vaddvq_f32(highWeight);

    // src8 already has the row's element type, so no cast is needed.
    const uint8_t *row = src8 + clamp(yj, 0, inputHeight - 1) * srcStride;
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 0), vgetq_lane_f32(xjLow, 0));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 1), vgetq_lane_f32(xjLow, 1));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 2), vgetq_lane_f32(xjLow, 2));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(lowWeight, 3), vgetq_lane_f32(xjLow, 3));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(highWeight, 0), vgetq_lane_f32(xjHigh, 0));
    SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, vgetq_lane_f32(highWeight, 1), vgetq_lane_f32(xjHigh, 1));
}
#endif
671+
560672
static void scaleRowU8(int components, int dstStride, int inputHeight, int inputWidth, float maxColors, XSampler option, uint8_t *output, int outputWidth, const uint8_t *src8, int srcStride, bool useNEONIfAvailable, float xScale, size_t y, float yScale) {
561673
auto dst8 = reinterpret_cast<uint8_t*>(output + y * dstStride);
562674
auto dst = reinterpret_cast<uint8_t*>(dst8);
@@ -738,42 +850,89 @@ static void scaleRowU8(int components, int dstStride, int inputHeight, int input
738850
}
739851
} else if (option == lanczos || option == hann) {
740852
KernelWindow2Func sampler;
853+
#if __arm64__
854+
KernelWindowNEONFunc neonSampler;
855+
#endif
741856
switch (option) {
742857
case hann:
743858
sampler = HannWindow<float>;
859+
#if __arm64__
860+
neonSampler = HannWindow;
861+
#endif
744862
break;
745863
default:
746864
sampler = LanczosWindow<float>;
865+
#if __arm64__
866+
neonSampler = LanczosWindow;
867+
#endif
747868
}
748869
float rgb[4];
749870
fill(rgb, rgb + 4, 0.0f);
750871

751872
constexpr float lanczosFA = float(3.0f);
752873

753-
int a = 3;
874+
constexpr int a = 3;
754875

755876
float kx1 = floor(srcX);
756877
float ky1 = floor(srcY);
757878

758879
float weightSum(0.0f);
759880

881+
const bool useNeonAccumulator = components == 4 || components == 3;
882+
883+
#if __arm64__
884+
const float32x4_t aLow = { -2, -1, 0, 1 };
885+
const float32x4_t aHigh = { 2, 3, 0, 0 };
886+
const float32x4_t vky = vdupq_n_f32(ky1);
887+
const float32x4_t vSrcY = vdupq_n_f32(srcY);
888+
const float32x4_t yjLow = vaddq_f32(vky, aLow);
889+
const float32x4_t yjHigh = vaddq_f32(vky, aHigh);
890+
// Vertical distances from the sample position to each candidate source row.
float32x4_t vdyLow = vsubq_f32(vSrcY, yjLow);
float32x4_t vdyHigh = vsubq_f32(vSrcY, yjHigh);

// Vertical window weights. wHigh must be sampled from vdyHigh: its lanes 0-1 are
// paired with rows yjHigh[0..1] below, and sampling vdyLow would hand those rows
// the weights that belong to rows yjLow[0..1].
float32x4_t wLow = neonSampler(vdyLow, lanczosFA);
float32x4_t wHigh = neonSampler(vdyHigh, lanczosFA);
895+
896+
NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 0), inputHeight, inputWidth, kx1,
897+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
898+
vgetq_lane_f32(yjLow, 0));
899+
900+
NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 1), inputHeight, inputWidth, kx1,
901+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
902+
vgetq_lane_f32(yjLow, 1));
903+
904+
NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 2), inputHeight, inputWidth, kx1,
905+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
906+
vgetq_lane_f32(yjLow, 2));
907+
908+
NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wLow, 3), inputHeight, inputWidth, kx1,
909+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
910+
vgetq_lane_f32(yjLow, 3));
911+
912+
NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 0), inputHeight, inputWidth, kx1,
913+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
914+
vgetq_lane_f32(yjHigh, 0));
915+
916+
NeonSampleU8Row(aHigh, aLow, components, vgetq_lane_f32(wHigh, 1), inputHeight, inputWidth, kx1,
917+
lanczosFA, neonSampler, rgb, src8, srcStride, srcX, useNEONIfAvailable, weightSum,
918+
vgetq_lane_f32(yjHigh, 1));
919+
#else
760920
for (int j = -a + 1; j <= a; j++) {
921+
int yj = ky1 + j;
922+
float dy = float(srcY) - (float(ky1) + (float)j);
923+
float dyWeight = sampler(dy, (float)lanczosFA);
761924
for (int i = -a + 1; i <= a; i++) {
762925
int xi = kx1 + i;
763-
int yj = ky1 + j;
764926
float dx = float(srcX) - (float(kx1) + (float)i);
765-
float dy = float(srcY) - (float(ky1) + (float)j);
766-
float weight = sampler(dx, (float)lanczosFA) * sampler(dy, (float)lanczosFA);
927+
float weight = sampler(dx, (float)lanczosFA) * dyWeight;
767928
weightSum += weight;
768929

769-
bool useNeonAccumulator = components == 4 || components == 3;
770-
771930
auto row = reinterpret_cast<const uint8_t*>(src8 + clamp(yj, 0, inputHeight - 1) * srcStride);
772931
SetRowU8(components, inputWidth, rgb, row, useNEONIfAvailable, weight, xi);
773932
}
774933
}
934+
#endif
775935

776-
bool useNeonAccumulator = components == 4 || components == 3;
777936
#if __arm64__
778937
if (useNEONIfAvailable && useNeonAccumulator) {
779938
if (components == 4) {

0 commit comments

Comments
 (0)