Skip to content

Commit b1a0b01

Browse files
committed
Implemented eff_hsla_sat for AArch64 ASIMD
1 parent 4b70fb4 commit b1a0b01

File tree

5 files changed

+141
-19
lines changed

5 files changed

+141
-19
lines changed

CHANGELOG

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* Implemented axis_apply_log1 and axis_apply_log2 optimized for AArch64 ASIMD.
77
* Implemented fill_rgba and fill_hsla for AArch64 ASIMD.
88
* Implemented rgba_to_hsla, hsla_to_rgba, rgba_to_bgra32, rgba32_to_bgra32 for AArch64 ASIMD.
9-
* Implemented eff_hsla_hue for AArch64 ASIMD.
9+
* Implemented eff_hsla_hue, eff_hsla_sat for AArch64 ASIMD.
1010

1111
=== 1.0.6 ===
1212
* Updated build scripts.

include/private/dsp/arch/aarch64/asimd/graphics/effects.h

Lines changed: 135 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,16 @@ namespace lsp
3939
)
4040

4141
#define EFF_HSLA_HUE_CORE \
42-
/* q0 = v0 */ \
43-
/* q1 = v1 */ \
44-
/* q8 = h */ \
45-
/* q9 = s */ \
46-
/* q10 = l */ \
47-
/* q11 = a */ \
48-
/* q12 = T */ \
49-
/* q13 = KT */ \
50-
/* q14 = 0 */ \
51-
/* q15 = 1 */ \
42+
/* v0 = v[0] */ \
43+
/* v1 = v[1] */ \
44+
/* v8 = h */ \
45+
/* v9 = s */ \
46+
/* v10 = l */ \
47+
/* v11 = a */ \
48+
/* v12 = T */ \
49+
/* v13 = KT */ \
50+
/* v14 = 0 */ \
51+
/* v15 = 1 */ \
5252
__ASM_EMIT("mov v4.16b, v1.16b") /* v4 = v[1] */ \
5353
__ASM_EMIT("fsub v2.4s, v15.4s, v0.4s") /* v2 = 1 - v */ \
5454
__ASM_EMIT("fsub v6.4s, v15.4s, v4.4s") \
@@ -103,10 +103,10 @@ namespace lsp
103103
{
104104
ARCH_AARCH64_ASM
105105
(
106-
__ASM_EMIT("ld4r {v8.4s, v9.4s, v10.4s, v11.4s}, [%[eff]]") /* v8 = h, v9 = s, v10 = l, v11 = a */
106+
__ASM_EMIT("ld4r {v8.4s, v9.4s, v10.4s, v11.4s}, [%[eff]]") /* v8 = h, v9 = s, v10 = l, v11 = a */
107107
__ASM_EMIT("add %[eff], %[eff], #0x10")
108-
__ASM_EMIT("ldp q14, q15, [%[XC]]") /* v14 = 0.0, v15 = 1.0 */
109-
__ASM_EMIT("ld1r {v12.4s}, [%[eff]]") /* v12 = t */
108+
__ASM_EMIT("ldp q14, q15, [%[XC]]") /* v14 = 0.0, v15 = 1.0 */
109+
__ASM_EMIT("ld1r {v12.4s}, [%[eff]]") /* v12 = t */
110110
__ASM_EMIT("frecpe v0.4s, v12.4s") /* v0 = TD */
111111
__ASM_EMIT("frecps v1.4s, v0.4s, v12.4s") /* v1 = (2 - TD*T) */
112112
__ASM_EMIT("fmul v0.4s, v1.4s, v0.4s") /* v0 = t' = TD * (2 - TD*T) */
@@ -179,6 +179,128 @@ namespace lsp
179179

180180
#undef EFF_HSLA_HUE_CORE
181181

182+
/*
 * Core of the HSLA saturation effect: processes two vectors of 4 input values each.
 * On entry v0/v1 hold the input values and v8..v11, v14, v15 hold the broadcast
 * constants listed below. On exit the results are laid out for a 4-register
 * interleaving store:
 *   first vector:  v0 = h, v1 = S,  v2 = l, v3 = A
 *   second vector: v4 = h, v5 = S1, v6 = l, v7 = A1
 * where S = s * (T if (T - |v|) > 0 else |v|) and A = (T - |v|) * KT if (T - |v|) > 0 else 0.
 */
#define EFF_HSLA_SAT_CORE \
183+
/* v0 = v[0] */ \
184+
/* v1 = v[1] */ \
185+
/* v8 = h */ \
186+
/* v9 = s */ \
187+
/* v10 = l */ \
188+
/* v11 = a */ \
189+
/* v14 = T */ \
190+
/* v15 = KT */ \
191+
__ASM_EMIT("fabs v5.4s, v1.4s") /* v5 = V1 = abs(v[1]), taken first because v1 is overwritten next */ \
192+
__ASM_EMIT("fabs v1.4s, v0.4s") /* v1 = V = abs(v[0]) */ \
193+
__ASM_EMIT("fsub v3.4s, v14.4s, v1.4s") /* v3 = T - V */ \
194+
__ASM_EMIT("fsub v7.4s, v14.4s, v5.4s") /* v7 = T - V1 */ \
195+
__ASM_EMIT("fcmgt v2.4s, v3.4s, #0.0") /* v2 = [(T-V) > 0] */ \
196+
__ASM_EMIT("fcmgt v6.4s, v7.4s, #0.0") /* v6 = [(T-V1) > 0] */ \
197+
__ASM_EMIT("fmul v3.4s, v3.4s, v15.4s") /* v3 = (T-V)*KT */ \
198+
__ASM_EMIT("fmul v7.4s, v7.4s, v15.4s") /* v7 = (T-V1)*KT */ \
199+
__ASM_EMIT("bit v1.16b, v14.16b, v2.16b") /* v1 = ES = V&[(T-V) <= 0] | T&[(T-V) > 0] */ \
200+
__ASM_EMIT("bit v5.16b, v14.16b, v6.16b") /* v5 = ES1 = V1&[(T-V1) <= 0] | T&[(T-V1) > 0] */ \
201+
__ASM_EMIT("and v3.16b, v3.16b, v2.16b") /* v3 = A = ((T-V)*KT) & [(T-V) > 0] */ \
202+
__ASM_EMIT("and v7.16b, v7.16b, v6.16b") /* v7 = A1 = ((T-V1)*KT) & [(T-V1) > 0] */ \
203+
__ASM_EMIT("fmul v1.4s, v1.4s, v9.4s") /* v1 = ES*s = S */ \
204+
__ASM_EMIT("fmul v5.4s, v5.4s, v9.4s") /* v5 = ES1*s = S1 */ \
205+
__ASM_EMIT("mov v0.16b, v8.16b") /* v0 = h */ \
206+
__ASM_EMIT("mov v2.16b, v10.16b") /* v2 = l */ \
207+
__ASM_EMIT("mov v4.16b, v8.16b") /* v4 = h */ \
208+
__ASM_EMIT("mov v6.16b, v10.16b") /* v6 = l */
209+
210+
/*
211+
kt = 1.0f / eff->thresh;
212+
value = (value >= 0.0f) ? value : -value;
213+
214+
if ((eff->thresh - value) <= 0)
215+
{
216+
dst[1] = eff->s * value;
217+
dst[3] = 0.0f;
218+
}
219+
else
220+
{
221+
dst[1] = eff->s * eff->thresh;
222+
dst[3] = (eff->thresh - value) * kt;
223+
}
224+
225+
dst[0] = eff->h;
226+
dst[2] = eff->l;
227+
*/
228+
229+
/**
 * HSLA saturation effect, AArch64 ASIMD implementation.
 *
 * For each of the count input floats in v, four floats are written to dst
 * in the order (h, S, l, A), as produced by EFF_HSLA_SAT_CORE.
 *
 * @param dst destination buffer, receives 4 floats per input value
 * @param v source values, count floats
 * @param eff effect parameters; the asm reads four consecutive floats (h, s, l, a)
 *            at offset 0 and the threshold T at offset 0x10
 *            NOTE(review): this relies on the layout of dsp::hsla_sat_eff_t,
 *            which is not visible here - confirm against the struct declaration
 * @param count number of input values to process
 */
void eff_hsla_sat(float *dst, const float *v, const dsp::hsla_sat_eff_t *eff, size_t count)
230+
{
231+
ARCH_AARCH64_ASM
232+
(
233+
__ASM_EMIT("ld4r {v8.4s, v9.4s, v10.4s, v11.4s}, [%[eff]]") /* v8 = h, v9 = s, v10 = l, v11 = a (each broadcast to all lanes) */
234+
__ASM_EMIT("add %[eff], %[eff], #0x10") /* eff += 4 floats, now points at the threshold */
235+
__ASM_EMIT("ld1r {v14.4s}, [%[eff]]") /* v14 = T */
236+
__ASM_EMIT("frecpe v0.4s, v14.4s") /* v0 = TD = reciprocal estimate of T */
237+
__ASM_EMIT("frecps v1.4s, v0.4s, v14.4s") /* v1 = (2 - TD*T) */
238+
__ASM_EMIT("fmul v0.4s, v1.4s, v0.4s") /* v0 = t' = TD * (2 - TD*T) */
239+
__ASM_EMIT("frecps v1.4s, v0.4s, v14.4s") /* v1 = (2 - T*t') */
240+
__ASM_EMIT("fmul v15.4s, v1.4s, v0.4s") /* v15 = KT ~= 1/T = t' * (2 - T*t') */
241+
242+
//-----------------------------------------------------------------
243+
// 8x blocks
244+
__ASM_EMIT("subs %[count], %[count], #8")
245+
__ASM_EMIT("b.lo 2f") /* fewer than 8 values left: go to the tail */
246+
__ASM_EMIT("1:")
247+
__ASM_EMIT("ldp q0, q1, [%[src]]") /* v0 = v[0], v1 = v[1] */
248+
EFF_HSLA_SAT_CORE
249+
__ASM_EMIT("subs %[count], %[count], #8")
250+
__ASM_EMIT("st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[dst]]") /* interleaved store of (h, S, l, A) for values 0..3 */
251+
__ASM_EMIT("add %[dst], %[dst], 0x40")
252+
__ASM_EMIT("add %[src], %[src], 0x20")
253+
__ASM_EMIT("st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[dst]]") /* interleaved store of (h, S, l, A) for values 4..7 */
254+
__ASM_EMIT("add %[dst], %[dst], 0x40")
255+
__ASM_EMIT("b.hs 1b")
256+
257+
//-----------------------------------------------------------------
258+
// 1x-7x block (tail)
259+
__ASM_EMIT("2:")
260+
__ASM_EMIT("adds %[count], %[count], #8") /* restore the number of remaining values */
261+
__ASM_EMIT("b.ls 14f") /* nothing left */
262+
__ASM_EMIT("tst %[count], #4")
263+
__ASM_EMIT("b.eq 4f")
264+
__ASM_EMIT("ldr q0, [%[src]]") /* 4 values -> v0 */
265+
__ASM_EMIT("add %[src], %[src], 0x10")
266+
__ASM_EMIT("4:")
267+
__ASM_EMIT("tst %[count], #2")
268+
__ASM_EMIT("b.eq 6f")
269+
__ASM_EMIT("ld1 {v1.2s}, [%[src]]") /* 2 values -> lanes 0..1 of v1 */
270+
__ASM_EMIT("add %[src], %[src], 0x08")
271+
__ASM_EMIT("6:")
272+
__ASM_EMIT("tst %[count], #1")
273+
__ASM_EMIT("b.eq 8f")
274+
__ASM_EMIT("ld1 {v1.s}[2], [%[src]]") /* 1 value -> lane 2 of v1 */
275+
__ASM_EMIT("8:")
276+
EFF_HSLA_SAT_CORE
277+
__ASM_EMIT("tst %[count], #4")
278+
__ASM_EMIT("b.eq 10f")
279+
__ASM_EMIT("st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[dst]]") /* results for the 4 values loaded into v0 */
280+
__ASM_EMIT("add %[dst], %[dst], 0x40")
281+
__ASM_EMIT("10:")
282+
__ASM_EMIT("tst %[count], #2")
283+
__ASM_EMIT("b.eq 12f")
284+
__ASM_EMIT("st4 {v4.2s, v5.2s, v6.2s, v7.2s}, [%[dst]]") /* results for lanes 0..1 of the second vector */
285+
__ASM_EMIT("add %[dst], %[dst], 0x20")
286+
__ASM_EMIT("12:")
287+
__ASM_EMIT("tst %[count], #1")
288+
__ASM_EMIT("b.eq 14f")
289+
__ASM_EMIT("st4 {v4.s, v5.s, v6.s, v7.s}[2], [%[dst]]") /* result for lane 2 of the second vector */
290+
// End
291+
__ASM_EMIT("14:")
292+
293+
: [dst] "+r" (dst), [src] "+r" (v), [count] "+r" (count),
294+
[eff] "+r" (eff)
295+
: [XC] "r" (&EFF_HSLA_HUE_XC[0]) /* NOTE(review): %[XC] is not referenced anywhere in this asm body */
296+
: "cc", "memory",
297+
"v0", "v1", "v2", "v3",
298+
"v4", "v5", "v6", "v7",
299+
"v8", "v9", "v10", "v11",
300+
"v12", "v13", "v14", "v15" /* NOTE(review): v12 and v13 are not written by this asm body */
301+
);
302+
}
303+
182304
} /* namespace asimd */
183305
} /* namespace lsp */
184306

src/main/aarch64/asimd.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@
411411
EXPORT1(rgba_to_bgra32);
412412

413413
EXPORT1(eff_hsla_hue);
414-
// EXPORT1(eff_hsla_sat);
414+
EXPORT1(eff_hsla_sat);
415415
// EXPORT1(eff_hsla_light);
416416
// EXPORT1(eff_hsla_alpha);
417417
}

src/test/ptest/graphics/effects.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ namespace lsp
164164
namespace asimd
165165
{
166166
void eff_hsla_hue(float *dst, const float *v, const dsp::hsla_hue_eff_t *eff, size_t count);
167-
// void eff_hsla_sat(float *dst, const float *v, const dsp::hsla_sat_eff_t *eff, size_t count);
167+
void eff_hsla_sat(float *dst, const float *v, const dsp::hsla_sat_eff_t *eff, size_t count);
168168
// void eff_hsla_light(float *dst, const float *v, const dsp::hsla_light_eff_t *eff, size_t count);
169169
// void eff_hsla_alpha(float *dst, const float *v, const dsp::hsla_alpha_eff_t *eff, size_t count);
170170
}
@@ -249,7 +249,7 @@ template <class eff_t>
249249
IF_ARCH_X86(call("sse2::eff_hsla_sat", dst, src, count, &sat, sse2::eff_hsla_sat));
250250
IF_ARCH_X86_64(call("avx2::x64_eff_hsla_sat", dst, src, count, &sat, avx2::x64_eff_hsla_sat));
251251
IF_ARCH_ARM(call("neon_d32::eff_hsla_sat", dst, src, count, &sat, neon_d32::eff_hsla_sat));
252-
// IF_ARCH_AARCH64(call("asimd::eff_hsla_sat", dst, src, count, &sat, asimd::eff_hsla_sat));
252+
IF_ARCH_AARCH64(call("asimd::eff_hsla_sat", dst, src, count, &sat, asimd::eff_hsla_sat));
253253
PTEST_SEPARATOR;
254254

255255
call("static::eff_hsla_light", dst, src, count, &light, eff_hsla_light);

src/test/utest/graphics/effects.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ namespace lsp
6868
namespace asimd
6969
{
7070
void eff_hsla_hue(float *dst, const float *v, const dsp::hsla_hue_eff_t *eff, size_t count);
71-
// void eff_hsla_sat(float *dst, const float *v, const dsp::hsla_sat_eff_t *eff, size_t count);
71+
void eff_hsla_sat(float *dst, const float *v, const dsp::hsla_sat_eff_t *eff, size_t count);
7272
// void eff_hsla_light(float *dst, const float *v, const dsp::hsla_light_eff_t *eff, size_t count);
7373
// void eff_hsla_alpha(float *dst, const float *v, const dsp::hsla_alpha_eff_t *eff, size_t count);
7474
}
@@ -176,7 +176,7 @@ UTEST_BEGIN("dsp.graphics", effects)
176176
IF_ARCH_ARM(call("neon_d32::eff_hsla_alpha", 16, generic::eff_hsla_alpha, neon_d32::eff_hsla_alpha, &alpha));
177177

178178
IF_ARCH_AARCH64(call("asimd::eff_hsla_hue", 16, generic::eff_hsla_hue, asimd::eff_hsla_hue, &hue));
179-
// IF_ARCH_AARCH64(call("asimd::eff_hsla_sat", 16, generic::eff_hsla_sat, asimd::eff_hsla_sat, &sat));
179+
IF_ARCH_AARCH64(call("asimd::eff_hsla_sat", 16, generic::eff_hsla_sat, asimd::eff_hsla_sat, &sat));
180180
// IF_ARCH_AARCH64(call("asimd::eff_hsla_light", 16, generic::eff_hsla_light, asimd::eff_hsla_light, &light));
181181
// IF_ARCH_AARCH64(call("asimd::eff_hsla_alpha", 16, generic::eff_hsla_alpha, asimd::eff_hsla_alpha, &alpha));
182182
}

0 commit comments

Comments
 (0)