@@ -39,16 +39,16 @@ namespace lsp
39
39
)
40
40
41
41
#define EFF_HSLA_HUE_CORE \
42
- /* q0 = v0 */ \
43
- /* q1 = v1 */ \
44
- /* q8 = h */ \
45
- /* q9 = s */ \
46
- /* q10 = l */ \
47
- /* q11 = a */ \
48
- /* q12 = T */ \
49
- /* q13 = KT */ \
50
- /* q14 = 0 */ \
51
- /* q15 = 1 */ \
42
+ /* v0 = v[0] */ \
43
+ /* v1 = v[1] */ \
44
+ /* v8 = h */ \
45
+ /* v9 = s */ \
46
+ /* v10 = l */ \
47
+ /* v11 = a */ \
48
+ /* v12 = T */ \
49
+ /* v13 = KT */ \
50
+ /* v14 = 0 */ \
51
+ /* v15 = 1 */ \
52
52
__ASM_EMIT (" mov v4.16b, v1.16b" ) /* v4 = v[1] */ \
53
53
__ASM_EMIT (" fsub v2.4s, v15.4s, v0.4s" ) /* v2 = 1 - v */ \
54
54
__ASM_EMIT (" fsub v6.4s, v15.4s, v4.4s" ) \
@@ -103,10 +103,10 @@ namespace lsp
103
103
{
104
104
ARCH_AARCH64_ASM
105
105
(
106
- __ASM_EMIT (" ld4r {v8.4s, v9.4s, v10.4s, v11.4s}, [%[eff]]" ) /* v8 = h, v9 = s, v10 = l, v11 = a */
106
+ __ASM_EMIT (" ld4r {v8.4s, v9.4s, v10.4s, v11.4s}, [%[eff]]" ) /* v8 = h, v9 = s, v10 = l, v11 = a */
107
107
__ASM_EMIT (" add %[eff], %[eff], #0x10" )
108
- __ASM_EMIT (" ldp q14, q15, [%[XC]]" ) /* v14 = 0.0, v15 = 1.0 */
109
- __ASM_EMIT (" ld1r {v12.4s}, [%[eff]]" ) /* v12 = t */
108
+ __ASM_EMIT (" ldp q14, q15, [%[XC]]" ) /* v14 = 0.0, v15 = 1.0 */
109
+ __ASM_EMIT (" ld1r {v12.4s}, [%[eff]]" ) /* v12 = t */
110
110
__ASM_EMIT (" frecpe v0.4s, v12.4s" ) /* v0 = TD */
111
111
__ASM_EMIT (" frecps v1.4s, v0.4s, v12.4s" ) /* v1 = (2 - TD*T) */
112
112
__ASM_EMIT (" fmul v0.4s, v1.4s, v0.4s" ) /* v0 = t' = TD * (2 - TD*T) */
@@ -179,6 +179,128 @@ namespace lsp
179
179
180
180
#undef EFF_HSLA_HUE_CORE
181
181
182
+ #define EFF_HSLA_SAT_CORE \
183
+ /* Saturation effect core for two groups of 4 values. In: v0 = v[0] (values 0..3) */ \
184
+ /* In: v1 = v[1] (values 4..7) */ \
185
+ /* In: v8 = h, same value in every lane (loaded with ld4r by the caller) */ \
186
+ /* In: v9 = s */ \
187
+ /* In: v10 = l */ \
188
+ /* In: v11 = a (not read by this macro, alpha is computed below) */ \
189
+ /* In: v14 = T (threshold) */ \
190
+ /* In: v15 = KT = 1/T */ \
191
+ __ASM_EMIT (" fabs v5.4s, v1.4s" ) /* v5 = V1 = abs(v1); must run before v1 is overwritten below */ \
192
+ __ASM_EMIT (" fabs v1.4s, v0.4s" ) /* v1 = V = abs(v0) */ \
193
+ __ASM_EMIT (" fsub v3.4s, v14.4s, v1.4s" ) /* v3 = T - V */ \
194
+ __ASM_EMIT (" fsub v7.4s, v14.4s, v5.4s" ) /* v7 = T - V1 */ \
195
+ __ASM_EMIT (" fcmgt v2.4s, v3.4s, #0.0" ) /* v2 = [(T-V) > 0] (all-ones lane mask) */ \
196
+ __ASM_EMIT (" fcmgt v6.4s, v7.4s, #0.0" ) /* v6 = [(T-V1) > 0] */ \
197
+ __ASM_EMIT (" fmul v3.4s, v3.4s, v15.4s" ) /* v3 = (T-V)*KT */ \
198
+ __ASM_EMIT (" fmul v7.4s, v7.4s, v15.4s" ) /* v7 = (T-V1)*KT */ \
199
+ __ASM_EMIT (" bit v1.16b, v14.16b, v2.16b" ) /* v1 = ES = V&[(T-V) <= 0] | T&[(T-V) > 0] */ \
200
+ __ASM_EMIT (" bit v5.16b, v14.16b, v6.16b" ) /* v5 = ES1 = V1&[(T-V1) <= 0] | T&[(T-V1) > 0] */ \
201
+ __ASM_EMIT (" and v3.16b, v3.16b, v2.16b" ) /* v3 = A = ((T-V)*KT) & [(T-V) > 0] */ \
202
+ __ASM_EMIT (" and v7.16b, v7.16b, v6.16b" ) /* v7 = A1 = ((T-V1)*KT) & [(T-V1) > 0] */ \
203
+ __ASM_EMIT (" fmul v1.4s, v1.4s, v9.4s" ) /* v1 = ES*s = S */ \
204
+ __ASM_EMIT (" fmul v5.4s, v5.4s, v9.4s" ) /* v5 = ES1*s = S1 */ \
205
+ __ASM_EMIT (" mov v0.16b, v8.16b" ) /* v0 = h */ \
206
+ __ASM_EMIT (" mov v2.16b, v10.16b" ) /* v2 = l */ \
207
+ __ASM_EMIT (" mov v4.16b, v8.16b" ) /* v4 = h */ \
208
+ __ASM_EMIT (" mov v6.16b, v10.16b" ) /* v6 = l; out: v0..v3 = {h, S, l, A} for v[0], v4..v7 = {h, S1, l, A1} for v[1] */
209
+
210
+ /*
211
+ kt = 1.0f / eff->thresh;
212
+ value = (value >= 0.0f) ? value : -value;
213
+
214
+ if ((eff->thresh - value) <= 0)
215
+ {
216
+ dst[1] = eff->s * value;
217
+ dst[3] = 0.0f;
218
+ }
219
+ else
220
+ {
221
+ dst[1] = eff->s * eff->thresh;
222
+ dst[3] = (eff->thresh - value) * kt;
223
+ }
224
+
225
+ dst[0] = eff->h;
226
+ dst[2] = eff->l;
227
+ */
228
+
229
+ void eff_hsla_sat (float *dst, const float *v, const dsp::hsla_sat_eff_t *eff, size_t count)
230
+ {
231
+ ARCH_AARCH64_ASM
232
+ (
233
+ __ASM_EMIT (" ld4r {v8.4s, v9.4s, v10.4s, v11.4s}, [%[eff]]" ) /* v8 = h, v9 = s, v10 = l, v11 = a */
234
+ __ASM_EMIT (" add %[eff], %[eff], #0x10" )
235
+ __ASM_EMIT (" ld1r {v14.4s}, [%[eff]]" ) /* v14 = T */
236
+ __ASM_EMIT (" frecpe v0.4s, v14.4s" ) /* v0 = TD */
237
+ __ASM_EMIT (" frecps v1.4s, v0.4s, v14.4s" ) /* v1 = (2 - TD*T) */
238
+ __ASM_EMIT (" fmul v0.4s, v1.4s, v0.4s" ) /* v0 = t' = TD * (2 - TD*T) */
239
+ __ASM_EMIT (" frecps v1.4s, v0.4s, v14.4s" ) /* v1 = (2 - TD*t') */
240
+ __ASM_EMIT (" fmul v15.4s, v1.4s, v0.4s" ) /* v15 = KT = 1/t = t' * (2 - TD*t') */
241
+
242
+ // -----------------------------------------------------------------
243
+ // 8x blocks
244
+ __ASM_EMIT (" subs %[count], %[count], #8" )
245
+ __ASM_EMIT (" b.lo 2f" )
246
+ __ASM_EMIT (" 1:" )
247
+ __ASM_EMIT (" ldp q0, q1, [%[src]]" ) /* v8 = v[0], v9 = v[1] */
248
+ EFF_HSLA_SAT_CORE
249
+ __ASM_EMIT (" subs %[count], %[count], #8" )
250
+ __ASM_EMIT (" st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[dst]]" )
251
+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
252
+ __ASM_EMIT (" add %[src], %[src], 0x20" )
253
+ __ASM_EMIT (" st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[dst]]" )
254
+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
255
+ __ASM_EMIT (" b.hs 1b" )
256
+
257
+ // -----------------------------------------------------------------
258
+ // 1x-8x block
259
+ __ASM_EMIT (" 2:" )
260
+ __ASM_EMIT (" adds %[count], %[count], #8" )
261
+ __ASM_EMIT (" b.ls 14f" )
262
+ __ASM_EMIT (" tst %[count], #4" )
263
+ __ASM_EMIT (" b.eq 4f" )
264
+ __ASM_EMIT (" ldr q0, [%[src]]" )
265
+ __ASM_EMIT (" add %[src], %[src], 0x10" )
266
+ __ASM_EMIT (" 4:" )
267
+ __ASM_EMIT (" tst %[count], #2" )
268
+ __ASM_EMIT (" b.eq 6f" )
269
+ __ASM_EMIT (" ld1 {v1.2s}, [%[src]]" )
270
+ __ASM_EMIT (" add %[src], %[src], 0x08" )
271
+ __ASM_EMIT (" 6:" )
272
+ __ASM_EMIT (" tst %[count], #1" )
273
+ __ASM_EMIT (" b.eq 8f" )
274
+ __ASM_EMIT (" ld1 {v1.s}[2], [%[src]]" )
275
+ __ASM_EMIT (" 8:" )
276
+ EFF_HSLA_SAT_CORE
277
+ __ASM_EMIT (" tst %[count], #4" )
278
+ __ASM_EMIT (" b.eq 10f" )
279
+ __ASM_EMIT (" st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[dst]]" )
280
+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
281
+ __ASM_EMIT (" 10:" )
282
+ __ASM_EMIT (" tst %[count], #2" )
283
+ __ASM_EMIT (" b.eq 12f" )
284
+ __ASM_EMIT (" st4 {v4.2s, v5.2s, v6.2s, v7.2s}, [%[dst]]" )
285
+ __ASM_EMIT (" add %[dst], %[dst], 0x20" )
286
+ __ASM_EMIT (" 12:" )
287
+ __ASM_EMIT (" tst %[count], #1" )
288
+ __ASM_EMIT (" b.eq 14f" )
289
+ __ASM_EMIT (" st4 {v4.s, v5.s, v6.s, v7.s}[2], [%[dst]]" )
290
+ // End
291
+ __ASM_EMIT (" 14:" )
292
+
293
+ : [dst] " +r" (dst), [src] " +r" (v), [count] " +r" (count),
294
+ [eff] " +r" (eff)
295
+ : [XC] " r" (&EFF_HSLA_HUE_XC[0 ])
296
+ : " cc" , " memory" ,
297
+ " v0" , " v1" , " v2" , " v3" ,
298
+ " v4" , " v5" , " v6" , " v7" ,
299
+ " v8" , " v9" , " v10" , " v11" ,
300
+ " v12" , " v13" , " v14" , " v15"
301
+ );
302
+ }
303
+
182
304
} /* namespace asimd */
183
305
} /* namespace lsp */
184
306
0 commit comments