@@ -292,7 +292,7 @@ namespace lsp
292
292
293
293
: [dst] " +r" (dst), [src] " +r" (v), [count] " +r" (count),
294
294
[eff] " +r" (eff)
295
- : [XC] " r " (&EFF_HSLA_HUE_XC[ 0 ])
295
+ :
296
296
: " cc" , " memory" ,
297
297
" v0" , " v1" , " v2" , " v3" ,
298
298
" v4" , " v5" , " v6" , " v7" ,
@@ -301,6 +301,130 @@ namespace lsp
301
301
);
302
302
}
303
303
304
/*
 * Vector core shared by the 8x main loop and the 1..7 tail of eff_hsla_light().
 * Processes two groups of four input values at once.
 *
 * Inputs:
 *   v0  = first group of 4 input values
 *   v1  = second group of 4 input values
 *   v8  = h   (broadcast)
 *   v9  = s   (broadcast)
 *   v10 = l   (broadcast)
 *   v14 = T   (threshold, broadcast)
 *   v15 = KT  (1/T, broadcast)
 *
 * Outputs (ready for an interleaving st4 store):
 *   v0, v1, v2, v3 = h, s, L, A for the first group
 *   v4, v5, v6, v7 = h, s, L, A for the second group
 *
 * v8..v11, v14 and v15 are only read, so the macro can be expanded repeatedly.
 */
#define EFF_HSLA_LIGHT_CORE \
    /* v1 is reused as a result register below, so the second input */ \
    /* group is moved to v5 first.                                   */ \
    __ASM_EMIT("fabs            v5.4s, v1.4s")              /* v5  = V1 = abs(v1) */ \
    __ASM_EMIT("fabs            v1.4s, v0.4s")              /* v1  = V = abs(v0) */ \
    __ASM_EMIT("fsub            v3.4s, v14.4s, v1.4s")      /* v3  = T - V */ \
    __ASM_EMIT("fsub            v7.4s, v14.4s, v5.4s")      /* v7  = T - V1 */ \
    __ASM_EMIT("fcmgt           v2.4s, v3.4s, #0.0")        /* v2  = [(T-V) > 0] */ \
    __ASM_EMIT("fcmgt           v6.4s, v7.4s, #0.0")        /* v6  = [(T-V1) > 0] */ \
    __ASM_EMIT("fmul            v3.4s, v3.4s, v15.4s")      /* v3  = (T-V)*KT */ \
    __ASM_EMIT("fmul            v7.4s, v7.4s, v15.4s")      /* v7  = (T-V1)*KT */ \
    __ASM_EMIT("bit             v1.16b, v14.16b, v2.16b")   /* v1  = EL = V&[(T-V) <= 0] | T&[(T-V) > 0] */ \
    __ASM_EMIT("bit             v5.16b, v14.16b, v6.16b")   /* v5  = EL1, same selection for the second group */ \
    __ASM_EMIT("and             v3.16b, v3.16b, v2.16b")    /* v3  = A = ((T-V)*KT) & [(T-V) > 0] */ \
    __ASM_EMIT("and             v7.16b, v7.16b, v6.16b")    /* v7  = A1 */ \
    __ASM_EMIT("fmul            v2.4s, v1.4s, v10.4s")      /* v2  = L = EL*l */ \
    __ASM_EMIT("fmul            v6.4s, v5.4s, v10.4s")      /* v6  = L1 = EL1*l */ \
    __ASM_EMIT("mov             v0.16b, v8.16b")            /* v0  = h */ \
    __ASM_EMIT("mov             v1.16b, v9.16b")            /* v1  = s */ \
    __ASM_EMIT("mov             v4.16b, v8.16b")            /* v4  = h */ \
    __ASM_EMIT("mov             v5.16b, v9.16b")            /* v5  = s */
331
+
332
+ /*
333
+ kt = 1.0f / eff->thresh;
334
+ value = (value >= 0.0f) ? value : -value;
335
+
336
+ if ((eff->thresh - value) <= 0)
337
+ {
338
+ dst[2] = eff->l * value;
339
+ dst[3] = 0.0f;
340
+ }
341
+ else
342
+ {
343
+ dst[2] = eff->l * eff->thresh;
344
+ dst[3] = (eff->thresh - value) * kt;
345
+ }
346
+
347
+ dst[0] = eff->h;
348
+ dst[1] = eff->s;
349
+ */
350
+
351
+ void eff_hsla_light (float *dst, const float *v, const dsp::hsla_light_eff_t *eff, size_t count)
352
+ {
353
+ ARCH_AARCH64_ASM
354
+ (
355
+ __ASM_EMIT (" ld4r {v8.4s, v9.4s, v10.4s, v11.4s}, [%[eff]]" ) /* v8 = h, v9 = s, v10 = l, v11 = a */
356
+ __ASM_EMIT (" add %[eff], %[eff], #0x10" )
357
+ __ASM_EMIT (" ld1r {v14.4s}, [%[eff]]" ) /* v14 = T */
358
+ __ASM_EMIT (" frecpe v0.4s, v14.4s" ) /* v0 = TD */
359
+ __ASM_EMIT (" frecps v1.4s, v0.4s, v14.4s" ) /* v1 = (2 - TD*T) */
360
+ __ASM_EMIT (" fmul v0.4s, v1.4s, v0.4s" ) /* v0 = t' = TD * (2 - TD*T) */
361
+ __ASM_EMIT (" frecps v1.4s, v0.4s, v14.4s" ) /* v1 = (2 - TD*t') */
362
+ __ASM_EMIT (" fmul v15.4s, v1.4s, v0.4s" ) /* v15 = KT = 1/t = t' * (2 - TD*t') */
363
+
364
+ // -----------------------------------------------------------------
365
+ // 8x blocks
366
+ __ASM_EMIT (" subs %[count], %[count], #8" )
367
+ __ASM_EMIT (" b.lo 2f" )
368
+ __ASM_EMIT (" 1:" )
369
+ __ASM_EMIT (" ldp q0, q1, [%[src]]" ) /* v8 = v[0], v9 = v[1] */
370
+ EFF_HSLA_LIGHT_CORE
371
+ __ASM_EMIT (" subs %[count], %[count], #8" )
372
+ __ASM_EMIT (" st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[dst]]" )
373
+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
374
+ __ASM_EMIT (" add %[src], %[src], 0x20" )
375
+ __ASM_EMIT (" st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[dst]]" )
376
+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
377
+ __ASM_EMIT (" b.hs 1b" )
378
+
379
+ // -----------------------------------------------------------------
380
+ // 1x-8x block
381
+ __ASM_EMIT (" 2:" )
382
+ __ASM_EMIT (" adds %[count], %[count], #8" )
383
+ __ASM_EMIT (" b.ls 14f" )
384
+ __ASM_EMIT (" tst %[count], #4" )
385
+ __ASM_EMIT (" b.eq 4f" )
386
+ __ASM_EMIT (" ldr q0, [%[src]]" )
387
+ __ASM_EMIT (" add %[src], %[src], 0x10" )
388
+ __ASM_EMIT (" 4:" )
389
+ __ASM_EMIT (" tst %[count], #2" )
390
+ __ASM_EMIT (" b.eq 6f" )
391
+ __ASM_EMIT (" ld1 {v1.2s}, [%[src]]" )
392
+ __ASM_EMIT (" add %[src], %[src], 0x08" )
393
+ __ASM_EMIT (" 6:" )
394
+ __ASM_EMIT (" tst %[count], #1" )
395
+ __ASM_EMIT (" b.eq 8f" )
396
+ __ASM_EMIT (" ld1 {v1.s}[2], [%[src]]" )
397
+ __ASM_EMIT (" 8:" )
398
+ EFF_HSLA_LIGHT_CORE
399
+ __ASM_EMIT (" tst %[count], #4" )
400
+ __ASM_EMIT (" b.eq 10f" )
401
+ __ASM_EMIT (" st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[dst]]" )
402
+ __ASM_EMIT (" add %[dst], %[dst], 0x40" )
403
+ __ASM_EMIT (" 10:" )
404
+ __ASM_EMIT (" tst %[count], #2" )
405
+ __ASM_EMIT (" b.eq 12f" )
406
+ __ASM_EMIT (" st4 {v4.2s, v5.2s, v6.2s, v7.2s}, [%[dst]]" )
407
+ __ASM_EMIT (" add %[dst], %[dst], 0x20" )
408
+ __ASM_EMIT (" 12:" )
409
+ __ASM_EMIT (" tst %[count], #1" )
410
+ __ASM_EMIT (" b.eq 14f" )
411
+ __ASM_EMIT (" st4 {v4.s, v5.s, v6.s, v7.s}[2], [%[dst]]" )
412
+ // End
413
+ __ASM_EMIT (" 14:" )
414
+
415
+ : [dst] " +r" (dst), [src] " +r" (v), [count] " +r" (count),
416
+ [eff] " +r" (eff)
417
+ :
418
+ : " cc" , " memory" ,
419
+ " v0" , " v1" , " v2" , " v3" ,
420
+ " v4" , " v5" , " v6" , " v7" ,
421
+ " v8" , " v9" , " v10" , " v11" ,
422
+ " v12" , " v13" , " v14" , " v15"
423
+ );
424
+ }
425
+
426
+ #undef EFF_HSLA_LIGHT_CORE
427
+
304
428
} /* namespace asimd */
305
429
} /* namespace lsp */
306
430
0 commit comments