
Commit 5256707

AVX-512 implementation of complex_rcp1 and complex_rcp2
1 parent f107893 commit 5256707

4 files changed: +185 -4 lines

include/private/dsp/arch/x86/avx512/complex.h

Lines changed: 163 additions & 0 deletions
@@ -689,6 +689,169 @@ namespace lsp

         #undef COMPLEX_DIV2_CORE

+        #define COMPLEX_RCP_CORE(DST, SRC, SEL) \
+            __ASM_EMIT ("xor %[off], %[off]") \
+            __ASM_EMIT ("vmovaps 0x00 + %[CC], %%zmm6") /* zmm6 = 1 */ \
+            /* x32 blocks */ \
+            __ASM_EMIT32("subl $32, %[count]") \
+            __ASM_EMIT64("sub $32, %[count]") \
+            __ASM_EMIT ("jb 2f") \
+            __ASM_EMIT ("1:") \
+            __ASM_EMIT ("vmovups 0x00(%[" SRC "_re], %[off]), %%zmm0") /* zmm0 = ar */ \
+            __ASM_EMIT ("vmovups 0x40(%[" SRC "_re], %[off]), %%zmm1") \
+            __ASM_EMIT ("vmovups 0x00(%[" SRC "_im], %[off]), %%zmm2") /* zmm2 = ai */ \
+            __ASM_EMIT ("vmovups 0x40(%[" SRC "_im], %[off]), %%zmm3") \
+            __ASM_EMIT ("vmulps %%zmm0, %%zmm0, %%zmm4") /* zmm4 = ar*ar */ \
+            __ASM_EMIT ("vmulps %%zmm1, %%zmm1, %%zmm5") \
+            __ASM_EMIT ("vfmadd231ps %%zmm2, %%zmm2, %%zmm4") /* zmm4 = R = ar*ar+ai*ai */ \
+            __ASM_EMIT ("vfmadd231ps %%zmm3, %%zmm3, %%zmm5") \
+            __ASM_EMIT ("vdivps %%zmm4, %%zmm6, %%zmm4") /* zmm4 = 1/R */ \
+            __ASM_EMIT ("vdivps %%zmm5, %%zmm6, %%zmm5") \
+            __ASM_EMIT ("vpxord 0x40 + %[CC], %%zmm2, %%zmm2") /* zmm2 = -ai */ \
+            __ASM_EMIT ("vpxord 0x40 + %[CC], %%zmm3, %%zmm3") \
+            __ASM_EMIT ("vmulps %%zmm4, %%zmm0, %%zmm0") /* zmm0 = ar/R */ \
+            __ASM_EMIT ("vmulps %%zmm5, %%zmm1, %%zmm1") \
+            __ASM_EMIT ("vmulps %%zmm4, %%zmm2, %%zmm2") /* zmm2 = -ai/R */ \
+            __ASM_EMIT ("vmulps %%zmm5, %%zmm3, %%zmm3") \
+            __ASM_EMIT ("vmovups %%zmm0, 0x00(%[" DST "_re], %[off])") \
+            __ASM_EMIT ("vmovups %%zmm1, 0x40(%[" DST "_re], %[off])") \
+            __ASM_EMIT ("vmovups %%zmm2, 0x00(%[" DST "_im], %[off])") \
+            __ASM_EMIT ("vmovups %%zmm3, 0x40(%[" DST "_im], %[off])") \
+            __ASM_EMIT ("add $0x80, %[off]") \
+            __ASM_EMIT32("subl $32, %[count]") \
+            __ASM_EMIT64("sub $32, %[count]") \
+            __ASM_EMIT ("jae 1b") \
+            __ASM_EMIT ("2:") \
+            /* x16 block */ \
+            __ASM_EMIT32("addl $16, %[count]") \
+            __ASM_EMIT64("add $16, %[count]") \
+            __ASM_EMIT ("jl 4f") \
+            __ASM_EMIT ("vmovups 0x00(%[" SRC "_re], %[off]), %%ymm0") /* ymm0 = ar */ \
+            __ASM_EMIT ("vmovups 0x20(%[" SRC "_re], %[off]), %%ymm1") \
+            __ASM_EMIT ("vmovups 0x00(%[" SRC "_im], %[off]), %%ymm2") /* ymm2 = ai */ \
+            __ASM_EMIT ("vmovups 0x20(%[" SRC "_im], %[off]), %%ymm3") \
+            __ASM_EMIT ("vmulps %%ymm0, %%ymm0, %%ymm4") /* ymm4 = ar*ar */ \
+            __ASM_EMIT ("vmulps %%ymm1, %%ymm1, %%ymm5") \
+            __ASM_EMIT ("vfmadd231ps %%ymm2, %%ymm2, %%ymm4") /* ymm4 = R = ar*ar+ai*ai */ \
+            __ASM_EMIT ("vfmadd231ps %%ymm3, %%ymm3, %%ymm5") \
+            __ASM_EMIT ("vdivps %%ymm4, %%ymm6, %%ymm4") /* ymm4 = 1/R */ \
+            __ASM_EMIT ("vdivps %%ymm5, %%ymm6, %%ymm5") \
+            __ASM_EMIT ("vpxord 0x40 + %[CC], %%ymm2, %%ymm2") /* ymm2 = -ai */ \
+            __ASM_EMIT ("vpxord 0x40 + %[CC], %%ymm3, %%ymm3") \
+            __ASM_EMIT ("vmulps %%ymm4, %%ymm0, %%ymm0") /* ymm0 = ar/R */ \
+            __ASM_EMIT ("vmulps %%ymm5, %%ymm1, %%ymm1") \
+            __ASM_EMIT ("vmulps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = -ai/R */ \
+            __ASM_EMIT ("vmulps %%ymm5, %%ymm3, %%ymm3") \
+            __ASM_EMIT ("vmovups %%ymm0, 0x00(%[" DST "_re], %[off])") \
+            __ASM_EMIT ("vmovups %%ymm1, 0x20(%[" DST "_re], %[off])") \
+            __ASM_EMIT ("vmovups %%ymm2, 0x00(%[" DST "_im], %[off])") \
+            __ASM_EMIT ("vmovups %%ymm3, 0x20(%[" DST "_im], %[off])") \
+            __ASM_EMIT ("add $0x40, %[off]") \
+            __ASM_EMIT32("subl $16, %[count]") \
+            __ASM_EMIT64("sub $16, %[count]") \
+            __ASM_EMIT ("jae 1b") \
+            __ASM_EMIT ("4:") \
+            /* x8 block */ \
+            __ASM_EMIT32("addl $8, %[count]") \
+            __ASM_EMIT64("add $8, %[count]") \
+            __ASM_EMIT ("jl 6f") \
+            __ASM_EMIT ("vmovups 0x00(%[" SRC "_re], %[off]), %%xmm0") /* xmm0 = ar */ \
+            __ASM_EMIT ("vmovups 0x10(%[" SRC "_re], %[off]), %%xmm1") \
+            __ASM_EMIT ("vmovups 0x00(%[" SRC "_im], %[off]), %%xmm2") /* xmm2 = ai */ \
+            __ASM_EMIT ("vmovups 0x10(%[" SRC "_im], %[off]), %%xmm3") \
+            __ASM_EMIT ("vmulps %%xmm0, %%xmm0, %%xmm4") /* xmm4 = ar*ar */ \
+            __ASM_EMIT ("vmulps %%xmm1, %%xmm1, %%xmm5") \
+            __ASM_EMIT ("vfmadd231ps %%xmm2, %%xmm2, %%xmm4") /* xmm4 = R = ar*ar+ai*ai */ \
+            __ASM_EMIT ("vfmadd231ps %%xmm3, %%xmm3, %%xmm5") \
+            __ASM_EMIT ("vdivps %%xmm4, %%xmm6, %%xmm4") /* xmm4 = 1/R */ \
+            __ASM_EMIT ("vdivps %%xmm5, %%xmm6, %%xmm5") \
+            __ASM_EMIT ("vpxord 0x40 + %[CC], %%xmm2, %%xmm2") /* xmm2 = -ai */ \
+            __ASM_EMIT ("vpxord 0x40 + %[CC], %%xmm3, %%xmm3") \
+            __ASM_EMIT ("vmulps %%xmm4, %%xmm0, %%xmm0") /* xmm0 = ar/R */ \
+            __ASM_EMIT ("vmulps %%xmm5, %%xmm1, %%xmm1") \
+            __ASM_EMIT ("vmulps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = -ai/R */ \
+            __ASM_EMIT ("vmulps %%xmm5, %%xmm3, %%xmm3") \
+            __ASM_EMIT ("vmovups %%xmm0, 0x00(%[" DST "_re], %[off])") \
+            __ASM_EMIT ("vmovups %%xmm1, 0x10(%[" DST "_re], %[off])") \
+            __ASM_EMIT ("vmovups %%xmm2, 0x00(%[" DST "_im], %[off])") \
+            __ASM_EMIT ("vmovups %%xmm3, 0x10(%[" DST "_im], %[off])") \
+            __ASM_EMIT32("subl $8, %[count]") \
+            __ASM_EMIT64("sub $8, %[count]") \
+            __ASM_EMIT ("add $0x20, %[off]") \
+            __ASM_EMIT ("6:") \
+            /* x4 block */ \
+            __ASM_EMIT32("addl $4, %[count]") \
+            __ASM_EMIT64("add $4, %[count]") \
+            __ASM_EMIT ("jl 8f") \
+            __ASM_EMIT ("vmovups 0x00(%[" SRC "_re], %[off]), %%xmm0") /* xmm0 = ar */ \
+            __ASM_EMIT ("vmovups 0x00(%[" SRC "_im], %[off]), %%xmm2") /* xmm2 = ai */ \
+            __ASM_EMIT ("vmulps %%xmm0, %%xmm0, %%xmm4") /* xmm4 = ar*ar */ \
+            __ASM_EMIT ("vfmadd231ps %%xmm2, %%xmm2, %%xmm4") /* xmm4 = R = ar*ar+ai*ai */ \
+            __ASM_EMIT ("vdivps %%xmm4, %%xmm6, %%xmm4") /* xmm4 = 1/R */ \
+            __ASM_EMIT ("vpxord 0x40 + %[CC], %%xmm2, %%xmm2") /* xmm2 = -ai */ \
+            __ASM_EMIT ("vmulps %%xmm4, %%xmm0, %%xmm0") /* xmm0 = ar/R */ \
+            __ASM_EMIT ("vmulps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = -ai/R */ \
+            __ASM_EMIT ("vmovups %%xmm0, 0x00(%[" DST "_re], %[off])") \
+            __ASM_EMIT ("vmovups %%xmm2, 0x00(%[" DST "_im], %[off])") \
+            __ASM_EMIT32("subl $4, %[count]") \
+            __ASM_EMIT64("sub $4, %[count]") \
+            __ASM_EMIT ("add $0x10, %[off]") \
+            __ASM_EMIT ("8:") \
+            /* x1 blocks */ \
+            __ASM_EMIT32("addl $3, %[count]") \
+            __ASM_EMIT64("add $3, %[count]") \
+            __ASM_EMIT ("jl 10f") \
+            __ASM_EMIT ("9:") \
+            __ASM_EMIT ("vmovss 0x00(%[" SRC "_re], %[off]), %%xmm0") /* xmm0 = ar */ \
+            __ASM_EMIT ("vmovss 0x00(%[" SRC "_im], %[off]), %%xmm2") /* xmm2 = ai */ \
+            __ASM_EMIT ("vmulss %%xmm0, %%xmm0, %%xmm4") /* xmm4 = ar*ar */ \
+            __ASM_EMIT ("vfmadd231ss %%xmm2, %%xmm2, %%xmm4") /* xmm4 = R = ar*ar+ai*ai */ \
+            __ASM_EMIT ("vdivss %%xmm4, %%xmm6, %%xmm4") /* xmm4 = 1/R */ \
+            __ASM_EMIT ("vpxord 0x40 + %[CC], %%xmm2, %%xmm2") /* xmm2 = -ai */ \
+            __ASM_EMIT ("vmulss %%xmm4, %%xmm0, %%xmm0") /* xmm0 = ar/R */ \
+            __ASM_EMIT ("vmulss %%xmm4, %%xmm2, %%xmm2") /* xmm2 = -ai/R */ \
+            __ASM_EMIT ("vmovss %%xmm0, 0x00(%[" DST "_re], %[off])") \
+            __ASM_EMIT ("vmovss %%xmm2, 0x00(%[" DST "_im], %[off])") \
+            __ASM_EMIT ("add $0x04, %[off]") \
+            __ASM_EMIT32("decl %[count]") \
+            __ASM_EMIT64("dec %[count]") \
+            __ASM_EMIT ("jge 9b") \
+            __ASM_EMIT ("10:")
+
+        void complex_rcp1(float *dst_re, float *dst_im, size_t count)
+        {
+            IF_ARCH_X86(size_t off);
+            ARCH_X86_ASM
+            (
+                COMPLEX_RCP_CORE("dst", "dst", FMA_OFF)
+                : [count] "+r" (count), [off] "=&r" (off)
+                : [dst_re] "r" (dst_re), [dst_im] "r" (dst_im),
+                  [CC] "o" (complex_div_const)
+                : "cc", "memory",
+                  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+                  "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+            );
+        }
+
+        void complex_rcp2(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t count)
+        {
+            IF_ARCH_X86(size_t off);
+            ARCH_X86_ASM
+            (
+                COMPLEX_RCP_CORE("dst", "src", FMA_OFF)
+                : [count] X86_PGREG (count),
+                  [off] "=&r" (off)
+                : [dst_re] "r" (dst_re), [dst_im] "r" (dst_im),
+                  [src_re] "r" (src_re), [src_im] "r" (src_im),
+                  [CC] "o" (complex_div_const)
+                : "cc", "memory",
+                  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+                  "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+            );
+        }
+
+        #undef COMPLEX_RCP_CORE
+

     } /* namespace avx512 */
 } /* namespace lsp */
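For reference, COMPLEX_RCP_CORE evaluates the usual complex reciprocal identity 1/(ar + i*ai) = (ar - i*ai)/(ar*ar + ai*ai): the FMA builds R = ar*ar + ai*ai, vdivps against the 1.0 constant at offset 0x00 of complex_div_const yields 1/R, and vpxord with the sign mask at offset 0x40 negates the imaginary part. A scalar sketch of the same per-element computation (complex_rcp_ref is an illustrative name, not a library symbol):

    #include <cstddef>

    // Scalar reference for the per-element math that COMPLEX_RCP_CORE vectorizes.
    static void complex_rcp_ref(float *dst_re, float *dst_im,
                                const float *src_re, const float *src_im,
                                size_t count)
    {
        for (size_t i = 0; i < count; ++i)
        {
            const float ar = src_re[i], ai = src_im[i];
            const float R  = ar*ar + ai*ai;   // vmulps + vfmadd231ps
            const float k  = 1.0f / R;        // vdivps against the 1.0 constant
            dst_re[i]      =  ar * k;         // ar / R
            dst_im[i]      = -ai * k;         // -ai / R (vpxord sign flip, then vmulps)
        }
    }

complex_rcp1 instantiates the macro with both DST and SRC bound to dst (in-place), while complex_rcp2 reads from a separate src buffer; that is the only difference between the two functions above.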

src/main/x86/avx512.cpp

Lines changed: 2 additions & 0 deletions
@@ -83,6 +83,8 @@
             CEXPORT1(vl, complex_div2);
             CEXPORT1(vl, complex_rdiv2);
             CEXPORT1(vl, complex_div3);
+            CEXPORT1(vl, complex_rcp1);
+            CEXPORT1(vl, complex_rcp2);
         }
     } /* namespace avx2 */
 } /* namespace lsp */
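The CEXPORT1(vl, ...) entries register the AVX-512 routines in the runtime dispatch table, presumably gated on the CPU reporting the AVX-512VL feature set, so existing callers pick them up through the generic lsp::dsp entry points. A hypothetical caller-side sketch (header path and names reflect my reading of lsp-dsp-lib's public API and should be checked against the project headers):

    #include <cstddef>
    #include <lsp-plug.in/dsp/dsp.h>

    void reciprocal_spectrum(float *re, float *im, size_t count)
    {
        // Assumes lsp::dsp::init() was called once at program startup so the
        // dispatch table already points at the best implementation available.
        lsp::dsp::complex_rcp1(re, im, count);  // in-place 1/z over the buffers
    }
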

src/test/ptest/complex/rcp.cpp

Lines changed: 10 additions & 2 deletions
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
- * (C) 2020 Vladimir Sadovnikov <[email protected]>
+ * Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
+ * (C) 2023 Vladimir Sadovnikov <[email protected]>
  *
  * This file is part of lsp-dsp-lib
  * Created on: 31 мар. 2020 г.
@@ -51,6 +51,12 @@ namespace lsp
             void complex_rcp1_fma3(float *dst_re, float *dst_im, size_t count);
             void complex_rcp2_fma3(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t count);
         }
+
+        namespace avx512
+        {
+            void complex_rcp1(float *dst_re, float *dst_im, size_t count);
+            void complex_rcp2(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t count);
+        }
     )

     IF_ARCH_ARM(
@@ -133,6 +139,7 @@ PTEST_BEGIN("dsp.complex", rcp, 5, 1000)
         IF_ARCH_X86(CALL1(sse::complex_rcp1));
         IF_ARCH_X86(CALL1(avx::complex_rcp1));
         IF_ARCH_X86(CALL1(avx::complex_rcp1_fma3));
+        IF_ARCH_X86(CALL1(avx512::complex_rcp1));
         IF_ARCH_ARM(CALL1(neon_d32::complex_rcp1));
         IF_ARCH_AARCH64(CALL1(asimd::complex_rcp1));
         PTEST_SEPARATOR;
@@ -141,6 +148,7 @@ PTEST_BEGIN("dsp.complex", rcp, 5, 1000)
         IF_ARCH_X86(CALL2(sse::complex_rcp2));
         IF_ARCH_X86(CALL2(avx::complex_rcp2));
         IF_ARCH_X86(CALL2(avx::complex_rcp2_fma3));
+        IF_ARCH_X86(CALL2(avx512::complex_rcp2));
         IF_ARCH_ARM(CALL2(neon_d32::complex_rcp2));
         IF_ARCH_AARCH64(CALL2(asimd::complex_rcp2));
         PTEST_SEPARATOR2;
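The performance test simply adds the AVX-512 variants to the list of implementations timed over the same buffers as the SSE/AVX/NEON ones. The idea, boiled down to a standalone sketch (not the actual PTEST harness; the benchmarked call is left commented out because it needs the library):

    #include <chrono>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Time an arbitrary callable and return the average seconds per call.
    template <typename F>
    static double bench(F &&fn, size_t iters)
    {
        const auto t0 = std::chrono::steady_clock::now();
        for (size_t i = 0; i < iters; ++i)
            fn();
        const auto t1 = std::chrono::steady_clock::now();
        return std::chrono::duration<double>(t1 - t0).count() / double(iters);
    }

    int main()
    {
        const size_t count = 1 << 16;
        std::vector<float> re(count, 1.0f), im(count, 2.0f);
        const double t = bench([&] {
            // lsp::avx512::complex_rcp1(re.data(), im.data(), count);
        }, 1000);
        printf("avg: %.3f us/call\n", t * 1e6);
        return 0;
    }
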

src/test/utest/complex/rcp.cpp

Lines changed: 10 additions & 2 deletions
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
- * (C) 2020 Vladimir Sadovnikov <[email protected]>
+ * Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
+ * (C) 2023 Vladimir Sadovnikov <[email protected]>
  *
  * This file is part of lsp-dsp-lib
  * Created on: 31 мар. 2020 г.
@@ -46,6 +46,12 @@ namespace lsp
             void complex_rcp1_fma3(float *dst_re, float *dst_im, size_t count);
             void complex_rcp2_fma3(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t count);
         }
+
+        namespace avx512
+        {
+            void complex_rcp1(float *dst_re, float *dst_im, size_t count);
+            void complex_rcp2(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t count);
+        }
     )

     IF_ARCH_ARM(
@@ -165,6 +171,8 @@ UTEST_BEGIN("dsp.complex", rcp)
         IF_ARCH_X86(CALL(avx::complex_rcp2, 32));
         IF_ARCH_X86(CALL(avx::complex_rcp1_fma3, 32));
        IF_ARCH_X86(CALL(avx::complex_rcp2_fma3, 32));
+        IF_ARCH_X86(CALL(avx512::complex_rcp1, 64));
+        IF_ARCH_X86(CALL(avx512::complex_rcp2, 64));

         IF_ARCH_ARM(CALL(neon_d32::complex_rcp1, 16));
         IF_ARCH_ARM(CALL(neon_d32::complex_rcp2, 16));
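The unit test registers the new routines with an argument of 64 (presumably the required alignment, matching the 64-byte ZMM width) and, as with the other backends, presumably checks their output against the reference implementation within a tolerance. A standalone sketch of that kind of comparison (buffers_match is an illustrative helper, not the UTEST code):

    #include <cmath>
    #include <cstddef>

    // Relative comparison with an absolute floor, similar in spirit to what a
    // DSP unit test would use to compare an optimized routine against the
    // reference output.
    static bool buffers_match(const float *a, const float *b, size_t count,
                              float tol = 1e-5f)
    {
        for (size_t i = 0; i < count; ++i)
        {
            const float diff = std::fabs(a[i] - b[i]);
            const float mag  = std::fmax(std::fabs(a[i]), std::fabs(b[i]));
            if (diff > tol * std::fmax(mag, 1.0f))
                return false;
        }
        return true;
    }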
