@@ -689,6 +689,169 @@ namespace lsp
    #undef COMPLEX_DIV2_CORE

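+    /*
+     * COMPLEX_RCP_CORE computes the packed reciprocal of complex numbers stored in
+     * split form (separate arrays of real and imaginary parts):
+     *   1 / (ar + i*ai) = (ar - i*ai) / (ar*ar + ai*ai)
+     * i.e. for each element: dst_re = ar/R, dst_im = -ai/R, where R = ar*ar + ai*ai.
+     * Elements are processed in blocks of 32, 16, 8, 4 and 1; the constant table
+     * %[CC] (complex_div_const) is expected to hold 1.0f values at offset 0x00 and
+     * the sign-bit masks used for negation at offset 0x40. Scalar sketch of the
+     * same computation:
+     *   for (size_t i = 0; i < count; ++i)
+     *   {
+     *       const float re = src_re[i], im = src_im[i];
+     *       const float R  = re*re + im*im;
+     *       dst_re[i]      = re / R;
+     *       dst_im[i]      = -im / R;
+     *   }
+     */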
+    #define COMPLEX_RCP_CORE(DST, SRC, SEL) \
+        __ASM_EMIT("xor             %[off], %[off]") \
+        __ASM_EMIT("vmovaps         0x00 + %[CC], %%zmm6")                  /* zmm6 = 1 */ \
+        /* x32 blocks */ \
+        __ASM_EMIT32("subl            $32, %[count]") \
+        __ASM_EMIT64("sub             $32, %[count]") \
+        __ASM_EMIT("jb              2f") \
+        __ASM_EMIT("1:") \
+        __ASM_EMIT("vmovups         0x00(%[" SRC "_re], %[off]), %%zmm0")   /* zmm0 = ar */ \
+        __ASM_EMIT("vmovups         0x40(%[" SRC "_re], %[off]), %%zmm1") \
+        __ASM_EMIT("vmovups         0x00(%[" SRC "_im], %[off]), %%zmm2")   /* zmm2 = ai */ \
+        __ASM_EMIT("vmovups         0x40(%[" SRC "_im], %[off]), %%zmm3") \
+        __ASM_EMIT("vmulps          %%zmm0, %%zmm0, %%zmm4")                /* zmm4 = ar*ar */ \
+        __ASM_EMIT("vmulps          %%zmm1, %%zmm1, %%zmm5") \
+        __ASM_EMIT("vfmadd231ps     %%zmm2, %%zmm2, %%zmm4")                /* zmm4 = R = ar*ar+ai*ai */ \
+        __ASM_EMIT("vfmadd231ps     %%zmm3, %%zmm3, %%zmm5") \
+        __ASM_EMIT("vdivps          %%zmm4, %%zmm6, %%zmm4")                /* zmm4 = 1/R */ \
+        __ASM_EMIT("vdivps          %%zmm5, %%zmm6, %%zmm5") \
+        __ASM_EMIT("vpxord          0x40 + %[CC], %%zmm2, %%zmm2")          /* zmm2 = -ai */ \
+        __ASM_EMIT("vpxord          0x40 + %[CC], %%zmm3, %%zmm3") \
+        __ASM_EMIT("vmulps          %%zmm4, %%zmm0, %%zmm0")                /* zmm0 = ar/R */ \
+        __ASM_EMIT("vmulps          %%zmm5, %%zmm1, %%zmm1") \
+        __ASM_EMIT("vmulps          %%zmm4, %%zmm2, %%zmm2")                /* zmm2 = -ai/R */ \
+        __ASM_EMIT("vmulps          %%zmm5, %%zmm3, %%zmm3") \
+        __ASM_EMIT("vmovups         %%zmm0, 0x00(%[" DST "_re], %[off])") \
+        __ASM_EMIT("vmovups         %%zmm1, 0x40(%[" DST "_re], %[off])") \
+        __ASM_EMIT("vmovups         %%zmm2, 0x00(%[" DST "_im], %[off])") \
+        __ASM_EMIT("vmovups         %%zmm3, 0x40(%[" DST "_im], %[off])") \
+        __ASM_EMIT("add             $0x80, %[off]") \
+        __ASM_EMIT32("subl            $32, %[count]") \
+        __ASM_EMIT64("sub             $32, %[count]") \
+        __ASM_EMIT("jae             1b") \
+        __ASM_EMIT("2:") \
+        /* x16 blocks */ \
+        __ASM_EMIT32("addl            $16, %[count]") \
+        __ASM_EMIT64("add             $16, %[count]") \
+        __ASM_EMIT("jl              4f") \
+        __ASM_EMIT("vmovups         0x00(%[" SRC "_re], %[off]), %%ymm0")   /* ymm0 = ar */ \
+        __ASM_EMIT("vmovups         0x20(%[" SRC "_re], %[off]), %%ymm1") \
+        __ASM_EMIT("vmovups         0x00(%[" SRC "_im], %[off]), %%ymm2")   /* ymm2 = ai */ \
+        __ASM_EMIT("vmovups         0x20(%[" SRC "_im], %[off]), %%ymm3") \
+        __ASM_EMIT("vmulps          %%ymm0, %%ymm0, %%ymm4")                /* ymm4 = ar*ar */ \
+        __ASM_EMIT("vmulps          %%ymm1, %%ymm1, %%ymm5") \
+        __ASM_EMIT("vfmadd231ps     %%ymm2, %%ymm2, %%ymm4")                /* ymm4 = R = ar*ar+ai*ai */ \
+        __ASM_EMIT("vfmadd231ps     %%ymm3, %%ymm3, %%ymm5") \
+        __ASM_EMIT("vdivps          %%ymm4, %%ymm6, %%ymm4")                /* ymm4 = 1/R */ \
+        __ASM_EMIT("vdivps          %%ymm5, %%ymm6, %%ymm5") \
+        __ASM_EMIT("vpxord          0x40 + %[CC], %%ymm2, %%ymm2")          /* ymm2 = -ai */ \
+        __ASM_EMIT("vpxord          0x40 + %[CC], %%ymm3, %%ymm3") \
+        __ASM_EMIT("vmulps          %%ymm4, %%ymm0, %%ymm0")                /* ymm0 = ar/R */ \
+        __ASM_EMIT("vmulps          %%ymm5, %%ymm1, %%ymm1") \
+        __ASM_EMIT("vmulps          %%ymm4, %%ymm2, %%ymm2")                /* ymm2 = -ai/R */ \
+        __ASM_EMIT("vmulps          %%ymm5, %%ymm3, %%ymm3") \
+        __ASM_EMIT("vmovups         %%ymm0, 0x00(%[" DST "_re], %[off])") \
+        __ASM_EMIT("vmovups         %%ymm1, 0x20(%[" DST "_re], %[off])") \
+        __ASM_EMIT("vmovups         %%ymm2, 0x00(%[" DST "_im], %[off])") \
+        __ASM_EMIT("vmovups         %%ymm3, 0x20(%[" DST "_im], %[off])") \
+        __ASM_EMIT("add             $0x40, %[off]") \
+        __ASM_EMIT32("subl            $16, %[count]") \
+        __ASM_EMIT64("sub             $16, %[count]") \
+        __ASM_EMIT("jae             1b") \
+        __ASM_EMIT("4:") \
+        /* 8x block */ \
+        __ASM_EMIT32("addl            $8, %[count]") \
+        __ASM_EMIT64("add             $8, %[count]") \
+        __ASM_EMIT("jl              6f") \
+        __ASM_EMIT("vmovups         0x00(%[" SRC "_re], %[off]), %%xmm0")   /* xmm0 = ar */ \
+        __ASM_EMIT("vmovups         0x10(%[" SRC "_re], %[off]), %%xmm1") \
+        __ASM_EMIT("vmovups         0x00(%[" SRC "_im], %[off]), %%xmm2")   /* xmm2 = ai */ \
+        __ASM_EMIT("vmovups         0x10(%[" SRC "_im], %[off]), %%xmm3") \
+        __ASM_EMIT("vmulps          %%xmm0, %%xmm0, %%xmm4")                /* xmm4 = ar*ar */ \
+        __ASM_EMIT("vmulps          %%xmm1, %%xmm1, %%xmm5") \
+        __ASM_EMIT("vfmadd231ps     %%xmm2, %%xmm2, %%xmm4")                /* xmm4 = R = ar*ar+ai*ai */ \
+        __ASM_EMIT("vfmadd231ps     %%xmm3, %%xmm3, %%xmm5") \
+        __ASM_EMIT("vdivps          %%xmm4, %%xmm6, %%xmm4")                /* xmm4 = 1/R */ \
+        __ASM_EMIT("vdivps          %%xmm5, %%xmm6, %%xmm5") \
+        __ASM_EMIT("vpxord          0x40 + %[CC], %%xmm2, %%xmm2")          /* xmm2 = -ai */ \
+        __ASM_EMIT("vpxord          0x40 + %[CC], %%xmm3, %%xmm3") \
+        __ASM_EMIT("vmulps          %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = ar/R */ \
+        __ASM_EMIT("vmulps          %%xmm5, %%xmm1, %%xmm1") \
+        __ASM_EMIT("vmulps          %%xmm4, %%xmm2, %%xmm2")                /* xmm2 = -ai/R */ \
+        __ASM_EMIT("vmulps          %%xmm5, %%xmm3, %%xmm3") \
+        __ASM_EMIT("vmovups         %%xmm0, 0x00(%[" DST "_re], %[off])") \
+        __ASM_EMIT("vmovups         %%xmm1, 0x10(%[" DST "_re], %[off])") \
+        __ASM_EMIT("vmovups         %%xmm2, 0x00(%[" DST "_im], %[off])") \
+        __ASM_EMIT("vmovups         %%xmm3, 0x10(%[" DST "_im], %[off])") \
+        __ASM_EMIT32("subl            $8, %[count]") \
+        __ASM_EMIT64("sub             $8, %[count]") \
+        __ASM_EMIT("add             $0x20, %[off]") \
+        __ASM_EMIT("6:") \
+        /* 4x block */ \
+        __ASM_EMIT32("addl            $4, %[count]") \
+        __ASM_EMIT64("add             $4, %[count]") \
+        __ASM_EMIT("jl              8f") \
+        __ASM_EMIT("vmovups         0x00(%[" SRC "_re], %[off]), %%xmm0")   /* xmm0 = ar */ \
+        __ASM_EMIT("vmovups         0x00(%[" SRC "_im], %[off]), %%xmm2")   /* xmm2 = ai */ \
+        __ASM_EMIT("vmulps          %%xmm0, %%xmm0, %%xmm4")                /* xmm4 = ar*ar */ \
+        __ASM_EMIT("vfmadd231ps     %%xmm2, %%xmm2, %%xmm4")                /* xmm4 = R = ar*ar+ai*ai */ \
+        __ASM_EMIT("vdivps          %%xmm4, %%xmm6, %%xmm4")                /* xmm4 = 1/R */ \
+        __ASM_EMIT("vpxord          0x40 + %[CC], %%xmm2, %%xmm2")          /* xmm2 = -ai */ \
+        __ASM_EMIT("vmulps          %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = ar/R */ \
+        __ASM_EMIT("vmulps          %%xmm4, %%xmm2, %%xmm2")                /* xmm2 = -ai/R */ \
+        __ASM_EMIT("vmovups         %%xmm0, 0x00(%[" DST "_re], %[off])") \
+        __ASM_EMIT("vmovups         %%xmm2, 0x00(%[" DST "_im], %[off])") \
+        __ASM_EMIT32("subl            $4, %[count]") \
+        __ASM_EMIT64("sub             $4, %[count]") \
+        __ASM_EMIT("add             $0x10, %[off]") \
+        __ASM_EMIT("8:") \
+        /* 1x blocks */ \
+        __ASM_EMIT32("addl            $3, %[count]") \
+        __ASM_EMIT64("add             $3, %[count]") \
+        __ASM_EMIT("jl              10f") \
+        __ASM_EMIT("9:") \
+        __ASM_EMIT("vmovss          0x00(%[" SRC "_re], %[off]), %%xmm0")   /* xmm0 = ar */ \
+        __ASM_EMIT("vmovss          0x00(%[" SRC "_im], %[off]), %%xmm2")   /* xmm2 = ai */ \
+        __ASM_EMIT("vmulss          %%xmm0, %%xmm0, %%xmm4")                /* xmm4 = ar*ar */ \
+        __ASM_EMIT("vfmadd231ss     %%xmm2, %%xmm2, %%xmm4")                /* xmm4 = R = ar*ar+ai*ai */ \
+        __ASM_EMIT("vdivss          %%xmm4, %%xmm6, %%xmm4")                /* xmm4 = 1/R */ \
+        __ASM_EMIT("vpxord          0x40 + %[CC], %%xmm2, %%xmm2")          /* xmm2 = -ai */ \
+        __ASM_EMIT("vmulss          %%xmm4, %%xmm0, %%xmm0")                /* xmm0 = ar/R */ \
+        __ASM_EMIT("vmulss          %%xmm4, %%xmm2, %%xmm2")                /* xmm2 = -ai/R */ \
+        __ASM_EMIT("vmovss          %%xmm0, 0x00(%[" DST "_re], %[off])") \
+        __ASM_EMIT("vmovss          %%xmm2, 0x00(%[" DST "_im], %[off])") \
+        __ASM_EMIT("add             $0x04, %[off]") \
+        __ASM_EMIT32("decl            %[count]") \
+        __ASM_EMIT64("dec             %[count]") \
+        __ASM_EMIT("jge             9b") \
+        __ASM_EMIT("10:")
+
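+    /*
+     * Compute the complex reciprocal in place: dst[i] = 1 / dst[i] for each of the
+     * count complex elements stored as split real/imaginary arrays.
+     */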
+    void complex_rcp1(float *dst_re, float *dst_im, size_t count)
+    {
+        IF_ARCH_X86(size_t off);
+        ARCH_X86_ASM
+        (
+            COMPLEX_RCP_CORE("dst", "dst", FMA_OFF)
+            : [count] "+r" (count), [off] "=&r" (off)
+            : [dst_re] "r" (dst_re), [dst_im] "r" (dst_im),
+              [CC] "o" (complex_div_const)
+            : "cc", "memory",
+              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+        );
+    }
+
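+    /*
+     * Compute the complex reciprocal: dst[i] = 1 / src[i] for each of the count
+     * complex elements stored as split real/imaginary arrays.
+     */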
+    void complex_rcp2(float *dst_re, float *dst_im, const float *src_re, const float *src_im, size_t count)
+    {
+        IF_ARCH_X86(size_t off);
+        ARCH_X86_ASM
+        (
+            COMPLEX_RCP_CORE("dst", "src", FMA_OFF)
+            : [count] X86_PGREG(count),
+              [off] "=&r" (off)
+            : [dst_re] "r" (dst_re), [dst_im] "r" (dst_im),
+              [src_re] "r" (src_re), [src_im] "r" (src_im),
+              [CC] "o" (complex_div_const)
+            : "cc", "memory",
+              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+        );
+    }
+
+    #undef COMPLEX_RCP_CORE
+
} /* namespace avx512 */
} /* namespace lsp */