  *
  * Authors: Dennis Smit <[email protected]>
  *
- * $Id: lv_math.c,v 1.12 2006-02-05 18:45:57 synap Exp $
+ * $Id: lv_math.c,v 1.13 2006-02-13 20:54:08 synap Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as
@@ -299,7 +299,7 @@ int visual_math_vectorized_add_floats_const_float (float *dest, float *src, visu
  * @param dest Pointer to the destination float array.
  * @param src Pointer to the source float array.
  * @param n The number of items in the array.
- * @param adder The constant substracter that is substracter from every entry in the source array.
+ * @param substracter The constant value that is subtracted from every entry in the source array.
  *
  * @return VISUAL_OK on success or -VISUAL_ERROR_NULL on failure.
  */
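For reference, the documented operation is a per-element subtraction of a constant, and the SSE fix in the next hunk makes the vector path match it. A minimal, hypothetical caller follows; the parameter order (dest, src, n, substracter) is assumed from the doc comment above, and the buffer names are illustrative:

    /* Hypothetical usage sketch, not part of the patch; equivalent to
     *   for (j = 0; j < 64; j++) dest[j] = src[j] - 0.5f;
     */
    float src[64], dest[64];
    /* ... fill src ... */
    visual_math_vectorized_substract_floats_const_float (dest, src, 64, 0.5f);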
@@ -341,10 +341,10 @@ int visual_math_vectorized_substract_floats_const_float (float *dest, float *src
 				 "\n\t movups 16(%0), %%xmm1"
 				 "\n\t movups 32(%0), %%xmm2"
 				 "\n\t movups 48(%0), %%xmm3"
-				 "\n\t addps %%xmm7, %%xmm0"
-				 "\n\t addps %%xmm7, %%xmm1"
-				 "\n\t addps %%xmm7, %%xmm2"
-				 "\n\t addps %%xmm7, %%xmm3"
+				 "\n\t subps %%xmm7, %%xmm0"
+				 "\n\t subps %%xmm7, %%xmm1"
+				 "\n\t subps %%xmm7, %%xmm2"
+				 "\n\t subps %%xmm7, %%xmm3"
 				 "\n\t movntps %%xmm0, (%1)"
 				 "\n\t movntps %%xmm1, 16(%1)"
 				 "\n\t movntps %%xmm2, 32(%1)"
@@ -416,6 +416,104 @@ int visual_math_vectorized_substract_floats_const_float (float *dest, float *src
 	return VISUAL_OK;
 }
 
+int visual_math_vectorized_multiplier_floats_floats (float *dest, float *src1, float *src2, visual_size_t n)
+{
+	float *d = dest;
+	float *s1 = src1;
+	float *s2 = src2;
+
+	visual_log_return_val_if_fail (dest != NULL, -VISUAL_ERROR_NULL);
+	visual_log_return_val_if_fail (src1 != NULL, -VISUAL_ERROR_NULL);
+	visual_log_return_val_if_fail (src2 != NULL, -VISUAL_ERROR_NULL);
+
+	if (visual_cpu_get_sse () && n >= 16) {
+#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
+		while (!VISUAL_ALIGNED(d, 16)) {
+			(*d) = (*s1) * (*s2);
+
+			d++;
+			s1++;
+			s2++;
+
+			n--;
+		}
+
+		while (n > 16) {
+			__asm __volatile
+				("\n\t prefetchnta 256(%0)"
+				 "\n\t prefetchnta 256(%1)"
+				 "\n\t movups (%0), %%xmm0"
+				 "\n\t movups 16(%0), %%xmm1"
+				 "\n\t movups 32(%0), %%xmm2"
+				 "\n\t movups 48(%0), %%xmm3"
+				 "\n\t movups (%1), %%xmm4"
+				 "\n\t movups 16(%1), %%xmm5"
+				 "\n\t movups 32(%1), %%xmm6"
+				 "\n\t movups 48(%1), %%xmm7"
+				 "\n\t mulps %%xmm4, %%xmm0"
+				 "\n\t mulps %%xmm5, %%xmm1"
+				 "\n\t mulps %%xmm6, %%xmm2"
+				 "\n\t mulps %%xmm7, %%xmm3"
+				 "\n\t movntps %%xmm0, (%2)"
+				 "\n\t movntps %%xmm1, 16(%2)"
+				 "\n\t movntps %%xmm2, 32(%2)"
+				 "\n\t movntps %%xmm3, 48(%2)"
+				 :: "r" (s1), "r" (s2), "r" (d) : "memory");
+
+			d += 16;
+			s1 += 16;
+			s2 += 16;
+
+			n -= 16;
+		}
+#endif /* VISUAL_ARCH_X86 */
+	} else if (visual_cpu_get_3dnow ()) {
+#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
+		while (n > 8) {
+			__asm __volatile
+				("\n\t prefetch 256(%0)"
+				 "\n\t movq (%0), %%mm0"
+				 "\n\t movq 8(%0), %%mm1"
+				 "\n\t movq 16(%0), %%mm2"
+				 "\n\t movq 24(%0), %%mm3"
+				 "\n\t movq (%1), %%mm4"
+				 "\n\t movq 8(%1), %%mm5"
+				 "\n\t movq 16(%1), %%mm6"
+				 "\n\t movq 24(%1), %%mm7"
+				 "\n\t pfmul %%mm4, %%mm0"
+				 "\n\t pfmul %%mm5, %%mm1"
+				 "\n\t pfmul %%mm6, %%mm2"
+				 "\n\t pfmul %%mm7, %%mm3"
+				 "\n\t movq %%mm0, (%2)"
+				 "\n\t movq %%mm1, 8(%2)"
+				 "\n\t movq %%mm2, 16(%2)"
+				 "\n\t movq %%mm3, 24(%2)"
+				 :: "r" (s1), "r" (s2), "r" (d) : "memory");
+
+			d += 8;
+			s1 += 8;
+			s2 += 8;
+
+			n -= 8;
+		}
+
+		__asm __volatile
+			("\n\t emms");
+#endif /* VISUAL_ARCH_X86 */
+
+	}
+
+	while (n--) {
+		(*d) = (*s1) * (*s2);
+
+		d++;
+		s1++;
+		s2++;
+	}
+
+	return VISUAL_OK;
+}
+
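The new function above is a straight element-wise multiply, dest[j] = src1[j] * src2[j], with an SSE path, a 3DNow! path and a scalar tail loop. A hypothetical caller, as a fragment with illustrative buffer names, could look like:

    /* Hypothetical usage, not part of the patch: out[j] = a[j] * b[j]. */
    float a[256], b[256], out[256];
    /* ... fill a and b ... */
    visual_math_vectorized_multiplier_floats_floats (out, a, b, 256);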
 /**
  * Converts an array of floats to integers. With the right cpu features in place this function
  * is very optimized.
@@ -780,7 +878,8 @@ int visual_math_vectorized_floats_to_int32s_multiply_denormalise (int32_t *ints,
  * Vectorized square root for single precision floats. This function works best with data
  * sizes larger than 16 or equal to 16.
  *
- * @param vector The vector of floats of which the square roots will be calculated.
+ * @param dest The destination vector of floats in which the results are placed.
+ * @param src The source vector of floats of which the square roots will be calculated.
  * @param n The number of floats in the vector.
  *
  * @return VISUAL_OK on success or -VISUAL_ERROR_NULL on failure.
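With the corrected parameters, the documented behaviour is a plain per-element square root; a scalar sketch of the function body (the real code only takes the SSE path shown in the next hunk when it is available) is:

    /* Plain C equivalent of the documented behaviour (sketch). */
    for (visual_size_t j = 0; j < n; j++)
            dest[j] = sqrtf (src[j]);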
@@ -805,10 +904,10 @@ int visual_math_vectorized_sqrt_floats (float *dest, float *src, visual_size_t n
 		while (n > 16) {
 			__asm __volatile
 				("\n\t prefetchnta 256(%0)"
-				 "\n\t movaps (%0), %%xmm0"
-				 "\n\t movaps 16(%0), %%xmm1"
-				 "\n\t movaps 32(%0), %%xmm2"
-				 "\n\t movaps 48(%0), %%xmm3"
+				 "\n\t movups (%0), %%xmm0"
+				 "\n\t movups 16(%0), %%xmm1"
+				 "\n\t movups 32(%0), %%xmm2"
+				 "\n\t movups 48(%0), %%xmm3"
 				 "\n\t sqrtps %%xmm0, %%xmm4"
 				 "\n\t sqrtps %%xmm1, %%xmm5"
 				 "\n\t sqrtps %%xmm2, %%xmm6"
@@ -837,6 +936,80 @@ int visual_math_vectorized_sqrt_floats (float *dest, float *src, visual_size_t n
 	return VISUAL_OK;
 }
 
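The movaps to movups change above reflects that the scalar pre-loops in this file only force the destination pointer onto a 16-byte boundary, so the loads have to tolerate an unaligned source while the movntps stores can rely on the aligned destination. The test used by those pre-loops is the VISUAL_ALIGNED macro seen in the new functions; its definition lives elsewhere in libvisual and is only assumed here, but it presumably reduces to something like:

    /* Assumed shape of the alignment check; the real VISUAL_ALIGNED in
     * libvisual may be defined differently. */
    #define VISUAL_ALIGNED(ptr, bound) \
            ((((unsigned long) (ptr)) & ((bound) - 1)) == 0)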
+/**
+ * Vectorized complex to norm conversion. Will make norm values from a real and imaginary
+ * array.
+ *
+ * @param dest Pointer to the destination float array.
+ * @param real Pointer to the real part float array.
+ * @param imag Pointer to the imaginary part float array.
+ * @param n The number of elements to be converted.
+ *
+ * @return VISUAL_OK on success or -VISUAL_ERROR_NULL on failure.
+ */
+int visual_math_vectorized_complex_to_norm (float *dest, float *real, float *imag, visual_size_t n)
+{
+	float *d = dest;
+	float *r = real;
+	float *i = imag;
+
+	visual_log_return_val_if_fail (dest != NULL, -VISUAL_ERROR_NULL);
+	visual_log_return_val_if_fail (real != NULL, -VISUAL_ERROR_NULL);
+	visual_log_return_val_if_fail (imag != NULL, -VISUAL_ERROR_NULL);
+
+	if (visual_cpu_get_sse () && n >= 16) {
+
+#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
+		while (!VISUAL_ALIGNED(d, 16)) {
+			*d = sqrtf (((*r) * (*r)) + ((*i) * (*i)));
+
+			d++;
+			r++;
+			i++;
+
+			n--;
+		}
+
+		while (n > 8) {
+			__asm __volatile
+				("\n\t prefetchnta 256(%0)"
+				 "\n\t prefetchnta 256(%1)"
+				 "\n\t movups (%0), %%xmm0"
+				 "\n\t movups 16(%0), %%xmm2"
+				 "\n\t movups (%1), %%xmm1"
+				 "\n\t movups 16(%1), %%xmm3"
+				 "\n\t mulps %%xmm0, %%xmm0"
+				 "\n\t mulps %%xmm1, %%xmm1"
+				 "\n\t mulps %%xmm2, %%xmm2"
+				 "\n\t mulps %%xmm3, %%xmm3"
+				 "\n\t addps %%xmm0, %%xmm1"
+				 "\n\t addps %%xmm2, %%xmm3"
+				 "\n\t sqrtps %%xmm1, %%xmm0"
+				 "\n\t sqrtps %%xmm3, %%xmm2"
+				 "\n\t movntps %%xmm0, (%2)"
+				 "\n\t movntps %%xmm2, 16(%2)"
+				 :: "r" (r), "r" (i), "r" (d) : "memory");
+
+			d += 8;
+			i += 8;
+			r += 8;
+
+			n -= 8;
+		}
+#endif /* VISUAL_ARCH_X86 */
+	}
+
+	while (n--) {
+		*d = sqrtf (((*r) * (*r)) + ((*i) * (*i)));
+
+		d++;
+		r++;
+		i++;
+	}
+
+	return VISUAL_OK;
+}
+
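The norm computed above is sqrt(real[j]^2 + imag[j]^2) per element, the usual magnitude of a complex sample. A hypothetical caller fragment, with illustrative buffer names and an FFT mentioned only as a likely use case, might be:

    /* Hypothetical usage, not part of the patch: turn split complex data,
     * e.g. FFT output, into magnitudes. */
    float re[512], im[512], norm[512];
    /* ... fill re and im ... */
    visual_math_vectorized_complex_to_norm (norm, re, im, 512);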
 /**
  * Vectorized complex to norm conversion and result value scaler. Will make norm values from a real and imaginary
  * array, after the conversion has been made it will be multiplied by the scaler.
@@ -882,35 +1055,26 @@ int visual_math_vectorized_complex_to_norm_scale (float *dest, float *real, floa
 			("\n\t movups (%0), %%xmm7"
 			 :: "r" (packed_scaler) : "memory");
 
-		/* FIXME optimize more, look into how we can get it atleast partially aligned, right */
 		while (n > 8) {
 			__asm __volatile
 				("\n\t prefetchnta 256(%0)"
 				 "\n\t prefetchnta 256(%1)"
-
 				 "\n\t movups (%0), %%xmm0"
 				 "\n\t movups 16(%0), %%xmm2"
-
 				 "\n\t movups (%1), %%xmm1"
 				 "\n\t movups 16(%1), %%xmm3"
-
 				 "\n\t mulps %%xmm0, %%xmm0"
 				 "\n\t mulps %%xmm1, %%xmm1"
-
 				 "\n\t mulps %%xmm2, %%xmm2"
 				 "\n\t mulps %%xmm3, %%xmm3"
-
 				 "\n\t addps %%xmm0, %%xmm1"
 				 "\n\t addps %%xmm2, %%xmm3"
-
 				 "\n\t sqrtps %%xmm1, %%xmm0"
 				 "\n\t sqrtps %%xmm3, %%xmm2"
-
 				 "\n\t mulps %%xmm7, %%xmm0"
 				 "\n\t mulps %%xmm7, %%xmm2"
-
-				 "\n\t movups %%xmm0, (%2)"
-				 "\n\t movups %%xmm2, 16(%2)"
+				 "\n\t movntps %%xmm0, (%2)"
+				 "\n\t movntps %%xmm2, 16(%2)"
 				 :: "r" (r), "r" (i), "r" (d) : "memory");
 
 			d += 8;
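The movups to movntps store change in the last hunk switches to non-temporal stores, which bypass the cache and suit write-once result buffers, but which unlike movups require a 16-byte-aligned destination; the code presumably relies on the destination being aligned first, as the new functions above do with their scalar pre-loops. For reference, a scalar sketch of what the scaled variant computes (parameter names taken from its doc comment, the loop itself assumed):

    /* Assumed plain C equivalent of visual_math_vectorized_complex_to_norm_scale:
     * per-element norm, multiplied by the constant scaler. */
    for (visual_size_t j = 0; j < n; j++)
            dest[j] = sqrtf (real[j] * real[j] + imag[j] * imag[j]) * scaler;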