Skip to content

Commit 31e549d

Browse files
pawosm-arm authored and kiranchandramohan committed
libpgmath: make it clang friendly
This commit addresses the following problems encountered during libpgmath compilation with the Clang C++ compiler: * The Clang C++ compiler needs explicit casts from C integer types to stdint's int32_t and int64_t. * The Clang C++ compiler needs qualified names when dealing with vrd2_t types typedef'ed inside of C++ structures (yet since the desired type is known at the place of use, the GNU C++ compiler can infer by itself which of the vrd2_t types the author had in mind). * The Clang C++ compiler cannot infer which overloaded conversion operator to choose when converting between __m128* types. What the GNU C++ compiler does in such a place is: 1. It generates a call to the overloaded cast-to-long-double() operator on the __m128-typed value. 2. It generates a call to the desired __m128* constructor overloaded to accept long double. There were alternative proposals to solve this issue, e.g. adding better-suited overloads of the casting operator (unfortunately, this didn't help) or casting always to long double (which works with both the GNU and Clang compilers; unfortunately, this approach could not gain acceptance as it makes the code less meaningful). The best solution for these issues would be to fix the Clang C++ compiler. To my knowledge, this will not happen in the foreseeable future. Signed-off-by: Paul Osmialowski <[email protected]>
1 parent 87f9fda commit 31e549d

File tree

9 files changed

+171
-20
lines changed

9 files changed

+171
-20
lines changed

runtime/libpgmath/lib/common/acos/fma3/ssacos.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,11 @@ float __fss_acos_fma3(float const a)
6767
_sq = _mm_setr_ps(0.0f, sq, 0.0f, 0.0f);
6868
p1 = _mm_fmadd_ps(p, _x2_x, F);
6969

70+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
71+
__m128 pi_mask = (__m128)((long double)_mm_cmpgt_epi32(ZERO, (__m128i)((long double)_a)));
72+
#else
7073
__m128 pi_mask = (__m128)_mm_cmpgt_epi32(ZERO, (__m128i)_a);
74+
#endif
7175
pi_mask = _mm_and_ps(pi_mask, PI);
7276
p1 = _mm_fmsub_ps(_sq, p1, pi_mask);
7377

runtime/libpgmath/lib/common/acos/fma3/vdacos2.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,11 @@ __m128d __fvd_acos_fma3(__m128d const a)
2323
__m128i const ABS_MASK = _mm_set1_epi64x(ABS_MASK_LL);
2424
__m128d const ZERO = _mm_set1_pd(0.0);
2525
__m128d const ONE = _mm_set1_pd(1.0);
26+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
27+
__m128d const SGN_MASK = (__m128d)((long double)_mm_set1_epi64x(SGN_MASK_LL));
28+
#else
2629
__m128d const SGN_MASK = (__m128d)_mm_set1_epi64x(SGN_MASK_LL);
30+
#endif
2731
__m128d const THRESHOLD = _mm_set1_pd(THRESHOLD_D);
2832
__m128d const PI_HI = _mm_set1_pd(PI_HI_D);
2933

@@ -61,7 +65,11 @@ __m128d __fvd_acos_fma3(__m128d const a)
6165
__m128d res, cmp, sign, fix;
6266
__m128d p0hi, p0lo, p1hi, p1lo;
6367

68+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
69+
x = _mm_and_pd(a, (__m128d)((long double)ABS_MASK));
70+
#else
6471
x = _mm_and_pd(a, (__m128d)ABS_MASK);
72+
#endif
6573
x2 = _mm_mul_pd(a, a);
6674
sq = _mm_sub_pd(ONE, x);
6775
sq = _mm_sqrt_pd(sq);

runtime/libpgmath/lib/common/acos/fma3/vsacos4.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,20 @@ extern "C" __m128 __fvs_acos_fma3(__m128 const a);
1919

2020
__m128 __fvs_acos_fma3(__m128 const a)
2121
{
22+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
23+
__m128 const ABS_MASK = (__m128)((long double)_mm_set1_epi32(ABS_MASK_I));
24+
__m128 const SGN_MASK = (__m128)((long double)_mm_set1_epi32(SGN_MASK_I));
25+
#else
2226
__m128 const ABS_MASK = (__m128)_mm_set1_epi32(ABS_MASK_I);
2327
__m128 const SGN_MASK = (__m128)_mm_set1_epi32(SGN_MASK_I);
28+
#endif
2429
__m128 const ONE = _mm_set1_ps(1.0f);
2530
__m128i const ZERO = _mm_set1_epi32(0);
31+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
32+
__m128i const THRESHOLD = (__m128i)((long double)_mm_set1_ps(THRESHOLD_F));
33+
#else
2634
__m128i const THRESHOLD = (__m128i)_mm_set1_ps(THRESHOLD_F);
35+
#endif
2736
__m128 const PI = _mm_set1_ps(PI_F);
2837

2938
// p0 coefficients
@@ -47,8 +56,13 @@ __m128 __fvs_acos_fma3(__m128 const a)
4756
sq = _mm_sub_ps(ONE, x);
4857
sq = _mm_sqrt_ps(sq); // sqrt(1 - |a|)
4958

59+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
60+
__m128 pi_mask = (__m128)((long double)_mm_cmpgt_epi32(ZERO, (__m128i)((long double)a)));
61+
cmp0 = (__m128)((long double)_mm_cmpgt_epi32((__m128i)((long double)x), THRESHOLD));
62+
#else
5063
__m128 pi_mask = (__m128)_mm_cmpgt_epi32(ZERO, (__m128i)a);
5164
cmp0 = (__m128)_mm_cmpgt_epi32((__m128i)x, THRESHOLD);
65+
#endif
5266

5367
// polynomials evaluation
5468
x2 = _mm_mul_ps(a, a);

runtime/libpgmath/lib/common/arm64intrin.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -899,14 +899,14 @@ struct __s128i {
899899

900900
inline __s128i& operator+=(unsigned int i) {
901901
if (i != 0U)
902-
xird = xird + i;
902+
xird = xird + static_cast<int32_t>(i);
903903

904904
return *this;
905905
}
906906

907907
inline __s128i& operator+=(int i) {
908908
if (i != 0)
909-
xird = xird + i;
909+
xird = xird + static_cast<int32_t>(i);
910910

911911
return *this;
912912
}
@@ -922,7 +922,7 @@ struct __s128i {
922922

923923
inline __s128i& operator+=(unsigned long l) {
924924
if (l != 0UL)
925-
xlrd = xlrd + l;
925+
xlrd = xlrd + static_cast<int64_t>(l);
926926

927927
return *this;
928928
}
@@ -1281,7 +1281,7 @@ vec_ld(int v, float vld[4])
12811281
{
12821282
__m128 r(vld);
12831283
r += v;
1284-
return r.operator vrd2_t();
1284+
return r.operator __m128::vrd2_t();
12851285
}
12861286

12871287
static inline __m128::vrd2_t
@@ -1290,7 +1290,7 @@ vec_ld(unsigned int v, float vld[4])
12901290
{
12911291
__m128 r(vld);
12921292
r += v;
1293-
return r.operator vrd2_t();
1293+
return r.operator __m128::vrd2_t();
12941294
}
12951295

12961296
static inline __m128d

runtime/libpgmath/lib/common/asin/fma3/vdasin2.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,13 @@ __m128d __fvd_asin_fma3(__m128d const a)
2424
__m128i const ABS_MASK = _mm_set1_epi64x(ABS_MASK_LL);
2525
__m128d const ZERO = _mm_set1_pd(0.0);
2626
__m128d const ONE = _mm_set1_pd(1.0);
27+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
28+
__m128d const SGN_MASK = (__m128d)((long double)_mm_set1_epi64x(SGN_MASK_LL));
29+
__m128d const THRESHOLD = (__m128d)((long double)_mm_set1_epi64x(THRESHOLD_LL));
30+
#else
2731
__m128d const SGN_MASK = (__m128d)_mm_set1_epi64x(SGN_MASK_LL);
2832
__m128d const THRESHOLD = (__m128d)_mm_set1_epi64x(THRESHOLD_LL);
33+
#endif
2934
__m128d const PIO2_HI = _mm_set1_pd(PIO2_HI_D);
3035
__m128d const PIO2_LO = _mm_set1_pd(PIO2_LO_D);
3136

@@ -60,7 +65,11 @@ __m128d __fvd_asin_fma3(__m128d const a)
6065
__m128d sq, p0hi, p0lo, p0, p1hi, p1lo, p1;
6166
__m128d res, cmp, sign, fix, pio2_lo, pio2_hi;
6267

68+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
69+
x = _mm_and_pd(a, (__m128d)((long double)ABS_MASK));
70+
#else
6371
x = _mm_and_pd(a, (__m128d)ABS_MASK);
72+
#endif
6473
sq = _mm_sub_pd(ONE, x);
6574
sq = _mm_sqrt_pd(sq);
6675

runtime/libpgmath/lib/common/asin/fma3/vsasin4.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,19 @@
2020
extern "C" __m128 __fvs_asin_fma3(__m128 const a);
2121

2222
__m128 __fvs_asin_fma3(__m128 const a) {
23+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
24+
__m128 const ABS_MASK = (__m128)((long double)_mm_set1_epi32(ABS_MASK_I));
25+
__m128 const SGN_MASK = (__m128)((long double)_mm_set1_epi32(SGN_MASK_I));
26+
#else
2327
__m128 const ABS_MASK = (__m128)_mm_set1_epi32(ABS_MASK_I);
2428
__m128 const SGN_MASK = (__m128)_mm_set1_epi32(SGN_MASK_I);
29+
#endif
2530
__m128 const ONE = _mm_set1_ps(1.0f);
31+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
32+
__m128i const THRESHOLD = (__m128i)((long double)_mm_set1_ps(THRESHOLD_F));
33+
#else
2634
__m128i const THRESHOLD = (__m128i)_mm_set1_ps(THRESHOLD_F);
35+
#endif
2736
__m128 const PIO2 = _mm_set1_ps(PIO2_F);
2837

2938
// p0 coefficients
@@ -48,7 +57,11 @@ __m128 __fvs_asin_fma3(__m128 const a) {
4857
sq = _mm_sqrt_ps(sq); // sqrt(1 - |a|)
4958

5059
// sgn(a) * ( |a| > 0.5705 ? pi/2 - sqrt(1 - |x|) * p1(|a|) : p0(|a|) )
60+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
61+
cmp0 = (__m128)((long double)_mm_cmpgt_epi32((__m128i)((long double)x), THRESHOLD));
62+
#else
5163
cmp0 = (__m128)_mm_cmpgt_epi32((__m128i)x, THRESHOLD);
64+
#endif
5265

5366
// polynomials evaluation
5467
x2 = _mm_mul_ps(a, a);

runtime/libpgmath/lib/common/exp/fma3/sdexp.cpp

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,27 @@ extern "C" double __fsd_exp_fma3(double);
2525
// handles large cases as well as special cases such as infinities and NaNs
2626
__m128d __pgm_exp_d_slowpath(__m128d const a, __m128i const i, __m128d const t, __m128d const z)
2727
{
28+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
29+
__m128d const INF = (__m128d)((long double)_mm_set1_epi64x(INF_D));
30+
#else
2831
__m128d const INF = (__m128d)_mm_set1_epi64x(INF_D);
32+
#endif
2933
__m128d const ZERO = _mm_set1_pd(ZERO_D);
3034
__m128i const HI_ABS_MASK = _mm_set1_epi64x(HI_ABS_MASK_D);
35+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
36+
__m128d const UPPERBOUND_1 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_1_D));
37+
__m128d const UPPERBOUND_2 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_2_D));
38+
#else
3139
__m128d const UPPERBOUND_1 = (__m128d)_mm_set1_epi64x(UPPERBOUND_1_D);
3240
__m128d const UPPERBOUND_2 = (__m128d)_mm_set1_epi64x(UPPERBOUND_2_D);
41+
#endif
3342
__m128i const MULT_CONST = _mm_set1_epi64x(MULT_CONST_D);
3443

35-
__m128d abs_lt = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
44+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
45+
__m128d abs_lt = (__m128d)((long double)_mm_and_si128((__m128i)((long double)a), HI_ABS_MASK));
46+
#else
47+
__m128d abs_lt = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
48+
#endif
3649

3750
__m128d slowpath_mask = (__m128d)_mm_cmp_sd(abs_lt, UPPERBOUND_1, _CMP_LT_OS);
3851
__m128d lt_zero_mask = _mm_cmp_sd(a, ZERO, _CMP_LT_OS); // compute a < 0.0
@@ -48,9 +61,14 @@ __m128d __pgm_exp_d_slowpath(__m128d const a, __m128i const i, __m128d const t,
4861

4962
k = _mm_sub_epi32(i, k); // k = i - k
5063
__m128i i_scale_acc_2 = _mm_slli_epi64(k, SCALE_D); // shift to HI and shift 20
51-
__m128d multiplier = (__m128d)_mm_add_epi64(i_scale_acc_2, MULT_CONST);
64+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
65+
__m128d multiplier = (__m128d)((long double)_mm_add_epi64(i_scale_acc_2, MULT_CONST));
66+
__m128d res = (__m128d)((long double)_mm_add_epi32(i_scale_acc, (__m128i)((long double)t)));
67+
#else
68+
__m128d multiplier = (__m128d)_mm_add_epi64(i_scale_acc_2, MULT_CONST);
69+
__m128d res = (__m128d)_mm_add_epi32(i_scale_acc, (__m128i)t);
70+
#endif
5271

53-
__m128d res = (__m128d)_mm_add_epi32(i_scale_acc, (__m128i)t);
5472
res = _mm_mul_sd(res, multiplier);
5573

5674
__m128d slowpath_blend = _mm_blendv_pd(zero_inf_blend, res, accurate_scale_mask);
@@ -68,7 +86,11 @@ double __fsd_exp_fma3(double const a_in)
6886
__m128d const NEG_LN2_HI = _mm_set1_pd(NEG_LN2_HI_D);
6987
__m128d const NEG_LN2_LO = _mm_set1_pd(NEG_LN2_LO_D);
7088
__m128d const ZERO = _mm_set1_pd(ZERO_D);
89+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
90+
__m128d const INF = (__m128d)((long double)_mm_set1_epi64x(INF_D));
91+
#else
7192
__m128d const INF = (__m128d)_mm_set1_epi64x(INF_D);
93+
#endif
7294

7395
__m128d const EXP_POLY_11 = _mm_set1_pd(EXP_POLY_11_D);
7496
__m128d const EXP_POLY_10 = _mm_set1_pd(EXP_POLY_10_D);
@@ -84,15 +106,24 @@ double __fsd_exp_fma3(double const a_in)
84106
__m128d const EXP_POLY_0 = _mm_set1_pd(EXP_POLY_0_D);
85107

86108
__m128d const DBL2INT_CVT = _mm_set1_pd(DBL2INT_CVT_D);
109+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
110+
__m128d const UPPERBOUND_1 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_1_D));
111+
__m128d const UPPERBOUND_2 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_2_D));
112+
#else
87113
__m128d const UPPERBOUND_1 = (__m128d)_mm_set1_epi64x(UPPERBOUND_1_D);
88114
__m128d const UPPERBOUND_2 = (__m128d)_mm_set1_epi64x(UPPERBOUND_2_D);
115+
#endif
89116

90117
__m128i const MULT_CONST = _mm_set1_epi64x(MULT_CONST_D);
91118
__m128i const HI_ABS_MASK = _mm_set1_epi64x(HI_ABS_MASK_D);
92119

93120
__m128d a = _mm_set1_pd(a_in);
94121
// calculating exponent; stored in the LO of each 64-bit block
122+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
123+
__m128i i = (__m128i) ((long double)_mm_fmadd_sd(a, L2E, DBL2INT_CVT));
124+
#else
95125
__m128i i = (__m128i) _mm_fmadd_sd(a, L2E, DBL2INT_CVT);
126+
#endif
96127

97128
// calculate mantissa
98129
//fast mul rint
@@ -120,15 +151,23 @@ double __fsd_exp_fma3(double const a_in)
120151

121152
// fast scale
122153
__m128i i_scale = _mm_slli_epi64(i, SCALE_D);
123-
__m128d z = (__m128d)_mm_add_epi32(i_scale, (__m128i)t);
124-
154+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
155+
__m128d z = (__m128d)((long double)_mm_add_epi32(i_scale, (__m128i)((long double)t)));
156+
__m128d abs_a = (__m128d)((long double)_mm_and_si128((__m128i)((long double)a), HI_ABS_MASK));
157+
#else
158+
__m128d z = (__m128d)_mm_add_epi32(i_scale, (__m128i)t);
125159
__m128d abs_a = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
160+
#endif
126161

127162
#if defined(TARGET_LINUX_POWER)
128163
int exp_slowmask = _vec_any_nz((__m128i)_mm_cmpgt_epi64((__m128i)abs_a, (__m128i)UPPERBOUND_1));
164+
#else
165+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
166+
int exp_slowmask = _mm_movemask_epi8(_mm_cmpgt_epi64((__m128i)((long double)abs_a), (__m128i)((long double)UPPERBOUND_1)));
129167
#else
130168
int exp_slowmask = _mm_movemask_epi8(_mm_cmpgt_epi64((__m128i)abs_a, (__m128i)UPPERBOUND_1));
131169
#endif
170+
#endif
132171

133172
// if (exp_slowmask) {
134173
// return _mm_cvtsd_f64(__pgm_exp_d_slowpath(a, i, t, z));

runtime/libpgmath/lib/common/exp/fma3/vdexp2.cpp

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,27 @@ extern "C" __m128d __fvd_exp_fma3(__m128d);
2222
// handles large cases as well as special cases such as infinities and NaNs
2323
__m128d __pgm_exp_d_vec128_slowpath(__m128d const a, __m128i const i, __m128d const t, __m128d const z)
2424
{
25+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
26+
__m128d const INF = (__m128d)((long double)_mm_set1_epi64x(INF_D));
27+
#else
2528
__m128d const INF = (__m128d)_mm_set1_epi64x(INF_D);
29+
#endif
2630
__m128d const ZERO = _mm_set1_pd(ZERO_D);
2731
__m128i const HI_ABS_MASK = _mm_set1_epi64x(HI_ABS_MASK_D);
32+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
33+
__m128d const UPPERBOUND_1 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_1_D));
34+
__m128d const UPPERBOUND_2 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_2_D));
35+
#else
2836
__m128d const UPPERBOUND_1 = (__m128d)_mm_set1_epi64x(UPPERBOUND_1_D);
2937
__m128d const UPPERBOUND_2 = (__m128d)_mm_set1_epi64x(UPPERBOUND_2_D);
38+
#endif
3039
__m128i const MULT_CONST = _mm_set1_epi64x(MULT_CONST_D);
3140

32-
__m128d abs_lt = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
41+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
42+
__m128d abs_lt = (__m128d)((long double)_mm_and_si128((__m128i)((long double)a), HI_ABS_MASK));
43+
#else
44+
__m128d abs_lt = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
45+
#endif
3346

3447
__m128d slowpath_mask = (__m128d)_mm_cmp_pd(abs_lt, UPPERBOUND_1, _CMP_LT_OS);
3548
__m128d lt_zero_mask = _mm_cmp_pd(a, ZERO, _CMP_LT_OS); // compute a < 0.0
@@ -45,9 +58,13 @@ __m128d __pgm_exp_d_vec128_slowpath(__m128d const a, __m128i const i, __m128d co
4558

4659
k = _mm_sub_epi32(i, k); // k = i - k
4760
__m128i i_scale_acc_2 = _mm_slli_epi64(k, SCALE_D); // shift to HI and shift 20
48-
__m128d multiplier = (__m128d)_mm_add_epi64(i_scale_acc_2, MULT_CONST);
49-
50-
__m128d res = (__m128d)_mm_add_epi32(i_scale_acc, (__m128i)t);
61+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
62+
__m128d multiplier = (__m128d)((long double)_mm_add_epi64(i_scale_acc_2, MULT_CONST));
63+
__m128d res = (__m128d)((long double)_mm_add_epi32(i_scale_acc, (__m128i)((long double)t)));
64+
#else
65+
__m128d multiplier = (__m128d)_mm_add_epi64(i_scale_acc_2, MULT_CONST);
66+
__m128d res = (__m128d)_mm_add_epi32(i_scale_acc, (__m128i)t);
67+
#endif
5168
res = _mm_mul_pd(res, multiplier);
5269

5370
__m128d slowpath_blend = _mm_blendv_pd(zero_inf_blend, res, accurate_scale_mask);
@@ -60,7 +77,11 @@ __m128d __fvd_exp_fma3(__m128d const a)
6077
__m128d const NEG_LN2_HI = _mm_set1_pd(NEG_LN2_HI_D);
6178
__m128d const NEG_LN2_LO = _mm_set1_pd(NEG_LN2_LO_D);
6279
__m128d const ZERO = _mm_set1_pd(ZERO_D);
80+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
81+
__m128d const INF = (__m128d)((long double)_mm_set1_epi64x(INF_D));
82+
#else
6383
__m128d const INF = (__m128d)_mm_set1_epi64x(INF_D);
84+
#endif
6485

6586
__m128d const EXP_POLY_11 = _mm_set1_pd(EXP_POLY_11_D);
6687
__m128d const EXP_POLY_10 = _mm_set1_pd(EXP_POLY_10_D);
@@ -76,14 +97,23 @@ __m128d __fvd_exp_fma3(__m128d const a)
7697
__m128d const EXP_POLY_0 = _mm_set1_pd(EXP_POLY_0_D);
7798

7899
__m128d const DBL2INT_CVT = _mm_set1_pd(DBL2INT_CVT_D);
100+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
101+
__m128d const UPPERBOUND_1 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_1_D));
102+
__m128d const UPPERBOUND_2 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_2_D));
103+
#else
79104
__m128d const UPPERBOUND_1 = (__m128d)_mm_set1_epi64x(UPPERBOUND_1_D);
80105
__m128d const UPPERBOUND_2 = (__m128d)_mm_set1_epi64x(UPPERBOUND_2_D);
106+
#endif
81107

82108
__m128i const MULT_CONST = _mm_set1_epi64x(MULT_CONST_D);
83109
__m128i const HI_ABS_MASK = _mm_set1_epi64x(HI_ABS_MASK_D);
84110

85111
// calculating exponent; stored in the LO of each 64-bit block
112+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
113+
__m128i i = (__m128i) ((long double)_mm_fmadd_pd(a, L2E, DBL2INT_CVT));
114+
#else
86115
__m128i i = (__m128i) _mm_fmadd_pd(a, L2E, DBL2INT_CVT);
116+
#endif
87117

88118
// calculate mantissa
89119
//fast mul rint
@@ -111,15 +141,23 @@ __m128d __fvd_exp_fma3(__m128d const a)
111141

112142
// fast scale
113143
__m128i i_scale = _mm_slli_epi64(i, SCALE_D);
114-
__m128d z = (__m128d)_mm_add_epi32(i_scale, (__m128i)t);
115-
144+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
145+
__m128d z = (__m128d)((long double)_mm_add_epi32(i_scale, (__m128i)((long double)t)));
146+
__m128d abs_a = (__m128d)((long double)_mm_and_si128((__m128i)((long double)a), HI_ABS_MASK));
147+
#else
148+
__m128d z = (__m128d)_mm_add_epi32(i_scale, (__m128i)t);
116149
__m128d abs_a = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
150+
#endif
117151

118152
#if defined(TARGET_LINUX_POWER)
119153
int exp_slowmask = _vec_any_nz((__m128i)_mm_cmpgt_epi64((__m128i)abs_a, (__m128i)UPPERBOUND_1));
154+
#else
155+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
156+
int exp_slowmask = _mm_movemask_epi8(_mm_cmpgt_epi64((__m128i)((long double)abs_a), (__m128i)((long double)UPPERBOUND_1)));
120157
#else
121158
int exp_slowmask = _mm_movemask_epi8(_mm_cmpgt_epi64((__m128i)abs_a, (__m128i)UPPERBOUND_1));
122159
#endif
160+
#endif
123161

124162
// if (exp_slowmask) {
125163
// return __pgm_exp_d_vec128_slowpath(a, i, t, z);

0 commit comments

Comments
 (0)