Skip to content

Commit 31e549d

Browse files
pawosm-arm authored and kiranchandramohan committed
libpgmath: make it clang friendly
This commit addresses the following problems encountered during libpgmath compilation with the Clang C++ compiler: * The Clang C++ compiler needs explicit casts from C integer types to stdint's int32_t and int64_t. * The Clang C++ compiler needs qualified names when dealing with vrd2_t types typedef'ed inside of C++ structures (yet since the desired type is known at the place of use, the GNU C++ compiler can infer by itself which of the vrd2_t types the author had in mind). * The Clang C++ compiler cannot infer which overloaded conversion operator to choose when converting between __m128* types. What the GNU C++ compiler does in such a place is: 1. It generates a call to the overloaded cast-to-long-double() operator on the __m128-typed value. 2. It generates a call to the desired __m128* constructor overloaded to accept long double. There were alternative proposals to solve this issue, e.g. adding better-suited overloads of the casting operator (unfortunately, this didn't help) or casting always to long double (which works with both the GNU and Clang compilers; unfortunately, this approach could not gain acceptance as it makes the code less meaningful). The best solution for these issues would be to fix the Clang C++ compiler. To my knowledge, this will not happen in the foreseeable future. Signed-off-by: Paul Osmialowski <[email protected]>
1 parent 87f9fda commit 31e549d

File tree

9 files changed

+171
-20
lines changed

9 files changed

+171
-20
lines changed

runtime/libpgmath/lib/common/acos/fma3/ssacos.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,11 @@ float __fss_acos_fma3(float const a)
6767
_sq = _mm_setr_ps(0.0f, sq, 0.0f, 0.0f);
6868
p1 = _mm_fmadd_ps(p, _x2_x, F);
6969

70+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
71+
__m128 pi_mask = (__m128)((long double)_mm_cmpgt_epi32(ZERO, (__m128i)((long double)_a)));
72+
#else
7073
__m128 pi_mask = (__m128)_mm_cmpgt_epi32(ZERO, (__m128i)_a);
74+
#endif
7175
pi_mask = _mm_and_ps(pi_mask, PI);
7276
p1 = _mm_fmsub_ps(_sq, p1, pi_mask);
7377

runtime/libpgmath/lib/common/acos/fma3/vdacos2.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,11 @@ __m128d __fvd_acos_fma3(__m128d const a)
2323
__m128i const ABS_MASK = _mm_set1_epi64x(ABS_MASK_LL);
2424
__m128d const ZERO = _mm_set1_pd(0.0);
2525
__m128d const ONE = _mm_set1_pd(1.0);
26+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
27+
__m128d const SGN_MASK = (__m128d)((long double)_mm_set1_epi64x(SGN_MASK_LL));
28+
#else
2629
__m128d const SGN_MASK = (__m128d)_mm_set1_epi64x(SGN_MASK_LL);
30+
#endif
2731
__m128d const THRESHOLD = _mm_set1_pd(THRESHOLD_D);
2832
__m128d const PI_HI = _mm_set1_pd(PI_HI_D);
2933

@@ -61,7 +65,11 @@ __m128d __fvd_acos_fma3(__m128d const a)
6165
__m128d res, cmp, sign, fix;
6266
__m128d p0hi, p0lo, p1hi, p1lo;
6367

68+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
69+
x = _mm_and_pd(a, (__m128d)((long double)ABS_MASK));
70+
#else
6471
x = _mm_and_pd(a, (__m128d)ABS_MASK);
72+
#endif
6573
x2 = _mm_mul_pd(a, a);
6674
sq = _mm_sub_pd(ONE, x);
6775
sq = _mm_sqrt_pd(sq);

runtime/libpgmath/lib/common/acos/fma3/vsacos4.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,20 @@ extern "C" __m128 __fvs_acos_fma3(__m128 const a);
1919

2020
__m128 __fvs_acos_fma3(__m128 const a)
2121
{
22+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
23+
__m128 const ABS_MASK = (__m128)((long double)_mm_set1_epi32(ABS_MASK_I));
24+
__m128 const SGN_MASK = (__m128)((long double)_mm_set1_epi32(SGN_MASK_I));
25+
#else
2226
__m128 const ABS_MASK = (__m128)_mm_set1_epi32(ABS_MASK_I);
2327
__m128 const SGN_MASK = (__m128)_mm_set1_epi32(SGN_MASK_I);
28+
#endif
2429
__m128 const ONE = _mm_set1_ps(1.0f);
2530
__m128i const ZERO = _mm_set1_epi32(0);
31+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
32+
__m128i const THRESHOLD = (__m128i)((long double)_mm_set1_ps(THRESHOLD_F));
33+
#else
2634
__m128i const THRESHOLD = (__m128i)_mm_set1_ps(THRESHOLD_F);
35+
#endif
2736
__m128 const PI = _mm_set1_ps(PI_F);
2837

2938
// p0 coefficients
@@ -47,8 +56,13 @@ __m128 __fvs_acos_fma3(__m128 const a)
4756
sq = _mm_sub_ps(ONE, x);
4857
sq = _mm_sqrt_ps(sq); // sqrt(1 - |a|)
4958

59+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
60+
__m128 pi_mask = (__m128)((long double)_mm_cmpgt_epi32(ZERO, (__m128i)((long double)a)));
61+
cmp0 = (__m128)((long double)_mm_cmpgt_epi32((__m128i)((long double)x), THRESHOLD));
62+
#else
5063
__m128 pi_mask = (__m128)_mm_cmpgt_epi32(ZERO, (__m128i)a);
5164
cmp0 = (__m128)_mm_cmpgt_epi32((__m128i)x, THRESHOLD);
65+
#endif
5266

5367
// polynomials evaluation
5468
x2 = _mm_mul_ps(a, a);

runtime/libpgmath/lib/common/arm64intrin.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -899,14 +899,14 @@ struct __s128i {
899899

900900
inline __s128i& operator+=(unsigned int i) {
901901
if (i != 0U)
902-
xird = xird + i;
902+
xird = xird + static_cast<int32_t>(i);
903903

904904
return *this;
905905
}
906906

907907
inline __s128i& operator+=(int i) {
908908
if (i != 0)
909-
xird = xird + i;
909+
xird = xird + static_cast<int32_t>(i);
910910

911911
return *this;
912912
}
@@ -922,7 +922,7 @@ struct __s128i {
922922

923923
inline __s128i& operator+=(unsigned long l) {
924924
if (l != 0UL)
925-
xlrd = xlrd + l;
925+
xlrd = xlrd + static_cast<int64_t>(l);
926926

927927
return *this;
928928
}
@@ -1281,7 +1281,7 @@ vec_ld(int v, float vld[4])
12811281
{
12821282
__m128 r(vld);
12831283
r += v;
1284-
return r.operator vrd2_t();
1284+
return r.operator __m128::vrd2_t();
12851285
}
12861286

12871287
static inline __m128::vrd2_t
@@ -1290,7 +1290,7 @@ vec_ld(unsigned int v, float vld[4])
12901290
{
12911291
__m128 r(vld);
12921292
r += v;
1293-
return r.operator vrd2_t();
1293+
return r.operator __m128::vrd2_t();
12941294
}
12951295

12961296
static inline __m128d

runtime/libpgmath/lib/common/asin/fma3/vdasin2.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,13 @@ __m128d __fvd_asin_fma3(__m128d const a)
2424
__m128i const ABS_MASK = _mm_set1_epi64x(ABS_MASK_LL);
2525
__m128d const ZERO = _mm_set1_pd(0.0);
2626
__m128d const ONE = _mm_set1_pd(1.0);
27+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
28+
__m128d const SGN_MASK = (__m128d)((long double)_mm_set1_epi64x(SGN_MASK_LL));
29+
__m128d const THRESHOLD = (__m128d)((long double)_mm_set1_epi64x(THRESHOLD_LL));
30+
#else
2731
__m128d const SGN_MASK = (__m128d)_mm_set1_epi64x(SGN_MASK_LL);
2832
__m128d const THRESHOLD = (__m128d)_mm_set1_epi64x(THRESHOLD_LL);
33+
#endif
2934
__m128d const PIO2_HI = _mm_set1_pd(PIO2_HI_D);
3035
__m128d const PIO2_LO = _mm_set1_pd(PIO2_LO_D);
3136

@@ -60,7 +65,11 @@ __m128d __fvd_asin_fma3(__m128d const a)
6065
__m128d sq, p0hi, p0lo, p0, p1hi, p1lo, p1;
6166
__m128d res, cmp, sign, fix, pio2_lo, pio2_hi;
6267

68+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
69+
x = _mm_and_pd(a, (__m128d)((long double)ABS_MASK));
70+
#else
6371
x = _mm_and_pd(a, (__m128d)ABS_MASK);
72+
#endif
6473
sq = _mm_sub_pd(ONE, x);
6574
sq = _mm_sqrt_pd(sq);
6675

runtime/libpgmath/lib/common/asin/fma3/vsasin4.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,19 @@
2020
extern "C" __m128 __fvs_asin_fma3(__m128 const a);
2121

2222
__m128 __fvs_asin_fma3(__m128 const a) {
23+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
24+
__m128 const ABS_MASK = (__m128)((long double)_mm_set1_epi32(ABS_MASK_I));
25+
__m128 const SGN_MASK = (__m128)((long double)_mm_set1_epi32(SGN_MASK_I));
26+
#else
2327
__m128 const ABS_MASK = (__m128)_mm_set1_epi32(ABS_MASK_I);
2428
__m128 const SGN_MASK = (__m128)_mm_set1_epi32(SGN_MASK_I);
29+
#endif
2530
__m128 const ONE = _mm_set1_ps(1.0f);
31+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
32+
__m128i const THRESHOLD = (__m128i)((long double)_mm_set1_ps(THRESHOLD_F));
33+
#else
2634
__m128i const THRESHOLD = (__m128i)_mm_set1_ps(THRESHOLD_F);
35+
#endif
2736
__m128 const PIO2 = _mm_set1_ps(PIO2_F);
2837

2938
// p0 coefficients
@@ -48,7 +57,11 @@ __m128 __fvs_asin_fma3(__m128 const a) {
4857
sq = _mm_sqrt_ps(sq); // sqrt(1 - |a|)
4958

5059
// sgn(a) * ( |a| > 0.5705 ? pi/2 - sqrt(1 - |x|) * p1(|a|) : p0(|a|) )
60+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
61+
cmp0 = (__m128)((long double)_mm_cmpgt_epi32((__m128i)((long double)x), THRESHOLD));
62+
#else
5163
cmp0 = (__m128)_mm_cmpgt_epi32((__m128i)x, THRESHOLD);
64+
#endif
5265

5366
// polynomials evaluation
5467
x2 = _mm_mul_ps(a, a);

runtime/libpgmath/lib/common/exp/fma3/sdexp.cpp

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,27 @@ extern "C" double __fsd_exp_fma3(double);
2525
// handles large cases as well as special cases such as infinities and NaNs
2626
__m128d __pgm_exp_d_slowpath(__m128d const a, __m128i const i, __m128d const t, __m128d const z)
2727
{
28+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
29+
__m128d const INF = (__m128d)((long double)_mm_set1_epi64x(INF_D));
30+
#else
2831
__m128d const INF = (__m128d)_mm_set1_epi64x(INF_D);
32+
#endif
2933
__m128d const ZERO = _mm_set1_pd(ZERO_D);
3034
__m128i const HI_ABS_MASK = _mm_set1_epi64x(HI_ABS_MASK_D);
35+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
36+
__m128d const UPPERBOUND_1 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_1_D));
37+
__m128d const UPPERBOUND_2 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_2_D));
38+
#else
3139
__m128d const UPPERBOUND_1 = (__m128d)_mm_set1_epi64x(UPPERBOUND_1_D);
3240
__m128d const UPPERBOUND_2 = (__m128d)_mm_set1_epi64x(UPPERBOUND_2_D);
41+
#endif
3342
__m128i const MULT_CONST = _mm_set1_epi64x(MULT_CONST_D);
3443

35-
__m128d abs_lt = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
44+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
45+
__m128d abs_lt = (__m128d)((long double)_mm_and_si128((__m128i)((long double)a), HI_ABS_MASK));
46+
#else
47+
__m128d abs_lt = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
48+
#endif
3649

3750
__m128d slowpath_mask = (__m128d)_mm_cmp_sd(abs_lt, UPPERBOUND_1, _CMP_LT_OS);
3851
__m128d lt_zero_mask = _mm_cmp_sd(a, ZERO, _CMP_LT_OS); // compute a < 0.0
@@ -48,9 +61,14 @@ __m128d __pgm_exp_d_slowpath(__m128d const a, __m128i const i, __m128d const t,
4861

4962
k = _mm_sub_epi32(i, k); // k = i - k
5063
__m128i i_scale_acc_2 = _mm_slli_epi64(k, SCALE_D); // shift to HI and shift 20
51-
__m128d multiplier = (__m128d)_mm_add_epi64(i_scale_acc_2, MULT_CONST);
64+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
65+
__m128d multiplier = (__m128d)((long double)_mm_add_epi64(i_scale_acc_2, MULT_CONST));
66+
__m128d res = (__m128d)((long double)_mm_add_epi32(i_scale_acc, (__m128i)((long double)t)));
67+
#else
68+
__m128d multiplier = (__m128d)_mm_add_epi64(i_scale_acc_2, MULT_CONST);
69+
__m128d res = (__m128d)_mm_add_epi32(i_scale_acc, (__m128i)t);
70+
#endif
5271

53-
__m128d res = (__m128d)_mm_add_epi32(i_scale_acc, (__m128i)t);
5472
res = _mm_mul_sd(res, multiplier);
5573

5674
__m128d slowpath_blend = _mm_blendv_pd(zero_inf_blend, res, accurate_scale_mask);
@@ -68,7 +86,11 @@ double __fsd_exp_fma3(double const a_in)
6886
__m128d const NEG_LN2_HI = _mm_set1_pd(NEG_LN2_HI_D);
6987
__m128d const NEG_LN2_LO = _mm_set1_pd(NEG_LN2_LO_D);
7088
__m128d const ZERO = _mm_set1_pd(ZERO_D);
89+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
90+
__m128d const INF = (__m128d)((long double)_mm_set1_epi64x(INF_D));
91+
#else
7192
__m128d const INF = (__m128d)_mm_set1_epi64x(INF_D);
93+
#endif
7294

7395
__m128d const EXP_POLY_11 = _mm_set1_pd(EXP_POLY_11_D);
7496
__m128d const EXP_POLY_10 = _mm_set1_pd(EXP_POLY_10_D);
@@ -84,15 +106,24 @@ double __fsd_exp_fma3(double const a_in)
84106
__m128d const EXP_POLY_0 = _mm_set1_pd(EXP_POLY_0_D);
85107

86108
__m128d const DBL2INT_CVT = _mm_set1_pd(DBL2INT_CVT_D);
109+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
110+
__m128d const UPPERBOUND_1 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_1_D));
111+
__m128d const UPPERBOUND_2 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_2_D));
112+
#else
87113
__m128d const UPPERBOUND_1 = (__m128d)_mm_set1_epi64x(UPPERBOUND_1_D);
88114
__m128d const UPPERBOUND_2 = (__m128d)_mm_set1_epi64x(UPPERBOUND_2_D);
115+
#endif
89116

90117
__m128i const MULT_CONST = _mm_set1_epi64x(MULT_CONST_D);
91118
__m128i const HI_ABS_MASK = _mm_set1_epi64x(HI_ABS_MASK_D);
92119

93120
__m128d a = _mm_set1_pd(a_in);
94121
// calculating exponent; stored in the LO of each 64-bit block
122+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
123+
__m128i i = (__m128i) ((long double)_mm_fmadd_sd(a, L2E, DBL2INT_CVT));
124+
#else
95125
__m128i i = (__m128i) _mm_fmadd_sd(a, L2E, DBL2INT_CVT);
126+
#endif
96127

97128
// calculate mantissa
98129
//fast mul rint
@@ -120,15 +151,23 @@ double __fsd_exp_fma3(double const a_in)
120151

121152
// fast scale
122153
__m128i i_scale = _mm_slli_epi64(i, SCALE_D);
123-
__m128d z = (__m128d)_mm_add_epi32(i_scale, (__m128i)t);
124-
154+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
155+
__m128d z = (__m128d)((long double)_mm_add_epi32(i_scale, (__m128i)((long double)t)));
156+
__m128d abs_a = (__m128d)((long double)_mm_and_si128((__m128i)((long double)a), HI_ABS_MASK));
157+
#else
158+
__m128d z = (__m128d)_mm_add_epi32(i_scale, (__m128i)t);
125159
__m128d abs_a = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
160+
#endif
126161

127162
#if defined(TARGET_LINUX_POWER)
128163
int exp_slowmask = _vec_any_nz((__m128i)_mm_cmpgt_epi64((__m128i)abs_a, (__m128i)UPPERBOUND_1));
164+
#else
165+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
166+
int exp_slowmask = _mm_movemask_epi8(_mm_cmpgt_epi64((__m128i)((long double)abs_a), (__m128i)((long double)UPPERBOUND_1)));
129167
#else
130168
int exp_slowmask = _mm_movemask_epi8(_mm_cmpgt_epi64((__m128i)abs_a, (__m128i)UPPERBOUND_1));
131169
#endif
170+
#endif
132171

133172
// if (exp_slowmask) {
134173
// return _mm_cvtsd_f64(__pgm_exp_d_slowpath(a, i, t, z));

runtime/libpgmath/lib/common/exp/fma3/vdexp2.cpp

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,27 @@ extern "C" __m128d __fvd_exp_fma3(__m128d);
2222
// handles large cases as well as special cases such as infinities and NaNs
2323
__m128d __pgm_exp_d_vec128_slowpath(__m128d const a, __m128i const i, __m128d const t, __m128d const z)
2424
{
25+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
26+
__m128d const INF = (__m128d)((long double)_mm_set1_epi64x(INF_D));
27+
#else
2528
__m128d const INF = (__m128d)_mm_set1_epi64x(INF_D);
29+
#endif
2630
__m128d const ZERO = _mm_set1_pd(ZERO_D);
2731
__m128i const HI_ABS_MASK = _mm_set1_epi64x(HI_ABS_MASK_D);
32+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
33+
__m128d const UPPERBOUND_1 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_1_D));
34+
__m128d const UPPERBOUND_2 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_2_D));
35+
#else
2836
__m128d const UPPERBOUND_1 = (__m128d)_mm_set1_epi64x(UPPERBOUND_1_D);
2937
__m128d const UPPERBOUND_2 = (__m128d)_mm_set1_epi64x(UPPERBOUND_2_D);
38+
#endif
3039
__m128i const MULT_CONST = _mm_set1_epi64x(MULT_CONST_D);
3140

32-
__m128d abs_lt = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
41+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
42+
__m128d abs_lt = (__m128d)((long double)_mm_and_si128((__m128i)((long double)a), HI_ABS_MASK));
43+
#else
44+
__m128d abs_lt = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
45+
#endif
3346

3447
__m128d slowpath_mask = (__m128d)_mm_cmp_pd(abs_lt, UPPERBOUND_1, _CMP_LT_OS);
3548
__m128d lt_zero_mask = _mm_cmp_pd(a, ZERO, _CMP_LT_OS); // compute a < 0.0
@@ -45,9 +58,13 @@ __m128d __pgm_exp_d_vec128_slowpath(__m128d const a, __m128i const i, __m128d co
4558

4659
k = _mm_sub_epi32(i, k); // k = i - k
4760
__m128i i_scale_acc_2 = _mm_slli_epi64(k, SCALE_D); // shift to HI and shift 20
48-
__m128d multiplier = (__m128d)_mm_add_epi64(i_scale_acc_2, MULT_CONST);
49-
50-
__m128d res = (__m128d)_mm_add_epi32(i_scale_acc, (__m128i)t);
61+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
62+
__m128d multiplier = (__m128d)((long double)_mm_add_epi64(i_scale_acc_2, MULT_CONST));
63+
__m128d res = (__m128d)((long double)_mm_add_epi32(i_scale_acc, (__m128i)((long double)t)));
64+
#else
65+
__m128d multiplier = (__m128d)_mm_add_epi64(i_scale_acc_2, MULT_CONST);
66+
__m128d res = (__m128d)_mm_add_epi32(i_scale_acc, (__m128i)t);
67+
#endif
5168
res = _mm_mul_pd(res, multiplier);
5269

5370
__m128d slowpath_blend = _mm_blendv_pd(zero_inf_blend, res, accurate_scale_mask);
@@ -60,7 +77,11 @@ __m128d __fvd_exp_fma3(__m128d const a)
6077
__m128d const NEG_LN2_HI = _mm_set1_pd(NEG_LN2_HI_D);
6178
__m128d const NEG_LN2_LO = _mm_set1_pd(NEG_LN2_LO_D);
6279
__m128d const ZERO = _mm_set1_pd(ZERO_D);
80+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
81+
__m128d const INF = (__m128d)((long double)_mm_set1_epi64x(INF_D));
82+
#else
6383
__m128d const INF = (__m128d)_mm_set1_epi64x(INF_D);
84+
#endif
6485

6586
__m128d const EXP_POLY_11 = _mm_set1_pd(EXP_POLY_11_D);
6687
__m128d const EXP_POLY_10 = _mm_set1_pd(EXP_POLY_10_D);
@@ -76,14 +97,23 @@ __m128d __fvd_exp_fma3(__m128d const a)
7697
__m128d const EXP_POLY_0 = _mm_set1_pd(EXP_POLY_0_D);
7798

7899
__m128d const DBL2INT_CVT = _mm_set1_pd(DBL2INT_CVT_D);
100+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
101+
__m128d const UPPERBOUND_1 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_1_D));
102+
__m128d const UPPERBOUND_2 = (__m128d)((long double)_mm_set1_epi64x(UPPERBOUND_2_D));
103+
#else
79104
__m128d const UPPERBOUND_1 = (__m128d)_mm_set1_epi64x(UPPERBOUND_1_D);
80105
__m128d const UPPERBOUND_2 = (__m128d)_mm_set1_epi64x(UPPERBOUND_2_D);
106+
#endif
81107

82108
__m128i const MULT_CONST = _mm_set1_epi64x(MULT_CONST_D);
83109
__m128i const HI_ABS_MASK = _mm_set1_epi64x(HI_ABS_MASK_D);
84110

85111
// calculating exponent; stored in the LO of each 64-bit block
112+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
113+
__m128i i = (__m128i) ((long double)_mm_fmadd_pd(a, L2E, DBL2INT_CVT));
114+
#else
86115
__m128i i = (__m128i) _mm_fmadd_pd(a, L2E, DBL2INT_CVT);
116+
#endif
87117

88118
// calculate mantissa
89119
//fast mul rint
@@ -111,15 +141,23 @@ __m128d __fvd_exp_fma3(__m128d const a)
111141

112142
// fast scale
113143
__m128i i_scale = _mm_slli_epi64(i, SCALE_D);
114-
__m128d z = (__m128d)_mm_add_epi32(i_scale, (__m128i)t);
115-
144+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
145+
__m128d z = (__m128d)((long double)_mm_add_epi32(i_scale, (__m128i)((long double)t)));
146+
__m128d abs_a = (__m128d)((long double)_mm_and_si128((__m128i)((long double)a), HI_ABS_MASK));
147+
#else
148+
__m128d z = (__m128d)_mm_add_epi32(i_scale, (__m128i)t);
116149
__m128d abs_a = (__m128d)_mm_and_si128((__m128i)a, HI_ABS_MASK);
150+
#endif
117151

118152
#if defined(TARGET_LINUX_POWER)
119153
int exp_slowmask = _vec_any_nz((__m128i)_mm_cmpgt_epi64((__m128i)abs_a, (__m128i)UPPERBOUND_1));
154+
#else
155+
#if defined(__clang__) && defined(TARGET_LINUX_ARM64)
156+
int exp_slowmask = _mm_movemask_epi8(_mm_cmpgt_epi64((__m128i)((long double)abs_a), (__m128i)((long double)UPPERBOUND_1)));
120157
#else
121158
int exp_slowmask = _mm_movemask_epi8(_mm_cmpgt_epi64((__m128i)abs_a, (__m128i)UPPERBOUND_1));
122159
#endif
160+
#endif
123161

124162
// if (exp_slowmask) {
125163
// return __pgm_exp_d_vec128_slowpath(a, i, t, z);

0 commit comments

Comments
 (0)