1919 return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \
2020 }
2121
22- // FOIL-based long mul_hi
23- //
24- // Summary: Treat mul_hi(long x, long y) as:
25- // (a+b) * (c+d) where a and c are the high-order parts of x and y respectively
26- // and b and d are the low-order parts of x and y.
27- // Thinking back to algebra, we use FOIL to do the work.
22+ #define __CLC_MUL_HI_DEC_IMPL (BTYPE , TYPE , BITS ) \
23+ __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
24+ __CLC_MUL_HI_VEC_IMPL(BTYPE##2, TYPE##2, BITS) \
25+ __CLC_MUL_HI_VEC_IMPL(BTYPE##3, TYPE##3, BITS) \
26+ __CLC_MUL_HI_VEC_IMPL(BTYPE##4, TYPE##4, BITS) \
27+ __CLC_MUL_HI_VEC_IMPL(BTYPE##8, TYPE##8, BITS) \
28+ __CLC_MUL_HI_VEC_IMPL(BTYPE##16, TYPE##16, BITS)
29+
2830_CLC_OVERLOAD _CLC_DEF long __clc_mul_hi (long x , long y ) {
2931 long f , o , i ;
3032 ulong l ;
@@ -81,32 +83,33 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) {
8183 return (f + (__clc_hadd (o , (i + (l >> 32 ))) >> 31 ));
8284}
8385
84- #define __CLC_MUL_HI_VEC (GENTYPE ) \
85- _CLC_OVERLOAD _CLC_DEF GENTYPE##2 __clc_mul_hi(GENTYPE##2 x, GENTYPE##2 y) { \
86- return (GENTYPE##2){__clc_mul_hi(x.s0, y.s0), __clc_mul_hi(x.s1, y.s1)}; \
87- } \
88- _CLC_OVERLOAD _CLC_DEF GENTYPE##3 __clc_mul_hi(GENTYPE##3 x, GENTYPE##3 y) { \
89- return (GENTYPE##3){__clc_mul_hi(x.s0, y.s0), __clc_mul_hi(x.s1, y.s1), \
90- __clc_mul_hi(x.s2, y.s2)}; \
91- } \
92- _CLC_OVERLOAD _CLC_DEF GENTYPE##4 __clc_mul_hi(GENTYPE##4 x, GENTYPE##4 y) { \
93- return (GENTYPE##4){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
94- } \
95- _CLC_OVERLOAD _CLC_DEF GENTYPE##8 __clc_mul_hi(GENTYPE##8 x, GENTYPE##8 y) { \
96- return (GENTYPE##8){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
97- } \
98- _CLC_OVERLOAD _CLC_DEF GENTYPE##16 __clc_mul_hi(GENTYPE##16 x, \
99- GENTYPE##16 y) { \
100- return (GENTYPE##16){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)}; \
86+ // Vector-based mul_hi implementation for long/ulong. See comments in the scalar
87+ // versions for more detail.
88+ #define __CLC_MUL_HI_LONG_VEC_IMPL (TY , UTY ) \
89+ _CLC_OVERLOAD _CLC_DEF TY __clc_mul_hi(TY x, TY y) { \
90+ TY f, o, i; \
91+ UTY l; \
92+ \
93+ TY x_hi = x >> 32; \
94+ TY x_lo = x & UINT_MAX; \
95+ TY y_hi = y >> 32; \
96+ TY y_lo = y & UINT_MAX; \
97+ \
98+ f = x_hi * y_hi; \
99+ o = x_hi * y_lo; \
100+ i = x_lo * y_hi; \
101+ l = __CLC_CONVERT_TY(x_lo * y_lo, UTY); \
102+ i += __CLC_CONVERT_TY(l >> (UTY)32, TY); \
103+ \
104+ return f + (__clc_hadd(o, i) >> (TY)31); \
101105 }
102106
103- #define __CLC_MUL_HI_DEC_IMPL (BTYPE , TYPE , BITS ) \
104- __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
105- __CLC_MUL_HI_VEC_IMPL(BTYPE##2, TYPE##2, BITS) \
106- __CLC_MUL_HI_VEC_IMPL(BTYPE##3, TYPE##3, BITS) \
107- __CLC_MUL_HI_VEC_IMPL(BTYPE##4, TYPE##4, BITS) \
108- __CLC_MUL_HI_VEC_IMPL(BTYPE##8, TYPE##8, BITS) \
109- __CLC_MUL_HI_VEC_IMPL(BTYPE##16, TYPE##16, BITS)
107+ #define __CLC_MUL_HI_LONG_IMPL (BTYPE , UBTYPE ) \
108+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##2, UBTYPE##2) \
109+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##3, UBTYPE##3) \
110+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##4, UBTYPE##4) \
111+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##8, UBTYPE##8) \
112+ __CLC_MUL_HI_LONG_VEC_IMPL(BTYPE##16, UBTYPE##16)
110113
111114#define __CLC_MUL_HI_TYPES () \
112115 __CLC_MUL_HI_DEC_IMPL(short, char, 8) \
@@ -115,14 +118,15 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) {
115118 __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
116119 __CLC_MUL_HI_DEC_IMPL(long, int, 32) \
117120 __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
118- __CLC_MUL_HI_VEC (long) \
119- __CLC_MUL_HI_VEC( ulong)
121+ __CLC_MUL_HI_LONG_IMPL (long, ulong) \
122+ __CLC_MUL_HI_LONG_IMPL(ulong, ulong)
120123
121124__CLC_MUL_HI_TYPES ()
122125
123126#undef __CLC_MUL_HI_TYPES
127+ #undef __CLC_MUL_HI_LONG_IMPL
128+ #undef __CLC_MUL_HI_LONG_VEC_IMPL
124129#undef __CLC_MUL_HI_DEC_IMPL
125130#undef __CLC_MUL_HI_IMPL
126- #undef __CLC_MUL_HI_VEC
127131#undef __CLC_MUL_HI_VEC_IMPL
128132#undef __CLC_CONVERT_TY
0 commit comments