@@ -1615,7 +1615,7 @@ pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
1615
1615
#[cfg_attr(test, assert_instr(vaddsh))]
1616
1616
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1617
1617
pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1618
- _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION> (a, b)
1618
+ unsafe { simd_insert! (a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
1619
1619
}
1620
1620
1621
1621
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -1628,7 +1628,16 @@ pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1628
1628
#[cfg_attr(test, assert_instr(vaddsh))]
1629
1629
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1630
1630
pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1631
- _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1631
+ unsafe {
1632
+ let extractsrc: f16 = simd_extract!(src, 0);
1633
+ let mut add: f16 = extractsrc;
1634
+ if (k & 0b00000001) != 0 {
1635
+ let extracta: f16 = simd_extract!(a, 0);
1636
+ let extractb: f16 = simd_extract!(b, 0);
1637
+ add = extracta + extractb;
1638
+ }
1639
+ simd_insert!(a, 0, add)
1640
+ }
1632
1641
}
1633
1642
1634
1643
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -1641,7 +1650,15 @@ pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
1641
1650
#[cfg_attr(test, assert_instr(vaddsh))]
1642
1651
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1643
1652
pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1644
- _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1653
+ unsafe {
1654
+ let mut add: f16 = 0.;
1655
+ if (k & 0b00000001) != 0 {
1656
+ let extracta: f16 = simd_extract!(a, 0);
1657
+ let extractb: f16 = simd_extract!(b, 0);
1658
+ add = extracta + extractb;
1659
+ }
1660
+ simd_insert!(a, 0, add)
1661
+ }
1645
1662
}
1646
1663
1647
1664
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
@@ -1927,7 +1944,7 @@ pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
1927
1944
#[cfg_attr(test, assert_instr(vsubsh))]
1928
1945
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1929
1946
pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
1930
- _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION> (a, b)
1947
+ unsafe { simd_insert! (a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
1931
1948
}
1932
1949
1933
1950
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@@ -1940,7 +1957,16 @@ pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
1940
1957
#[cfg_attr(test, assert_instr(vsubsh))]
1941
1958
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1942
1959
pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1943
- _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1960
+ unsafe {
1961
+ let extractsrc: f16 = simd_extract!(src, 0);
1962
+ let mut add: f16 = extractsrc;
1963
+ if (k & 0b00000001) != 0 {
1964
+ let extracta: f16 = simd_extract!(a, 0);
1965
+ let extractb: f16 = simd_extract!(b, 0);
1966
+ add = extracta - extractb;
1967
+ }
1968
+ simd_insert!(a, 0, add)
1969
+ }
1944
1970
}
1945
1971
1946
1972
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@@ -1953,7 +1979,15 @@ pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
1953
1979
#[cfg_attr(test, assert_instr(vsubsh))]
1954
1980
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1955
1981
pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1956
- _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1982
+ unsafe {
1983
+ let mut add: f16 = 0.;
1984
+ if (k & 0b00000001) != 0 {
1985
+ let extracta: f16 = simd_extract!(a, 0);
1986
+ let extractb: f16 = simd_extract!(b, 0);
1987
+ add = extracta - extractb;
1988
+ }
1989
+ simd_insert!(a, 0, add)
1990
+ }
1957
1991
}
1958
1992
1959
1993
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
@@ -2239,7 +2273,7 @@ pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
2239
2273
#[cfg_attr(test, assert_instr(vmulsh))]
2240
2274
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2241
2275
pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2242
- _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION> (a, b)
2276
+ unsafe { simd_insert! (a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
2243
2277
}
2244
2278
2245
2279
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -2252,7 +2286,16 @@ pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2252
2286
#[cfg_attr(test, assert_instr(vmulsh))]
2253
2287
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2254
2288
pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2255
- _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2289
+ unsafe {
2290
+ let extractsrc: f16 = simd_extract!(src, 0);
2291
+ let mut add: f16 = extractsrc;
2292
+ if (k & 0b00000001) != 0 {
2293
+ let extracta: f16 = simd_extract!(a, 0);
2294
+ let extractb: f16 = simd_extract!(b, 0);
2295
+ add = extracta * extractb;
2296
+ }
2297
+ simd_insert!(a, 0, add)
2298
+ }
2256
2299
}
2257
2300
2258
2301
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -2265,7 +2308,15 @@ pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
2265
2308
#[cfg_attr(test, assert_instr(vmulsh))]
2266
2309
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2267
2310
pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2268
- _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2311
+ unsafe {
2312
+ let mut add: f16 = 0.;
2313
+ if (k & 0b00000001) != 0 {
2314
+ let extracta: f16 = simd_extract!(a, 0);
2315
+ let extractb: f16 = simd_extract!(b, 0);
2316
+ add = extracta * extractb;
2317
+ }
2318
+ simd_insert!(a, 0, add)
2319
+ }
2269
2320
}
2270
2321
2271
2322
/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
@@ -2551,7 +2602,7 @@ pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
2551
2602
#[cfg_attr(test, assert_instr(vdivsh))]
2552
2603
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2553
2604
pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2554
- _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION> (a, b)
2605
+ unsafe { simd_insert! (a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
2555
2606
}
2556
2607
2557
2608
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@@ -2564,7 +2615,16 @@ pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2564
2615
#[cfg_attr(test, assert_instr(vdivsh))]
2565
2616
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2566
2617
pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2567
- _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2618
+ unsafe {
2619
+ let extractsrc: f16 = simd_extract!(src, 0);
2620
+ let mut add: f16 = extractsrc;
2621
+ if (k & 0b00000001) != 0 {
2622
+ let extracta: f16 = simd_extract!(a, 0);
2623
+ let extractb: f16 = simd_extract!(b, 0);
2624
+ add = extracta / extractb;
2625
+ }
2626
+ simd_insert!(a, 0, add)
2627
+ }
2568
2628
}
2569
2629
2570
2630
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@@ -2577,7 +2637,15 @@ pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
2577
2637
#[cfg_attr(test, assert_instr(vdivsh))]
2578
2638
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2579
2639
pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2580
- _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2640
+ unsafe {
2641
+ let mut add: f16 = 0.;
2642
+ if (k & 0b00000001) != 0 {
2643
+ let extracta: f16 = simd_extract!(a, 0);
2644
+ let extractb: f16 = simd_extract!(b, 0);
2645
+ add = extracta / extractb;
2646
+ }
2647
+ simd_insert!(a, 0, add)
2648
+ }
2581
2649
}
2582
2650
2583
2651
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
0 commit comments