Commit 01d8791

Fix and document _mmX_alignr_epiX family of intrinsics
1 parent 764ae1d commit 01d8791

3 files changed: +36 −27


crates/core_arch/src/x86/avx2.rs

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
     // If palignr is shifting the pair of vectors more than the size of two
     // lanes, emit zero.
-    if IMM8 > 32 {
+    if IMM8 >= 32 {
         return _mm256_set1_epi8(0);
     }
     // If palignr is shifting the pair of input vectors more than one lane,
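
Why `>=` matters here: palignr produces each 16-byte result from a 32-byte concatenation, so a byte shift of exactly 32 already moves every byte out of range and must yield zero; the old `IMM8 > 32` check let 32 fall through to the shuffle path. A minimal scalar sketch of the per-lane semantics (an illustrative helper of our own, not the stdarch code):

/// Scalar sketch of palignr's per-128-bit-lane behavior: concatenate
/// `b` (low) and `a` (high) into 32 bytes, shift right by `imm` bytes,
/// keep the low 16. Illustrative only.
fn palignr_lane(a: [u8; 16], b: [u8; 16], imm: usize) -> [u8; 16] {
    let mut concat = [0u8; 32];
    concat[..16].copy_from_slice(&b);
    concat[16..].copy_from_slice(&a);
    let mut out = [0u8; 16];
    for i in 0..16 {
        // Bytes shifted in from beyond the 32-byte concatenation are zero.
        out[i] = if imm + i < 32 { concat[imm + i] } else { 0 };
    }
    out
}

fn main() {
    let a = [1u8; 16];
    let b = [2u8; 16];
    // imm == 32 shifts every byte out, so the result must be all zeros;
    // that is exactly what the `IMM8 >= 32` check now guarantees.
    assert_eq!(palignr_lane(a, b, 32), [0u8; 16]);
}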

crates/core_arch/src/x86/avx512bw.rs

Lines changed: 10 additions & 2 deletions
@@ -4,6 +4,8 @@ use crate::{
     ptr,
 };

+use core::hint::unreachable_unchecked;
+
 #[cfg(test)]
 use stdarch_test::assert_instr;

@@ -11108,6 +11110,8 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
 }

 /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst.
+/// Unlike the [`_mm_alignr_epi8`] and [`_mm256_alignr_epi8`] functions, where the entire input vectors are concatenated into the temporary result,
+/// this concatenation happens in 4 steps, each of which builds a 32-byte temporary result.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi8&expand=263)
 #[inline]
@@ -11118,7 +11122,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
 pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
     // If palignr is shifting the pair of vectors more than the size of two
     // lanes, emit zero.
-    if IMM8 > 32 {
+    if IMM8 >= 32 {
         return _mm512_set1_epi8(0);
     }
     // If palignr is shifting the pair of input vectors more than one lane,
@@ -11131,6 +11135,10 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
     let a = a.as_i8x64();
     let b = b.as_i8x64();

+    if IMM8 == 16 {
+        return transmute(a);
+    }
+
     let r: i8x64 = match IMM8 % 16 {
         0 => simd_shuffle!(
             b,
@@ -11289,7 +11297,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
                 121, 122, 123, 124, 125, 126,
             ],
         ),
-        _ => b,
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
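
The 4-step behavior documented above can be modeled in scalar code: the 512-bit vpalignr runs the 16-byte palignr on each of the four lane pairs independently, which is also why `IMM8 == 16` can simply return `a` (each 32-byte temporary shifted right by 16 bytes leaves exactly the `a` half). A self-contained sketch under those semantics (helper names are ours, not the stdarch code):

/// Scalar sketch of the 512-bit vpalignr: each of the 4 pairs of 16-byte
/// lanes is aligned independently, exactly like the 128-bit palignr.
fn palignr_512(a: [u8; 64], b: [u8; 64], imm: usize) -> [u8; 64] {
    let mut out = [0u8; 64];
    for lane in 0..4 {
        let lo = lane * 16;
        // Step `lane`: build the 32-byte temporary from this lane pair.
        let mut concat = [0u8; 32];
        concat[..16].copy_from_slice(&b[lo..lo + 16]);
        concat[16..].copy_from_slice(&a[lo..lo + 16]);
        for i in 0..16 {
            out[lo + i] = if imm + i < 32 { concat[imm + i] } else { 0 };
        }
    }
    out
}

fn main() {
    let a = [7u8; 64];
    let b = [9u8; 64];
    // Shifting each 32-byte temporary by exactly 16 bytes keeps the `a`
    // half of every lane pair: the IMM8 == 16 fast path added above.
    assert_eq!(palignr_512(a, b, 16), a);
}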

crates/core_arch/src/x86/avx512f.rs

Lines changed: 25 additions & 24 deletions
@@ -6,6 +6,7 @@ use crate::{
     mem, ptr,
 };

+use core::hint::unreachable_unchecked;
 #[cfg(test)]
 use stdarch_test::assert_instr;

@@ -27095,6 +27096,8 @@ pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d

 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst.
 ///
+/// <div class="warning">Only the lowest <strong>4 bits</strong> of imm8 are used (a shift of at most 60 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi32&expand=245)
 #[inline]
 #[target_feature(enable = "avx512f")]
@@ -27162,7 +27165,8 @@ pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
         12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
         13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
         14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
+        15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
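
All six valignd/valignq-backed intrinsics in this file share one semantic, and the `% N` fixes below follow from it: the hardware reduces imm8 modulo the element count N, so only the low log2(N) bits of the immediate participate. A scalar sketch of that shared behavior (illustrative helper, ours):

/// Scalar sketch of VALIGND/VALIGNQ: concatenate b (low) and a (high),
/// shift right by imm8 elements, keep the low N elements. Only
/// imm8 % N matters, i.e. the low log2(N) bits of the immediate.
fn valign_model(a: &[i32], b: &[i32], imm8: u8) -> Vec<i32> {
    let n = a.len();
    assert_eq!(n, b.len());
    let shift = (imm8 as usize) % n; // the hardware ignores higher bits
    let concat: Vec<i32> = b.iter().chain(a.iter()).copied().collect();
    concat[shift..shift + n].to_vec()
}

fn main() {
    let a = [10, 11, 12, 13, 14, 15, 16, 17]; // 8-element (256-bit) case
    let b = [0, 1, 2, 3, 4, 5, 6, 7];
    // imm8 = 9 behaves exactly like imm8 = 1: only the low 3 bits count,
    // which is the `IMM8 % 8` this commit fixes for _mm256_alignr_epi32.
    assert_eq!(valign_model(&a, &b, 9), valign_model(&a, &b, 1));
    assert_eq!(valign_model(&a, &b, 1), vec![1, 2, 3, 4, 5, 6, 7, 10]);
}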
@@ -27207,6 +27211,8 @@ pub unsafe fn _mm512_maskz_alignr_epi32<const IMM8: i32>(

 /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst.
 ///
+/// <div class="warning">Only the lowest <strong>3 bits</strong> of imm8 are used (a shift of at most 28 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi32&expand=242)
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
@@ -27217,7 +27223,7 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
     let a = a.as_i32x8();
     let b = b.as_i32x8();
-    let imm8: i32 = IMM8 % 16;
+    let imm8: i32 = IMM8 % 8;
     let r: i32x8 = match imm8 {
         0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
         1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
@@ -27227,14 +27233,7 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
         5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
         6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
         7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
-        8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        9 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        10 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        11 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        12 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        13 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        14 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
@@ -27279,6 +27278,8 @@ pub unsafe fn _mm256_maskz_alignr_epi32<const IMM8: i32>(

 /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst.
 ///
+/// <div class="warning">Only the lowest <strong>2 bits</strong> of imm8 are used (a shift of at most 12 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi32&expand=239)
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
@@ -27289,16 +27290,13 @@ pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
     static_assert_uimm_bits!(IMM8, 8);
     let a = a.as_i32x4();
     let b = b.as_i32x4();
-    let imm8: i32 = IMM8 % 8;
+    let imm8: i32 = IMM8 % 4;
     let r: i32x4 = match imm8 {
         0 => simd_shuffle!(a, b, [4, 5, 6, 7]),
         1 => simd_shuffle!(a, b, [5, 6, 7, 0]),
         2 => simd_shuffle!(a, b, [6, 7, 0, 1]),
         3 => simd_shuffle!(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle!(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle!(a, b, [1, 2, 3, 0]),
-        6 => simd_shuffle!(a, b, [2, 3, 0, 1]),
-        _ => simd_shuffle!(a, b, [3, 0, 1, 2]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
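
A usage sketch of the wrap-around the warnings describe, assuming a toolchain where these AVX-512 intrinsics are exposed in core::arch (they were nightly-only behind stdarch's AVX-512 feature for a long time) and a CPU with AVX512F+VL; otherwise the probe just skips:

#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        use core::arch::x86_64::*;
        unsafe {
            let a = _mm_setr_epi32(4, 5, 6, 7);
            let b = _mm_setr_epi32(0, 1, 2, 3);
            // Only the low 2 bits of the immediate are used, so a shift
            // of 5 elements behaves exactly like a shift of 1.
            let r5 = _mm_alignr_epi32::<5>(a, b);
            let r1 = _mm_alignr_epi32::<1>(a, b);
            // All 16 byte-compare lanes equal => identical results.
            assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r5, r1)), 0xffff);
        }
    } else {
        println!("AVX-512F/VL not detected; skipping");
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}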
@@ -27343,6 +27341,8 @@ pub unsafe fn _mm_maskz_alignr_epi32<const IMM8: i32>(

 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst.
 ///
+/// <div class="warning">Only the lowest <strong>3 bits</strong> of imm8 are used (a shift of at most 56 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi64&expand=254)
 #[inline]
 #[target_feature(enable = "avx512f")]
@@ -27360,7 +27360,8 @@ pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
         4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
         5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
         6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
-        _ => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+        7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
@@ -27405,6 +27406,8 @@ pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(

 /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst.
 ///
+/// <div class="warning">Only the lowest <strong>2 bits</strong> of imm8 are used (a shift of at most 24 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi64&expand=251)
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
@@ -27413,16 +27416,13 @@ pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    let imm8: i32 = IMM8 % 8;
+    let imm8: i32 = IMM8 % 4;
     let r: i64x4 = match imm8 {
         0 => simd_shuffle!(a, b, [4, 5, 6, 7]),
         1 => simd_shuffle!(a, b, [5, 6, 7, 0]),
         2 => simd_shuffle!(a, b, [6, 7, 0, 1]),
         3 => simd_shuffle!(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle!(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle!(a, b, [1, 2, 3, 4]),
-        6 => simd_shuffle!(a, b, [2, 3, 4, 5]),
-        _ => simd_shuffle!(a, b, [3, 4, 5, 6]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
@@ -27467,6 +27467,8 @@ pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(

 /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst.
 ///
+/// <div class="warning">Only the lowest <strong>bit</strong> of imm8 is used (a shift of at most 8 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi64&expand=248)
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
@@ -27475,12 +27477,11 @@ pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
     static_assert_uimm_bits!(IMM8, 8);
-    let imm8: i32 = IMM8 % 4;
+    let imm8: i32 = IMM8 % 2;
     let r: i64x2 = match imm8 {
         0 => simd_shuffle!(a, b, [2, 3]),
         1 => simd_shuffle!(a, b, [3, 0]),
-        2 => simd_shuffle!(a, b, [0, 1]),
-        _ => simd_shuffle!(a, b, [1, 2]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
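
One note on the recurring `_ => unreachable_unchecked()` arm: it is sound only because the preceding `%` reduction pins the match scrutinee into the covered range, and it replaces fallbacks that silently mapped out-of-range immediates onto an arbitrary shuffle. A standalone sketch of the pattern (hypothetical function of ours, not from the commit):

use core::hint::unreachable_unchecked;

/// Illustration of the pattern used throughout this commit: reducing the
/// immediate modulo the number of covered arms makes every other value
/// impossible, so the catch-all can tell the optimizer it is dead code.
unsafe fn pick<const IMM8: i32>() -> &'static str {
    match IMM8 % 2 {
        0 => "even shift",
        1 => "odd shift",
        // SAFETY: `IMM8 % 2` for a non-negative IMM8 is always 0 or 1.
        _ => unreachable_unchecked(),
    }
}

fn main() {
    // IMM8 = 5 reduces to 1, so the "odd shift" arm is taken.
    assert_eq!(unsafe { pick::<5>() }, "odd shift");
}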
