Skip to content

Commit c543481

Browse files
authored
[CIR][CIRGen][Builtin][X86] Lower cvt*2mask intrinsics (#1894)
Three things: - Corrected comments to `getZeroInitAttr` as [we return more than only integrals in that function](https://github.com/llvm/clangir/blob/2ea4005fa0aa291295b19c200860b5edf9b864b3/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h#L133). - Given that `emitX86MaskedCompare` and `emitX86MaskedCompareResult` helpers are pretty large, Added NYI statements on paths not related to the current set of intrinsics so review is specific to the ones encoded. - Added test comments related to the behavior observed coming from the canonicalizer on: #1770
1 parent 822684e commit c543481

File tree

7 files changed

+226
-5
lines changed

7 files changed

+226
-5
lines changed

clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
7070
return cir::ConstantOp::create(*this, loc, attr);
7171
}
7272

73-
// Creates constant null value for integral type ty.
73+
// Creates constant null value for the given type ty.
7474
cir::ConstantOp getNullValue(mlir::Type ty, mlir::Location loc) {
7575
return cir::ConstantOp::create(*this, loc, getZeroInitAttr(ty));
7676
}

clang/lib/CIR/CodeGen/CIRGenBuilder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -994,7 +994,7 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
994994
/// Create a unary shuffle. The second vector operand of the IR instruction
995995
/// is poison.
996996
return createVecShuffle(
997-
loc, vec1, getConstant(loc, cir::PoisonAttr::get(vec1.getType())),
997+
loc, vec1, getConstant(loc, getAttr<cir::PoisonAttr>(vec1.getType())),
998998
mask);
999999
}
10001000

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,85 @@ static mlir::Value emitX86PSLLDQIByteShift(CIRGenFunction &cgf,
201201
return builder.createBitcast(shuffleResult, resultType);
202202
}
203203

204+
static mlir::Value emitX86MaskedCompareResult(CIRGenFunction &cgf,
205+
mlir::Value cmp, unsigned numElts,
206+
mlir::Value maskIn,
207+
mlir::Location loc) {
208+
if (maskIn) {
209+
llvm_unreachable("NYI");
210+
}
211+
if (numElts < 8) {
212+
int64_t indices[8];
213+
for (unsigned i = 0; i != numElts; ++i)
214+
indices[i] = i;
215+
for (unsigned i = numElts; i != 8; ++i)
216+
indices[i] = i % numElts + numElts;
217+
218+
// This should shuffle between cmp (first vector) and null (second vector)
219+
mlir::Value nullVec = cgf.getBuilder().getNullValue(cmp.getType(), loc);
220+
cmp = cgf.getBuilder().createVecShuffle(loc, cmp, nullVec, indices);
221+
}
222+
return cgf.getBuilder().createBitcast(
223+
cmp, cgf.getBuilder().getUIntNTy(std::max(numElts, 8U)));
224+
}
225+
226+
static mlir::Value emitX86MaskedCompare(CIRGenFunction &cgf, unsigned cc,
227+
bool isSigned,
228+
ArrayRef<mlir::Value> ops,
229+
mlir::Location loc) {
230+
assert((ops.size() == 2 || ops.size() == 4) &&
231+
"Unexpected number of arguments");
232+
unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
233+
mlir::Value cmp;
234+
235+
if (cc == 3) {
236+
llvm_unreachable("NYI");
237+
} else if (cc == 7) {
238+
llvm_unreachable("NYI");
239+
} else {
240+
cir::CmpOpKind pred;
241+
switch (cc) {
242+
default:
243+
llvm_unreachable("Unknown condition code");
244+
case 0:
245+
pred = cir::CmpOpKind::eq;
246+
break;
247+
case 1:
248+
pred = cir::CmpOpKind::lt;
249+
break;
250+
case 2:
251+
pred = cir::CmpOpKind::le;
252+
break;
253+
case 4:
254+
pred = cir::CmpOpKind::ne;
255+
break;
256+
case 5:
257+
pred = cir::CmpOpKind::ge;
258+
break;
259+
case 6:
260+
pred = cir::CmpOpKind::gt;
261+
break;
262+
}
263+
264+
auto resultTy = cgf.getBuilder().getType<cir::VectorType>(
265+
cgf.getBuilder().getUIntNTy(1), numElts);
266+
cmp = cgf.getBuilder().create<cir::VecCmpOp>(loc, resultTy, pred, ops[0],
267+
ops[1]);
268+
}
269+
270+
mlir::Value maskIn;
271+
if (ops.size() == 4)
272+
maskIn = ops[3];
273+
274+
return emitX86MaskedCompareResult(cgf, cmp, numElts, maskIn, loc);
275+
}
276+
277+
static mlir::Value emitX86ConvertToMask(CIRGenFunction &cgf, mlir::Value in,
278+
mlir::Location loc) {
279+
cir::ConstantOp zero = cgf.getBuilder().getNullValue(in.getType(), loc);
280+
return emitX86MaskedCompare(cgf, 1, true, {in, zero}, loc);
281+
}
282+
204283
mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
205284
const CallExpr *E) {
206285
if (BuiltinID == Builtin::BI__builtin_cpu_is)
@@ -547,6 +626,20 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
547626
case X86::BI__builtin_ia32_cvtmask2q512:
548627
return emitX86SExtMask(*this, Ops[0], convertType(E->getType()),
549628
getLoc(E->getExprLoc()));
629+
630+
case X86::BI__builtin_ia32_cvtb2mask128:
631+
case X86::BI__builtin_ia32_cvtb2mask256:
632+
case X86::BI__builtin_ia32_cvtb2mask512:
633+
case X86::BI__builtin_ia32_cvtw2mask128:
634+
case X86::BI__builtin_ia32_cvtw2mask256:
635+
case X86::BI__builtin_ia32_cvtw2mask512:
636+
case X86::BI__builtin_ia32_cvtd2mask128:
637+
case X86::BI__builtin_ia32_cvtd2mask256:
638+
case X86::BI__builtin_ia32_cvtd2mask512:
639+
case X86::BI__builtin_ia32_cvtq2mask128:
640+
case X86::BI__builtin_ia32_cvtq2mask256:
641+
case X86::BI__builtin_ia32_cvtq2mask512:
642+
return emitX86ConvertToMask(*this, Ops[0], getLoc(E->getExprLoc()));
550643
case X86::BI__builtin_ia32_cvtdq2ps512_mask:
551644
case X86::BI__builtin_ia32_cvtqq2ps512_mask:
552645
case X86::BI__builtin_ia32_cvtqq2pd512_mask:

clang/test/CIR/CodeGen/X86/avx512bw-builtins.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
77
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
88
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
9-
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
9+
// RUN: FileCheck --check-prefixes=LLVM-UNSIGNED-CHAR --input-file=%t.ll %s
10+
11+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefix=OGCG
12+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefix=OGCG
1013

1114
#include <immintrin.h>
1215

@@ -73,3 +76,31 @@ __m512i test_mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) {
7376
// LLVM: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
7477
return _mm512_maskz_loadu_epi8(__U, __P);
7578
}
79+
80+
__mmask64 test_mm512_movepi8_mask(__m512i __A) {
81+
// CIR-LABEL: @_mm512_movepi8_mask
82+
// CIR: %{{.*}} = cir.vec.cmp(lt, %{{.*}}, %{{.*}}) : !cir.vector<{{!s8i|!u8i}} x 64>, !cir.vector<!cir.int<u, 1> x 64>
83+
84+
// LLVM-LABEL: @test_mm512_movepi8_mask
85+
// LLVM: [[CMP:%.*]] = icmp slt <64 x i8> %{{.*}}, zeroinitializer
86+
87+
// In the unsigned case below, the canonicalizer proves the comparison is
88+
// always false (no i8 unsigned value can be < 0) and folds it away.
89+
// LLVM-UNSIGNED-CHAR: store i64 0, ptr %{{.*}}, align 8
90+
91+
// OGCG-LABEL: @test_mm512_movepi8_mask
92+
// OGCG: [[CMP:%.*]] = icmp slt <64 x i8> %{{.*}}, zeroinitializer
93+
return _mm512_movepi8_mask(__A);
94+
}
95+
96+
__mmask32 test_mm512_movepi16_mask(__m512i __A) {
97+
// CIR-LABEL: @_mm512_movepi16_mask
98+
// CIR: %{{.*}} = cir.vec.cmp(lt, %{{.*}}, %{{.*}}) : !cir.vector<!s16i x 32>, !cir.vector<!cir.int<u, 1> x 32>
99+
100+
// LLVM-LABEL: @test_mm512_movepi16_mask
101+
// LLVM: [[CMP:%.*]] = icmp slt <32 x i16> %{{.*}}, zeroinitializer
102+
103+
// OGCG-LABEL: @test_mm512_movepi16_mask
104+
// OGCG: [[CMP:%.*]] = icmp slt <32 x i16> %{{.*}}, zeroinitializer
105+
return _mm512_movepi16_mask(__A);
106+
}

clang/test/CIR/CodeGen/X86/avx512dq-builtins.c

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
33
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fclangir -emit-llvm -o %t.ll -Wall -Werror
44
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
5+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefix=OGCG
56

67
#include <immintrin.h>
78

@@ -49,4 +50,28 @@ __m512i test_mm512_inserti64x2(__m512i __A, __m128i __B) {
4950
// LLVM-LABEL: @test_mm512_inserti64x2
5051
// LLVM: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
5152
return _mm512_inserti64x2(__A, __B, 1);
52-
}
53+
}
54+
55+
__mmask16 test_mm512_movepi32_mask(__m512i __A) {
56+
// CIR-LABEL: _mm512_movepi32_mask
57+
// CIR: %{{.*}} = cir.vec.cmp(lt, %{{.*}}, %{{.*}}) : !cir.vector<!s32i x 16>, !cir.vector<!cir.int<u, 1> x 16>
58+
59+
// LLVM-LABEL: @test_mm512_movepi32_mask
60+
// LLVM: [[CMP:%.*]] = icmp slt <16 x i32> %{{.*}}, zeroinitializer
61+
62+
// OGCG-LABEL: @test_mm512_movepi32_mask
63+
// OGCG: [[CMP:%.*]] = icmp slt <16 x i32> %{{.*}}, zeroinitializer
64+
return _mm512_movepi32_mask(__A);
65+
}
66+
67+
__mmask8 test_mm512_movepi64_mask(__m512i __A) {
68+
// CIR-LABEL: @_mm512_movepi64_mask
69+
// CIR: %{{.*}} = cir.vec.cmp(lt, %{{.*}}, %{{.*}}) : !cir.vector<!s64i x 8>, !cir.vector<!cir.int<u, 1> x 8>
70+
71+
// LLVM-LABEL: @test_mm512_movepi64_mask
72+
// LLVM: [[CMP:%.*]] = icmp slt <8 x i64> %{{.*}}, zeroinitializer
73+
74+
// OGCG-LABEL: @test_mm512_movepi64_mask
75+
// OGCG: [[CMP:%.*]] = icmp slt <8 x i64> %{{.*}}, zeroinitializer
76+
return _mm512_movepi64_mask(__A);
77+
}

clang/test/CIR/CodeGen/X86/avx512vlbw-buiiltins.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,16 @@
66
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
77
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
88
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
9-
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
9+
// RUN: FileCheck --check-prefixes=LLVM-UNSIGNED-CHAR --input-file=%t.ll %s
1010

1111
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx10.1-512 -target-feature +avx512vl -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
1212
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
1313
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx10.1-512 -target-feature +avx512vl -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
1414
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
1515

16+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
17+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
18+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx10.1-512 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
1619

1720
#include <immintrin.h>
1821

@@ -188,3 +191,31 @@ __m512i test_mm512_maskz_load_epi32(__mmask16 __U, void const *__P) {
188191
// LLVM: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
189192
return _mm512_maskz_load_epi32(__U, __P);
190193
}
194+
195+
__mmask16 test_mm_movepi8_mask(__m128i __A) {
196+
// CIR-LABEL: _mm_movepi8_mask
197+
// CIR: %{{.*}} = cir.vec.cmp(lt, %{{.*}}, %{{.*}}) : !cir.vector<{{!s8i|!u8i}} x 16>, !cir.vector<!cir.int<u, 1> x 16>
198+
199+
// LLVM-LABEL: @test_mm_movepi8_mask
200+
// LLVM: [[CMP:%.*]] = icmp slt <16 x i8> %{{.*}}, zeroinitializer
201+
202+
// In the unsigned case below, the canonicalizer proves the comparison is
203+
// always false (no i8 unsigned value can be < 0) and folds it away.
204+
// LLVM-UNSIGNED-CHAR: store i16 0, ptr %{{.*}}, align 2
205+
206+
// OGCG-LABEL: @test_mm_movepi8_mask
207+
// OGCG: [[CMP:%.*]] = icmp slt <16 x i8> %{{.*}}, zeroinitializer
208+
return _mm_movepi8_mask(__A);
209+
}
210+
211+
__mmask16 test_mm256_movepi16_mask(__m256i __A) {
212+
// CIR-LABEL: _mm256_movepi16_mask
213+
// CIR: %{{.*}} = cir.vec.cmp(lt, %{{.*}}, %{{.*}}) : !cir.vector<!s16i x 16>, !cir.vector<!cir.int<u, 1> x 16>
214+
215+
// LLVM-LABEL: @test_mm256_movepi16_mask
216+
// LLVM: [[CMP:%.*]] = icmp slt <16 x i16> %{{.*}}, zeroinitializer
217+
218+
// OGCG-LABEL: @test_mm256_movepi16_mask
219+
// OGCG: [[CMP:%.*]] = icmp slt <16 x i16> %{{.*}}, zeroinitializer
220+
return _mm256_movepi16_mask(__A);
221+
}

clang/test/CIR/CodeGen/X86/avx512vldq-builtins.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
33
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -target-feature +avx512vl -fclangir -emit-llvm -o %t.ll -Wall -Werror
44
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
5+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
56

67
#include <immintrin.h>
78

@@ -83,3 +84,43 @@ __m256i test_mm256_inserti64x2(__m256i __A, __m128i __B) {
8384
// LLVM: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
8485
return _mm256_inserti64x2(__A, __B, 1);
8586
}
87+
88+
__mmask8 test_mm256_movepi32_mask(__m256i __A) {
89+
// LLVM-LABEL: @test_mm256_movepi32_mask
90+
// LLVM: [[CMP:%.*]] = icmp slt <8 x i32> %{{.*}}, zeroinitializer
91+
92+
// OGCG-LABEL: @test_mm256_movepi32_mask
93+
// OGCG: [[CMP:%.*]] = icmp slt <8 x i32> %{{.*}}, zeroinitializer
94+
return _mm256_movepi32_mask(__A);
95+
}
96+
97+
__mmask8 test_mm_movepi64_mask(__m128i __A) {
98+
// CIR-LABEL: _mm_movepi64_mask
99+
// CIR: %{{.*}} = cir.vec.cmp(lt, %{{.*}}, %{{.*}}) : !cir.vector<!s64i x 2>, !cir.vector<!cir.int<u, 1> x 2>
100+
// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.int<u, 1> x 2>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<!cir.int<u, 1> x 8>
101+
// CIR: %{{.*}} = cir.cast(bitcast, %{{.*}} : !cir.vector<!cir.int<u, 1> x 8>), !u8i
102+
103+
// LLVM-LABEL: @test_mm_movepi64_mask
104+
// LLVM: [[CMP:%.*]] = icmp slt <2 x i64> %{{.*}}, zeroinitializer
105+
// LLVM: [[SHUF:%.*]] = shufflevector <2 x i1> [[CMP]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
106+
107+
// OGCG-LABEL: @test_mm_movepi64_mask
108+
// OGCG: [[CMP:%.*]] = icmp slt <2 x i64> %{{.*}}, zeroinitializer
109+
// OGCG: [[SHUF:%.*]] = shufflevector <2 x i1> [[CMP]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
110+
return _mm_movepi64_mask(__A);
111+
}
112+
113+
__mmask8 test_mm256_movepi64_mask(__m256i __A) {
114+
// CIR-LABEL: _mm256_movepi64_mask
115+
// CIR: %{{.*}} = cir.vec.cmp(lt, %{{.*}}, %{{.*}}) : !cir.vector<!s64i x 4>, !cir.vector<!cir.int<u, 1> x 4>
116+
// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<!cir.int<u, 1> x 4>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<!cir.int<u, 1> x 8>
117+
118+
// LLVM-LABEL: @test_mm256_movepi64_mask
119+
// LLVM: [[CMP:%.*]] = icmp slt <4 x i64> %{{.*}}, zeroinitializer
120+
// LLVM: [[SHUF:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
121+
122+
// OGCG-LABEL: @test_mm256_movepi64_mask
123+
// OGCG: [[CMP:%.*]] = icmp slt <4 x i64> %{{.*}}, zeroinitializer
124+
// OGCG: [[SHUF:%.*]] = shufflevector <4 x i1> [[CMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
125+
return _mm256_movepi64_mask(__A);
126+
}

0 commit comments

Comments
 (0)