Commit 46686b7
[CIR][CIRGen][Builtin][X86] Lower AVX masked load intrinsics (#1763)
For these intrinsics there seems to be only one function whose emitted IR diverges: `_mm_load_sbh` loads a single 16-bit bfloat (`__bf16`) value from memory into the lowest element of a 128-bit bfloat vector (`__m128bh`), leaving the remaining lanes either unchanged or filled with a pass-through value. It is implemented as a masked load with only the first lane enabled ([source for intrinsics with similar behaviour](https://gist.github.com/leopck/86799fee6ceb9649d0ebe32c1c6e5b85)).

In the CIR lowering of `_mm_load_sbh`, we currently emit the mask operand of the intrinsic (`llvm.masked.load`) as an explicit constant vector:

```llvm
<8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>
```

whereas OG lowers it to:

```llvm
<8 x i1> bitcast (<1 x i8> splat (i8 1) to <8 x i1>)
```

I believe the two forms are semantically equal, so: is it acceptable for CIR and OG to diverge in this way for masked loads, or should we aim for parity in how the mask is represented, even if that reduces readability in CIR?
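To make the claimed equivalence concrete, here is a minimal sketch (plain standard C++, not from the patch) of how an 8-bit integer mask relates to the `<8 x i1>` lane mask: bit *i* of the byte enables lane *i*, so `splat (i8 1)` bitcast to `<8 x i1>` and the explicit `<i1 true, i1 false, ...>` constant describe the same set of lanes.

```cpp
// Minimal sketch: expanding an 8-bit lane mask into per-lane booleans,
// mirroring how a <1 x i8> value bitcast to <8 x i1> is interpreted.
#include <array>
#include <cstdint>

std::array<bool, 8> expandMask(uint8_t mask) {
  std::array<bool, 8> lanes{};
  for (int i = 0; i < 8; ++i)
    lanes[i] = (mask >> i) & 1; // lane i is enabled iff bit i is set
  return lanes;
}

// expandMask(1) yields {true, false, false, false, false, false, false, false},
// i.e. exactly the constant vector CIR emits for _mm_load_sbh.
```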
1 parent 2ccc77e commit 46686b7

File tree

8 files changed (+642, -3 lines)


clang/lib/CIR/CodeGen/CIRGenBuilder.h

Lines changed: 28 additions & 0 deletions
```diff
@@ -895,6 +895,34 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return CIRBaseBuilderTy::createStore(loc, flag, dst);
   }
 
+  /// Create a call to a Masked Load intrinsic.
+  /// \p loc - expression location
+  /// \p ty - vector type to load
+  /// \p ptr - base pointer for the load
+  /// \p alignment - alignment of the source location
+  /// \p mask - vector of booleans which indicates what vector lanes should
+  ///           be accessed in memory
+  /// \p passThru - pass-through value that is used to fill the masked-off
+  ///               lanes of the result
+  mlir::Value createMaskedLoad(mlir::Location loc, mlir::Type ty,
+                               mlir::Value ptr, llvm::Align alignment,
+                               mlir::Value mask, mlir::Value passThru) {
+
+    assert(mlir::isa<cir::VectorType>(ty) && "Type should be vector");
+    assert(mask && "Mask should not be all-ones (null)");
+
+    if (!passThru)
+      passThru = this->getConstant(loc, cir::PoisonAttr::get(ty));
+
+    mlir::Value ops[] = {ptr, this->getUInt32(int32_t(alignment.value()), loc),
+                         mask, passThru};
+
+    return create<cir::LLVMIntrinsicCallOp>(loc, getStringAttr("masked.load"),
+                                            ty, ops)
+        .getResult();
+  }
+
   /// Create a call to a masked store intrinsic.
   /// \p loc - expression location
   /// \p val - data to be stored
```
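As a hedged usage sketch of the new builder helper (the names `builder`, `loc`, `vecTy`, `basePtr`, and `maskVec` below are assumed to be in scope; this is not code from the commit): passing a null pass-through value makes the helper substitute a poison vector of the result type for the masked-off lanes.

```cpp
// Hypothetical caller of createMaskedLoad; everything except the helper
// itself is an assumed local. A null passThru defaults the masked-off
// lanes to poison, per the `if (!passThru)` branch above.
mlir::Value loaded = builder.createMaskedLoad(loc, vecTy, basePtr,
                                              llvm::Align(1), maskVec,
                                              /*passThru=*/nullptr);
```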

clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp

Lines changed: 21 additions & 3 deletions
```diff
@@ -105,6 +105,19 @@ static mlir::Value emitX86MaskedStore(CIRGenFunction &cgf,
                             maskVec);
 }
 
+static mlir::Value emitX86MaskedLoad(CIRGenFunction &cgf,
+                                     ArrayRef<mlir::Value> ops,
+                                     llvm::Align alignment,
+                                     mlir::Location loc) {
+  mlir::Type ty = ops[1].getType();
+  mlir::Value ptr = ops[0];
+  mlir::Value maskVec =
+      getMaskVecValue(cgf, ops[2], cast<cir::VectorType>(ty).getSize(), loc);
+
+  return cgf.getBuilder().createMaskedLoad(loc, ty, ptr, alignment, maskVec,
+                                           ops[1]);
+}
+
 static mlir::Value emitX86SExtMask(CIRGenFunction &cgf, mlir::Value op,
                                    mlir::Type dstTy, mlir::Location loc) {
   unsigned numberOfElements = cast<cir::VectorType>(dstTy).getSize();
@@ -586,13 +599,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_loaddqudi128_mask:
   case X86::BI__builtin_ia32_loaddqudi256_mask:
   case X86::BI__builtin_ia32_loaddqudi512_mask:
-    llvm_unreachable("vfmaddsubph256_round_mask3 NYI");
+    return emitX86MaskedLoad(*this, Ops, llvm::Align(1),
+                             getLoc(E->getExprLoc()));
 
   case X86::BI__builtin_ia32_loadsbf16128_mask:
   case X86::BI__builtin_ia32_loadsh128_mask:
   case X86::BI__builtin_ia32_loadss128_mask:
   case X86::BI__builtin_ia32_loadsd128_mask:
-    llvm_unreachable("vfmaddsubph256_round_mask3 NYI");
+    return emitX86MaskedLoad(*this, Ops, llvm::Align(1),
+                             getLoc(E->getExprLoc()));
 
   case X86::BI__builtin_ia32_loadaps128_mask:
   case X86::BI__builtin_ia32_loadaps256_mask:
@@ -606,7 +621,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_movdqa64load128_mask:
   case X86::BI__builtin_ia32_movdqa64load256_mask:
   case X86::BI__builtin_ia32_movdqa64load512_mask:
-    llvm_unreachable("vfmaddsubph256_round_mask3 NYI");
+    return emitX86MaskedLoad(
+        *this, Ops,
+        getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign(),
+        getLoc(E->getExprLoc()));
 
   case X86::BI__builtin_ia32_expandloaddf128_mask:
   case X86::BI__builtin_ia32_expandloaddf256_mask:
```
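The helper relies on the masked-load builtins' fixed operand order. As a hedged illustration (the mapping below is inferred from clang's AVX-512 intrinsic headers, not spelled out in this commit), `_mm512_mask_loadu_epi64(__W, __U, __P)` expands to `__builtin_ia32_loaddqudi512_mask` with operands in `(pointer, pass-through, mask)` order:

```cpp
// Hedged sketch of the operand layout emitX86MaskedLoad assumes; compile
// with AVX-512 enabled (e.g. -mavx512f). Inferred from clang's intrinsic
// headers, not part of this commit.
#include <immintrin.h>

__m512i demo(__m512i passthru, __mmask8 mask, const long long *p) {
  // Lowers to __builtin_ia32_loaddqudi512_mask(p, passthru, mask), so in
  // emitX86MaskedLoad: ops[0] == p, ops[1] == passthru (which also fixes
  // the result vector type), ops[2] == mask.
  return _mm512_mask_loadu_epi64(passthru, mask, p);
}
```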

clang/test/CIR/CodeGen/X86/avx10_2bf16-builtins.c

Lines changed: 19 additions & 0 deletions
```diff
@@ -13,3 +13,22 @@ void test_mm_mask_store_sbh(void *__P, __mmask8 __U, __m128bh __A) {
   // LLVM: call void @llvm.masked.store.v8bf16.p0(<8 x bfloat> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}})
   _mm_mask_store_sbh(__P, __U, __A);
 }
+
+__m128bh test_mm_load_sbh(void const *A) {
+  // CIR-LABEL: _mm_load_sbh
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.bf16 x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.bf16 x 8>) -> !cir.vector<!cir.bf16 x 8>
+
+  // LLVM-LABEL: @test_mm_load_sbh
+  // NOTE: OG represents the mask using a bitcast from splat (i8 1), see IR-differences #1767
+  // LLVM: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x bfloat> %{{.*}})
+  return _mm_load_sbh(A);
+}
+
+__m128bh test_mm_mask_load_sbh(__m128bh __A, __mmask8 __U, const void *__W) {
+  // CIR-LABEL: _mm_mask_load_sbh
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.bf16 x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.bf16 x 8>) -> !cir.vector<!cir.bf16 x 8>
+
+  // LLVM-LABEL: @test_mm_mask_load_sbh
+  // LLVM: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}})
+  return _mm_mask_load_sbh(__A, __U, __W);
+}
```

clang/test/CIR/CodeGen/X86/avx512bw-builtins.c

Lines changed: 36 additions & 0 deletions
```diff
@@ -37,3 +37,39 @@ __m512i test_mm512_movm_epi16(__mmask32 __A) {
   // LLVM: %{{.*}} = sext <32 x i1> %{{.*}} to <32 x i16>
   return _mm512_movm_epi16(__A);
 }
+
+__m512i test_mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, void const *__P) {
+  // CIR-LABEL: _mm512_mask_loadu_epi8
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<{{!s8i|!u8i}} x 64>>, !u32i, !cir.vector<!cir.int<s, 1> x 64>, !cir.vector<{{!s8i|!u8i}} x 64>) -> !cir.vector<{{!s8i|!u8i}} x 64>
+
+  // LLVM-LABEL: @test_mm512_mask_loadu_epi8
+  // LLVM: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  return _mm512_mask_loadu_epi8(__W, __U, __P);
+}
+
+__m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) {
+  // CIR-LABEL: _mm512_mask_loadu_epi16
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s16i x 32>>, !u32i, !cir.vector<!cir.int<s, 1> x 32>, !cir.vector<!s16i x 32>) -> !cir.vector<!s16i x 32>
+
+  // LLVM-LABEL: @test_mm512_mask_loadu_epi16
+  // LLVM: @llvm.masked.load.v32i16.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  return _mm512_mask_loadu_epi16(__W, __U, __P);
+}
+
+__m512i test_mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) {
+  // CIR-LABEL: _mm512_maskz_loadu_epi16
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s16i x 32>>, !u32i, !cir.vector<!cir.int<s, 1> x 32>, !cir.vector<!s16i x 32>) -> !cir.vector<!s16i x 32>
+
+  // LLVM-LABEL: @test_mm512_maskz_loadu_epi16
+  // LLVM: @llvm.masked.load.v32i16.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  return _mm512_maskz_loadu_epi16(__U, __P);
+}
+
+__m512i test_mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) {
+  // CIR-LABEL: _mm512_maskz_loadu_epi8
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<{{!s8i|!u8i}} x 64>>, !u32i, !cir.vector<!cir.int<s, 1> x 64>, !cir.vector<{{!s8i|!u8i}} x 64>) -> !cir.vector<{{!s8i|!u8i}} x 64>
+
+  // LLVM-LABEL: @test_mm512_maskz_loadu_epi8
+  // LLVM: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  return _mm512_maskz_loadu_epi8(__U, __P);
+}
```

clang/test/CIR/CodeGen/X86/avx512f-builtins.c

Lines changed: 168 additions & 0 deletions
```diff
@@ -82,3 +82,171 @@ void test_mm512_mask_store_ps(void *p, __m512 a, __mmask16 m){
   // LLVM: @llvm.masked.store.v16f32.p0(<16 x float> %{{.*}}, ptr %{{.*}}, i32 64, <16 x i1> %{{.*}})
   _mm512_mask_store_ps(p, m, a);
 }
+
+__m512 test_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_loadu_ps
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.float>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!cir.float x 16>) -> !cir.vector<!cir.float x 16>
+
+  // LLVM-LABEL: test_mm512_mask_loadu_ps
+  // LLVM: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_mask_loadu_ps (__W,__U, __P);
+}
+
+__m512 test_mm512_maskz_load_ps(__mmask16 __U, void *__P)
+{
+
+  // CIR-LABEL: _mm512_maskz_load_ps
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.float x 16>>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!cir.float x 16>) -> !cir.vector<!cir.float x 16>
+
+  // LLVM-LABEL: test_mm512_maskz_load_ps
+  // LLVM: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_maskz_load_ps(__U, __P);
+}
+
+__m512d test_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_loadu_pd
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.double>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.double x 8>) -> !cir.vector<!cir.double x 8>
+
+  // LLVM-LABEL: test_mm512_mask_loadu_pd
+  // LLVM: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_mask_loadu_pd (__W,__U, __P);
+}
+
+__m512d test_mm512_maskz_load_pd(__mmask8 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_maskz_load_pd
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.double x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.double x 8>) -> !cir.vector<!cir.double x 8>
+
+  // LLVM-LABEL: test_mm512_maskz_load_pd
+  // LLVM: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_maskz_load_pd(__U, __P);
+}
+
+__m512i test_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_loadu_epi32
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!s32i>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!s32i x 16>) -> !cir.vector<!s32i x 16>
+
+  // LLVM-LABEL: test_mm512_mask_loadu_epi32
+  // LLVM: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_mask_loadu_epi32 (__W,__U, __P);
+}
+
+__m512i test_mm512_maskz_loadu_epi32 (__mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_maskz_loadu_epi32
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!s32i>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!s32i x 16>) -> !cir.vector<!s32i x 16>
+
+  // LLVM-LABEL: test_mm512_maskz_loadu_epi32
+  // LLVM: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_maskz_loadu_epi32 (__U, __P);
+}
+
+__m512i test_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_loadu_epi64
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!s64i>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!s64i x 8>) -> !cir.vector<!s64i x 8>
+
+  // LLVM-LABEL: test_mm512_mask_loadu_epi64
+  // LLVM: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_mask_loadu_epi64 (__W,__U, __P);
+}
+
+__m512i test_mm512_maskz_loadu_epi64 (__mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_maskz_loadu_epi64
+  // CIR: cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!s64i>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!s64i x 8>) -> !cir.vector<!s64i x 8>
+
+  // LLVM-LABEL: test_mm512_maskz_loadu_epi64
+  // LLVM: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_maskz_loadu_epi64 (__U, __P);
+}
+
+__m128 test_mm_mask_load_ss(__m128 __A, __mmask8 __U, const float* __W)
+{
+  // CIR-LABEL: _mm_mask_load_ss
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.float x 4>>, !u32i, !cir.vector<!cir.int<s, 1> x 4>, !cir.vector<!cir.float x 4>) -> !cir.vector<!cir.float x 4>
+
+  // LLVM-LABEL: test_mm_mask_load_ss
+  // LLVM: call {{.*}}<4 x float> @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_mask_load_ss(__A, __U, __W);
+}
+
+__m128 test_mm_maskz_load_ss (__mmask8 __U, const float * __W)
+{
+  // CIR-LABEL: _mm_maskz_load_ss
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.float x 4>>, !u32i, !cir.vector<!cir.int<s, 1> x 4>, !cir.vector<!cir.float x 4>) -> !cir.vector<!cir.float x 4>
+
+  // LLVM-LABEL: test_mm_maskz_load_ss
+  // LLVM: call {{.*}}<4 x float> @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_maskz_load_ss (__U, __W);
+}
+
+__m128d test_mm_mask_load_sd (__m128d __A, __mmask8 __U, const double * __W)
+{
+  // CIR-LABEL: _mm_mask_load_sd
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.double x 2>>, !u32i, !cir.vector<!cir.int<s, 1> x 2>, !cir.vector<!cir.double x 2>) -> !cir.vector<!cir.double x 2>
+
+  // LLVM-LABEL: test_mm_mask_load_sd
+  // LLVM: call {{.*}}<2 x double> @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_mask_load_sd (__A, __U, __W);
+}
+
+__m128d test_mm_maskz_load_sd (__mmask8 __U, const double * __W)
+{
+  // CIR-LABEL: _mm_maskz_load_sd
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.double x 2>>, !u32i, !cir.vector<!cir.int<s, 1> x 2>, !cir.vector<!cir.double x 2>) -> !cir.vector<!cir.double x 2>
+
+  // LLVM-LABEL: test_mm_maskz_load_sd
+  // LLVM: call {{.*}}<2 x double> @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_maskz_load_sd (__U, __W);
+}
+
+__m512 test_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_load_ps
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.float x 16>>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!cir.float x 16>) -> !cir.vector<!cir.float x 16>
+
+  // LLVM-LABEL: test_mm512_mask_load_ps
+  // LLVM: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_mask_load_ps (__W,__U, __P);
+}
+
+__m512d test_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void *__P)
+{
+  // CIR-LABEL: _mm512_mask_load_pd
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.double x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.double x 8>) -> !cir.vector<!cir.double x 8>
+
+  // LLVM-LABEL: test_mm512_mask_load_pd
+  // LLVM: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_mask_load_pd (__W,__U, __P);
+}
+
+__m512i test_mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P) {
+  // CIR-LABEL: _mm512_mask_load_epi32
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s32i x 16>>, !u32i, !cir.vector<!cir.int<s, 1> x 16>, !cir.vector<!s32i x 16>) -> !cir.vector<!s32i x 16>
+
+  // LLVM-LABEL: test_mm512_mask_load_epi32
+  // LLVM: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_mask_load_epi32(__W, __U, __P);
+}
+
+__m512i test_mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm512_mask_load_epi64
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s64i x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!s64i x 8>) -> !cir.vector<!s64i x 8>
+
+  // LLVM-LABEL: test_mm512_mask_load_epi64
+  // LLVM: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_mask_load_epi64(__W, __U, __P);
+}
+
+__m512i test_mm512_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CIR-LABEL: _mm512_maskz_load_epi64
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!s64i x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!s64i x 8>) -> !cir.vector<!s64i x 8>
+
+  // LLVM-LABEL: test_mm512_maskz_load_epi64
+  // LLVM: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_maskz_load_epi64(__U, __P);
+}
```

clang/test/CIR/CodeGen/X86/avx512fp16-builtins.c

Lines changed: 18 additions & 0 deletions
```diff
@@ -14,3 +14,21 @@ void test_mm_mask_store_sh(void *__P, __mmask8 __U, __m128h __A) {
   // LLVM: call void @llvm.masked.store.v8f16.p0(<8 x half> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}})
   _mm_mask_store_sh(__P, __U, __A);
 }
+
+__m128h test_mm_mask_load_sh(__m128h __A, __mmask8 __U, const void *__W) {
+  // CIR-LABEL: _mm_mask_load_sh
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.f16 x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.f16 x 8>) -> !cir.vector<!cir.f16 x 8>
+
+  // LLVM-LABEL: @test_mm_mask_load_sh
+  // LLVM: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}})
+  return _mm_mask_load_sh(__A, __U, __W);
+}
+
+__m128h test_mm_maskz_load_sh(__mmask8 __U, const void *__W) {
+  // CIR-LABEL: _mm_maskz_load_sh
+  // CIR: %{{.*}} = cir.llvm.intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<!cir.f16 x 8>>, !u32i, !cir.vector<!cir.int<s, 1> x 8>, !cir.vector<!cir.f16 x 8>) -> !cir.vector<!cir.f16 x 8>
+
+  // LLVM-LABEL: @test_mm_maskz_load_sh
+  // LLVM: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}})
+  return _mm_maskz_load_sh(__U, __W);
+}
```
