Skip to content

Commit ca14a8a

Browse files
authored
[Clang] Add masked vector builtins for expand and compress access (llvm#156042)
Summary: The interface here is nearly identical to the already added masked loads and stores. These bind to very similar intrinsics so we add them here.
1 parent 917f022 commit ca14a8a

File tree

7 files changed

+142
-22
lines changed

7 files changed

+142
-22
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -950,16 +950,31 @@ argument is always boolean mask vector. The ``__builtin_masked_load`` builtin
950950
takes an optional third vector argument that will be used for the result of the
951951
masked-off lanes. These builtins assume the memory is always aligned.
952952

953+
The ``__builtin_masked_expand_load`` and ``__builtin_masked_compress_store``
954+
builtins have the same interface but read from or write to consecutive memory locations.
955+
Effectively these perform the ``if (mask[i]) val[i] = ptr[j++]`` and ``if
956+
(mask[i]) ptr[j++] = val[i]`` patterns, respectively.
957+
953958
Example:
954959

955960
.. code-block:: c++
956961

957962
using v8b = bool [[clang::ext_vector_type(8)]];
958963
using v8i = int [[clang::ext_vector_type(8)]];
959964

960-
v8i load(v8b m, v8i *p) { return __builtin_masked_load(m, p); }
961-
962-
void store(v8b m, v8i v, v8i *p) { __builtin_masked_store(m, v, p); }
965+
v8i load(v8b mask, v8i *ptr) { return __builtin_masked_load(mask, ptr); }
966+
967+
v8i load_expand(v8b mask, v8i *ptr) {
968+
return __builtin_masked_expand_load(mask, ptr);
969+
}
970+
971+
void store(v8b mask, v8i val, v8i *ptr) {
972+
__builtin_masked_store(mask, val, ptr);
973+
}
974+
975+
void store_compress(v8b mask, v8i val, v8i *ptr) {
976+
__builtin_masked_compress_store(mask, val, ptr);
977+
}
963978
964979

965980
Matrix Types

clang/docs/ReleaseNotes.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,10 @@ Non-comprehensive list of changes in this release
169169
- A vector of booleans is now a valid condition for the ternary ``?:`` operator.
170170
This binds to a simple vector select operation.
171171

172-
- Added ``__builtin_masked_load`` and ``__builtin_masked_store`` for conditional
173-
memory loads from vectors. Binds to the LLVM intrinsic of the same name.
172+
- Added ``__builtin_masked_load``, ``__builtin_masked_expand_load``,
173+
``__builtin_masked_store``, ``__builtin_masked_compress_store`` for
174+
conditional memory loads and stores of vectors. Binds to the LLVM intrinsics of the
175+
same name.
174176

175177
- The ``__builtin_popcountg``, ``__builtin_ctzg``, and ``__builtin_clzg``
176178
functions now accept fixed-size boolean vectors.

clang/include/clang/Basic/Builtins.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1244,6 +1244,18 @@ def MaskedStore : Builtin {
12441244
let Prototype = "void(...)";
12451245
}
12461246

1247+
def MaskedExpandLoad : Builtin {
1248+
let Spellings = ["__builtin_masked_expand_load"];
1249+
let Attributes = [NoThrow, CustomTypeChecking];
1250+
let Prototype = "void(...)";
1251+
}
1252+
1253+
def MaskedCompressStore : Builtin {
1254+
let Spellings = ["__builtin_masked_compress_store"];
1255+
let Attributes = [NoThrow, CustomTypeChecking];
1256+
let Prototype = "void(...)";
1257+
}
1258+
12471259
def AllocaUninitialized : Builtin {
12481260
let Spellings = ["__builtin_alloca_uninitialized"];
12491261
let Attributes = [FunctionWithBuiltinPrefix, NoThrow];

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4271,7 +4271,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
42714271
return RValue::get(Result);
42724272
}
42734273

4274-
case Builtin::BI__builtin_masked_load: {
4274+
case Builtin::BI__builtin_masked_load:
4275+
case Builtin::BI__builtin_masked_expand_load: {
42754276
llvm::Value *Mask = EmitScalarExpr(E->getArg(0));
42764277
llvm::Value *Ptr = EmitScalarExpr(E->getArg(1));
42774278

@@ -4284,14 +4285,21 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
42844285
if (E->getNumArgs() > 2)
42854286
PassThru = EmitScalarExpr(E->getArg(2));
42864287

4287-
Function *F =
4288-
CGM.getIntrinsic(Intrinsic::masked_load, {RetTy, UnqualPtrTy});
4289-
4290-
llvm::Value *Result =
4291-
Builder.CreateCall(F, {Ptr, AlignVal, Mask, PassThru}, "masked_load");
4288+
llvm::Value *Result;
4289+
if (BuiltinID == Builtin::BI__builtin_masked_load) {
4290+
Function *F =
4291+
CGM.getIntrinsic(Intrinsic::masked_load, {RetTy, UnqualPtrTy});
4292+
Result =
4293+
Builder.CreateCall(F, {Ptr, AlignVal, Mask, PassThru}, "masked_load");
4294+
} else {
4295+
Function *F = CGM.getIntrinsic(Intrinsic::masked_expandload, {RetTy});
4296+
Result =
4297+
Builder.CreateCall(F, {Ptr, Mask, PassThru}, "masked_expand_load");
4298+
}
42924299
return RValue::get(Result);
42934300
};
4294-
case Builtin::BI__builtin_masked_store: {
4301+
case Builtin::BI__builtin_masked_store:
4302+
case Builtin::BI__builtin_masked_compress_store: {
42954303
llvm::Value *Mask = EmitScalarExpr(E->getArg(0));
42964304
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
42974305
llvm::Value *Ptr = EmitScalarExpr(E->getArg(2));
@@ -4304,10 +4312,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
43044312
llvm::Value *AlignVal =
43054313
llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
43064314

4307-
llvm::Function *F =
4308-
CGM.getIntrinsic(llvm::Intrinsic::masked_store, {ValLLTy, PtrTy});
4309-
4310-
Builder.CreateCall(F, {Val, Ptr, AlignVal, Mask});
4315+
if (BuiltinID == Builtin::BI__builtin_masked_store) {
4316+
llvm::Function *F =
4317+
CGM.getIntrinsic(llvm::Intrinsic::masked_store, {ValLLTy, PtrTy});
4318+
Builder.CreateCall(F, {Val, Ptr, AlignVal, Mask});
4319+
} else {
4320+
llvm::Function *F =
4321+
CGM.getIntrinsic(llvm::Intrinsic::masked_compressstore, {ValLLTy});
4322+
Builder.CreateCall(F, {Val, Ptr, Mask});
4323+
}
43114324
return RValue::get(nullptr);
43124325
}
43134326

clang/lib/Sema/SemaChecking.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2310,7 +2310,9 @@ static ExprResult BuiltinMaskedLoad(Sema &S, CallExpr *TheCall) {
23102310
if (MaskVecTy->getNumElements() != DataVecTy->getNumElements())
23112311
return ExprError(
23122312
S.Diag(TheCall->getBeginLoc(), diag::err_vec_masked_load_store_size)
2313-
<< "__builtin_masked_load" << MaskTy << PointeeTy);
2313+
<< S.getASTContext().BuiltinInfo.getQuotedName(
2314+
TheCall->getBuiltinCallee())
2315+
<< MaskTy << PointeeTy);
23142316

23152317
TheCall->setType(PointeeTy);
23162318
return TheCall;
@@ -2344,7 +2346,9 @@ static ExprResult BuiltinMaskedStore(Sema &S, CallExpr *TheCall) {
23442346
MaskVecTy->getNumElements() != PtrVecTy->getNumElements())
23452347
return ExprError(
23462348
S.Diag(TheCall->getBeginLoc(), diag::err_vec_masked_load_store_size)
2347-
<< "__builtin_masked_store" << MaskTy << PointeeTy);
2349+
<< S.getASTContext().BuiltinInfo.getQuotedName(
2350+
TheCall->getBuiltinCallee())
2351+
<< MaskTy << PointeeTy);
23482352

23492353
if (!S.Context.hasSameType(ValTy, PointeeTy))
23502354
return ExprError(S.Diag(TheCall->getBeginLoc(),
@@ -2610,8 +2614,10 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
26102614
// TheCall will be freed by the smart pointer here, but that's fine, since
26112615
// BuiltinShuffleVector guts it, but then doesn't release it.
26122616
case Builtin::BI__builtin_masked_load:
2617+
case Builtin::BI__builtin_masked_expand_load:
26132618
return BuiltinMaskedLoad(*this, TheCall);
26142619
case Builtin::BI__builtin_masked_store:
2620+
case Builtin::BI__builtin_masked_compress_store:
26152621
return BuiltinMaskedStore(*this, TheCall);
26162622
case Builtin::BI__builtin_invoke:
26172623
return BuiltinInvoke(*this, TheCall);

clang/test/CodeGen/builtin-masked.c

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,34 @@ v8i test_load_passthru(v8b m, v8i *p, v8i t) {
5252
return __builtin_masked_load(m, p, t);
5353
}
5454

55+
// CHECK-LABEL: define dso_local <8 x i32> @test_load_expand(
56+
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef [[P:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]]) #[[ATTR0]] {
57+
// CHECK-NEXT: [[ENTRY:.*:]]
58+
// CHECK-NEXT: [[M:%.*]] = alloca i8, align 1
59+
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i8, align 1
60+
// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
61+
// CHECK-NEXT: [[T_ADDR:%.*]] = alloca <8 x i32>, align 32
62+
// CHECK-NEXT: store i8 [[M_COERCE]], ptr [[M]], align 1
63+
// CHECK-NEXT: [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
64+
// CHECK-NEXT: [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
65+
// CHECK-NEXT: [[T:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
66+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[M1]] to i8
67+
// CHECK-NEXT: store i8 [[TMP1]], ptr [[M_ADDR]], align 1
68+
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 8
69+
// CHECK-NEXT: store <8 x i32> [[T]], ptr [[T_ADDR]], align 32
70+
// CHECK-NEXT: [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
71+
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
72+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[P_ADDR]], align 8
73+
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr [[T_ADDR]], align 32
74+
// CHECK-NEXT: [[MASKED_EXPAND_LOAD:%.*]] = call <8 x i32> @llvm.masked.expandload.v8i32(ptr [[TMP3]], <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
75+
// CHECK-NEXT: ret <8 x i32> [[MASKED_EXPAND_LOAD]]
76+
//
77+
v8i test_load_expand(v8b m, v8i *p, v8i t) {
78+
return __builtin_masked_expand_load(m, p, t);
79+
}
80+
5581
// CHECK-LABEL: define dso_local void @test_store(
56-
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef [[P:%.*]]) #[[ATTR2:[0-9]+]] {
82+
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef [[P:%.*]]) #[[ATTR3:[0-9]+]] {
5783
// CHECK-NEXT: [[ENTRY:.*:]]
5884
// CHECK-NEXT: [[M:%.*]] = alloca i8, align 1
5985
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i8, align 1
@@ -77,3 +103,29 @@ v8i test_load_passthru(v8b m, v8i *p, v8i t) {
77103
void test_store(v8b m, v8i v, v8i *p) {
78104
__builtin_masked_store(m, v, p);
79105
}
106+
107+
// CHECK-LABEL: define dso_local void @test_compress_store(
108+
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef [[P:%.*]]) #[[ATTR3]] {
109+
// CHECK-NEXT: [[ENTRY:.*:]]
110+
// CHECK-NEXT: [[M:%.*]] = alloca i8, align 1
111+
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i8, align 1
112+
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <8 x i32>, align 32
113+
// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
114+
// CHECK-NEXT: store i8 [[M_COERCE]], ptr [[M]], align 1
115+
// CHECK-NEXT: [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
116+
// CHECK-NEXT: [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
117+
// CHECK-NEXT: [[V:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
118+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[M1]] to i8
119+
// CHECK-NEXT: store i8 [[TMP1]], ptr [[M_ADDR]], align 1
120+
// CHECK-NEXT: store <8 x i32> [[V]], ptr [[V_ADDR]], align 32
121+
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 8
122+
// CHECK-NEXT: [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
123+
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
124+
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
125+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[P_ADDR]], align 8
126+
// CHECK-NEXT: call void @llvm.masked.compressstore.v8i32(<8 x i32> [[TMP3]], ptr [[TMP4]], <8 x i1> [[TMP2]])
127+
// CHECK-NEXT: ret void
128+
//
129+
void test_compress_store(v8b m, v8i v, v8i *p) {
130+
__builtin_masked_compress_store(m, v, p);
131+
}

clang/test/Sema/builtin-masked.c

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ typedef float v8f __attribute__((ext_vector_type(8)));
88
void test_masked_load(v8i *pf, v8b mask, v2b mask2, v2b thru) {
99
(void)__builtin_masked_load(mask); // expected-error {{too few arguments to function call, expected 2, have 1}}
1010
(void)__builtin_masked_load(mask, pf, pf, pf); // expected-error {{too many arguments to function call, expected at most 3, have 4}}
11-
(void)__builtin_masked_load(mask2, pf); // expected-error {{all arguments to __builtin_masked_load must have the same number of elements}}
11+
(void)__builtin_masked_load(mask2, pf); // expected-error {{all arguments to '__builtin_masked_load' must have the same number of elements}}
1212
(void)__builtin_masked_load(mask, mask); // expected-error {{2nd argument must be a pointer to vector}}
1313
(void)__builtin_masked_load(mask, (void *)0); // expected-error {{2nd argument must be a pointer to vector}}
1414
(void)__builtin_masked_load(mask2, pf, thru); // expected-error {{3rd argument must be a 'v8i' (vector of 8 'int' values)}}
15-
(void)__builtin_masked_load(mask2, pf); // expected-error {{all arguments to __builtin_masked_load must have the same number of elements}}
15+
(void)__builtin_masked_load(mask2, pf); // expected-error {{all arguments to '__builtin_masked_load' must have the same number of elements}}
1616
}
1717

1818
void test_masked_store(v8i *pf, v8f *pf2, v8b mask, v2b mask2) {
@@ -21,6 +21,26 @@ void test_masked_store(v8i *pf, v8f *pf2, v8b mask, v2b mask2) {
2121
__builtin_masked_store(0, 0, pf); // expected-error {{1st argument must be a vector of boolean types (was 'int')}}
2222
__builtin_masked_store(mask, 0, pf); // expected-error {{2nd argument must be a vector}}
2323
__builtin_masked_store(mask, *pf, 0); // expected-error {{3rd argument must be a pointer to vector}}
24-
__builtin_masked_store(mask2, *pf, pf); // expected-error {{all arguments to __builtin_masked_store must have the same number of elements}}
24+
__builtin_masked_store(mask2, *pf, pf); // expected-error {{all arguments to '__builtin_masked_store' must have the same number of elements}}
2525
__builtin_masked_store(mask, *pf, pf2); // expected-error {{last two arguments to '__builtin_masked_store' must have the same type}}
2626
}
27+
28+
void test_masked_expand_load(v8i *pf, v8b mask, v2b mask2, v2b thru) {
29+
(void)__builtin_masked_expand_load(mask); // expected-error {{too few arguments to function call, expected 2, have 1}}
30+
(void)__builtin_masked_expand_load(mask, pf, pf, pf); // expected-error {{too many arguments to function call, expected at most 3, have 4}}
31+
(void)__builtin_masked_expand_load(mask2, pf); // expected-error {{all arguments to '__builtin_masked_expand_load' must have the same number of elements}}
32+
(void)__builtin_masked_expand_load(mask, mask); // expected-error {{2nd argument must be a pointer to vector}}
33+
(void)__builtin_masked_expand_load(mask, (void *)0); // expected-error {{2nd argument must be a pointer to vector}}
34+
(void)__builtin_masked_expand_load(mask2, pf, thru); // expected-error {{3rd argument must be a 'v8i' (vector of 8 'int' values)}}
35+
(void)__builtin_masked_expand_load(mask2, pf); // expected-error {{all arguments to '__builtin_masked_expand_load' must have the same number of elements}}
36+
}
37+
38+
void test_masked_compress_store(v8i *pf, v8f *pf2, v8b mask, v2b mask2) {
39+
__builtin_masked_compress_store(mask); // expected-error {{too few arguments to function call, expected 3, have 1}}
40+
__builtin_masked_compress_store(mask, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}}
41+
__builtin_masked_compress_store(0, 0, pf); // expected-error {{1st argument must be a vector of boolean types (was 'int')}}
42+
__builtin_masked_compress_store(mask, 0, pf); // expected-error {{2nd argument must be a vector}}
43+
__builtin_masked_compress_store(mask, *pf, 0); // expected-error {{3rd argument must be a pointer to vector}}
44+
__builtin_masked_compress_store(mask2, *pf, pf); // expected-error {{all arguments to '__builtin_masked_compress_store' must have the same number of elements}}
45+
__builtin_masked_compress_store(mask, *pf, pf2); // expected-error {{last two arguments to '__builtin_masked_compress_store' must have the same type}}
46+
}

0 commit comments

Comments
 (0)