Skip to content

Commit 9888f0c

Browse files
authored
[Clang] Add builtins for masked vector loads / stores (#154464)
Summary: Clang has support for boolean vectors; these builtins expose the LLVM masked load and masked store intrinsics of the same name. They differ from a manual load and select by potentially suppressing traps from deactivated lanes. Fixes: #107753
1 parent 2f6b747 commit 9888f0c

File tree

8 files changed

+239
-2
lines changed

8 files changed

+239
-2
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,24 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``.
942942
for the comparison.
943943
======================================= ====================================================================== ==================================
944944

945+
*Masked Builtins*
946+
947+
Each builtin accesses memory according to a provided boolean mask. These are
948+
provided as ``__builtin_masked_load`` and ``__builtin_masked_store``. The first
949+
argument is always a boolean mask vector.
950+
951+
Example:
952+
953+
.. code-block:: c++
954+
955+
using v8b = bool [[clang::ext_vector_type(8)]];
956+
using v8i = int [[clang::ext_vector_type(8)]];
957+
958+
v8i load(v8b m, v8i *p) { return __builtin_masked_load(m, p); }
959+
960+
void store(v8b m, v8i v, v8i *p) { __builtin_masked_store(m, v, p); }
961+
962+
945963
Matrix Types
946964
============
947965

clang/docs/ReleaseNotes.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ Non-comprehensive list of changes in this release
140140
- A vector of booleans is now a valid condition for the ternary ``?:`` operator.
141141
This binds to a simple vector select operation.
142142

143+
- Added ``__builtin_masked_load`` and ``__builtin_masked_store`` for conditional
144+
memory loads and stores of vectors. Each binds to the LLVM intrinsic of the same name.
145+
143146
- Use of ``__has_feature`` to detect the ``ptrauth_qualifier`` and ``ptrauth_intrinsics``
144147
features has been deprecated, and is restricted to the arm64e target only. The
145148
correct method to check for these features is to test for the ``__PTRAUTH__``

clang/include/clang/Basic/Builtins.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,6 +1232,18 @@ def ConvertVector : Builtin {
12321232
let Prototype = "void(...)";
12331233
}
12341234

1235+
// __builtin_masked_load(mask, ptr). The "void(...)" prototype is only a
// placeholder: the builtin is marked CustomTypeChecking, and its operands and
// result type are validated in Sema.
def MaskedLoad : Builtin {
1236+
let Spellings = ["__builtin_masked_load"];
1237+
let Attributes = [NoThrow, CustomTypeChecking];
1238+
let Prototype = "void(...)";
1239+
}
1240+
1241+
// __builtin_masked_store(mask, value, ptr). The "void(...)" prototype is only a
// placeholder: the builtin is marked CustomTypeChecking, and its operands are
// validated in Sema.
def MaskedStore : Builtin {
1242+
let Spellings = ["__builtin_masked_store"];
1243+
let Attributes = [NoThrow, CustomTypeChecking];
1244+
let Prototype = "void(...)";
1245+
}
1246+
12351247
def AllocaUninitialized : Builtin {
12361248
let Spellings = ["__builtin_alloca_uninitialized"];
12371249
let Attributes = [FunctionWithBuiltinPrefix, NoThrow];

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11001,10 +11001,15 @@ def err_block_on_vm : Error<
1100111001
def err_sizeless_nonlocal : Error<
1100211002
"non-local variable with sizeless type %0">;
1100311003

11004+
// %0: 1-based argument position, printed as an ordinal ("2nd", "3rd").
// %1: the kind of operand required, e.g. "vector" or "pointer to vector".
def err_vec_masked_load_store_ptr : Error<
11005+
"%ordinal0 argument must be a %1">;
11006+
// %0: the builtin's name. %1 and %2: the two types being compared.
def err_vec_masked_load_store_size : Error<
11007+
"all arguments to %0 must have the same number of elements (was %1 and %2)">;
11008+
1100411009
def err_vec_builtin_non_vector : Error<
1100511010
"%select{first two|all}1 arguments to %0 must be vectors">;
1100611011
def err_vec_builtin_incompatible_vector : Error<
11007-
"%select{first two|all}1 arguments to %0 must have the same type">;
11012+
"%select{first two|all|last two}1 arguments to %0 must have the same type">;
1100811013
def err_vsx_builtin_nonconstant_argument : Error<
1100911014
"argument %0 to %1 must be a 2-bit unsigned literal (i.e. 0, 1, 2 or 3)">;
1101011015

@@ -12866,7 +12871,7 @@ def err_builtin_invalid_arg_type: Error<
1286612871
"%plural{0:|: }1"
1286712872
// Second component: integer-like types
1286812873
"%select{|integer|signed integer|unsigned integer|'int'|"
12869-
"pointer to a valid matrix element}2"
12874+
"pointer to a valid matrix element|boolean}2"
1287012875
// A space after a non-empty second component
1287112876
"%plural{0:|: }2"
1287212877
// An 'or' if non-empty second and third components are combined

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4255,6 +4255,44 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
42554255
return RValue::get(Result);
42564256
}
42574257

4258+
// Lower __builtin_masked_load(mask, ptr) to a call to llvm.masked.load.
case Builtin::BI__builtin_masked_load: {
4259+
llvm::Value *Mask = EmitScalarExpr(E->getArg(0));
4260+
llvm::Value *Ptr = EmitScalarExpr(E->getArg(1));
4261+
4262+
llvm::Type *RetTy = CGM.getTypes().ConvertType(E->getType());
4263+
// The alignment operand is the natural alignment of the result vector type.
CharUnits Align = CGM.getNaturalTypeAlignment(E->getType(), nullptr);
4264+
llvm::Value *AlignVal =
4265+
llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
4266+
4267+
// The builtin takes no pass-through operand, so poison is supplied for it.
llvm::Value *PassThru = llvm::PoisonValue::get(RetTy);
4268+
4269+
// Overload the intrinsic on the pointer's actual type (as the masked store
// lowering does) so that a pointer in a non-default address space still
// matches the declaration it is passed to.
Function *F =
4270+
CGM.getIntrinsic(Intrinsic::masked_load, {RetTy, Ptr->getType()});
4271+
4272+
llvm::Value *Result =
4273+
Builder.CreateCall(F, {Ptr, AlignVal, Mask, PassThru}, "masked_load");
4274+
return RValue::get(Result);
4275+
}
4276+
// Lower __builtin_masked_store(mask, value, ptr) to a call to
// llvm.masked.store. The call produces no value.
case Builtin::BI__builtin_masked_store: {
4277+
llvm::Value *Mask = EmitScalarExpr(E->getArg(0));
4278+
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
4279+
llvm::Value *Ptr = EmitScalarExpr(E->getArg(2));
4280+
4281+
QualType ValTy = E->getArg(1)->getType();
4282+
llvm::Type *ValLLTy = CGM.getTypes().ConvertType(ValTy);
4283+
llvm::Type *PtrTy = Ptr->getType();
4284+
4285+
// The alignment operand is the natural alignment of the stored vector type.
CharUnits Align = CGM.getNaturalTypeAlignment(ValTy, nullptr);
4286+
llvm::Value *AlignVal =
4287+
llvm::ConstantInt::get(Int32Ty, Align.getQuantity());
4288+
4289+
// The intrinsic is overloaded on the value type and the pointer type.
llvm::Function *F =
4290+
CGM.getIntrinsic(llvm::Intrinsic::masked_store, {ValLLTy, PtrTy});
4291+
4292+
// Note the operand order: the builtin takes the mask first, the intrinsic
// call takes it last.
Builder.CreateCall(F, {Val, Ptr, AlignVal, Mask});
4293+
return RValue::get(nullptr);
4294+
}
4295+
42584296
case Builtin::BI__builtin_isinf_sign: {
42594297
// isinf_sign(x) -> fabs(x) == infinity ? (signbit(x) ? -1 : 1) : 0
42604298
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);

clang/lib/Sema/SemaChecking.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2266,6 +2266,85 @@ static bool BuiltinCountZeroBitsGeneric(Sema &S, CallExpr *TheCall) {
22662266
return false;
22672267
}
22682268

2269+
/// Operand checks shared by the masked load and masked store builtins.
///
/// Diagnoses a \p MaskArg whose type is not an ext-vector of bool, and a
/// \p PtrArg whose type is not a pointer to a vector type. \p Pos is the
/// 1-based position of the pointer operand and is only used in the diagnostic.
///
/// \returns true if a diagnostic was emitted, false if both operands are valid.
static bool CheckMaskedBuiltinArgs(Sema &S, Expr *MaskArg, Expr *PtrArg,
2270+
unsigned Pos) {
2271+
QualType MaskTy = MaskArg->getType();
2272+
if (!MaskTy->isExtVectorBoolType())
2273+
return S.Diag(MaskArg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
2274+
<< 1 << /* vector of */ 4 << /* booleans */ 6 << /* no fp */ 0
2275+
<< MaskTy;
2276+
2277+
QualType PtrTy = PtrArg->getType();
2278+
if (!PtrTy->isPointerType() || !PtrTy->getPointeeType()->isVectorType())
2279+
return S.Diag(PtrArg->getExprLoc(), diag::err_vec_masked_load_store_ptr)
2280+
<< Pos << "pointer to vector";
2281+
return false;
2282+
}
2283+
2284+
/// Semantic checking for __builtin_masked_load(mask, ptr).
///
/// Requires exactly two arguments: an ext-vector-of-bool mask and a pointer to
/// a vector with the same number of elements as the mask. On success the call
/// is given the pointee vector type, without qualifiers, as its result type.
///
/// \returns the checked call, or ExprError() after emitting a diagnostic.
static ExprResult BuiltinMaskedLoad(Sema &S, CallExpr *TheCall) {
2285+
if (S.checkArgCount(TheCall, 2))
2286+
return ExprError();
2287+
2288+
Expr *MaskArg = TheCall->getArg(0);
2289+
Expr *PtrArg = TheCall->getArg(1);
2290+
if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 2))
2291+
return ExprError();
2292+
2293+
QualType MaskTy = MaskArg->getType();
2294+
QualType PtrTy = PtrArg->getType();
2295+
QualType PointeeTy = PtrTy->getPointeeType();
2296+
const VectorType *MaskVecTy = MaskTy->getAs<VectorType>();
2297+
const VectorType *DataVecTy = PointeeTy->getAs<VectorType>();
2298+
if (MaskVecTy->getNumElements() != DataVecTy->getNumElements())
2299+
return ExprError(
2300+
S.Diag(TheCall->getBeginLoc(), diag::err_vec_masked_load_store_size)
2301+
<< "__builtin_masked_load" << MaskTy << PointeeTy);
2302+
2303+
// The call yields a value, not an lvalue, so qualifiers on the pointee (for
// example when loading through a pointer to a const vector) must not become
// part of the result type.
TheCall->setType(PointeeTy.getUnqualifiedType());
2304+
return TheCall;
2305+
}
2306+
2307+
/// Semantic checking for __builtin_masked_store(mask, value, ptr).
///
/// Requires exactly three arguments: an ext-vector-of-bool mask, a vector
/// value, and a pointer to a vector. All three must agree on the number of
/// elements, and the value's type must match the pointee type. On success the
/// call's type is void.
///
/// \returns the checked call, or ExprError() after emitting a diagnostic.
static ExprResult BuiltinMaskedStore(Sema &S, CallExpr *TheCall) {
2308+
if (S.checkArgCount(TheCall, 3))
2309+
return ExprError();
2310+
2311+
Expr *MaskArg = TheCall->getArg(0);
2312+
Expr *ValArg = TheCall->getArg(1);
2313+
Expr *PtrArg = TheCall->getArg(2);
2314+
2315+
if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3))
2316+
return ExprError();
2317+
2318+
QualType MaskTy = MaskArg->getType();
2319+
QualType PtrTy = PtrArg->getType();
2320+
QualType ValTy = ValArg->getType();
2321+
if (!ValTy->isVectorType())
2322+
return ExprError(
2323+
S.Diag(ValArg->getExprLoc(), diag::err_vec_masked_load_store_ptr)
2324+
<< 2 << "vector");
2325+
2326+
QualType PointeeTy = PtrTy->getPointeeType();
2327+
const VectorType *MaskVecTy = MaskTy->getAs<VectorType>();
2328+
const VectorType *ValVecTy = ValTy->getAs<VectorType>();
2329+
const VectorType *PtrVecTy = PointeeTy->getAs<VectorType>();
2330+
// Remember which operand disagrees with the mask so the "(was A and B)" part
// of the diagnostic names the two types whose element counts really differ.
const bool ValCountMismatch =
MaskVecTy->getNumElements() != ValVecTy->getNumElements();
2331+
if (ValCountMismatch ||
2332+
MaskVecTy->getNumElements() != PtrVecTy->getNumElements())
2333+
return ExprError(
2334+
S.Diag(TheCall->getBeginLoc(), diag::err_vec_masked_load_store_size)
2335+
<< "__builtin_masked_store" << MaskTy
<< (ValCountMismatch ? ValTy : PointeeTy));
2336+
2337+
// Index 2 selects the "last two" wording; highlight both operands it names.
if (!S.Context.hasSameType(ValTy, PointeeTy))
2338+
return ExprError(S.Diag(TheCall->getBeginLoc(),
2339+
diag::err_vec_builtin_incompatible_vector)
2340+
<< TheCall->getDirectCallee() << /*last two*/ 2
2341+
<< SourceRange(ValArg->getBeginLoc(),
2342+
PtrArg->getEndLoc()));
2343+
2344+
TheCall->setType(S.Context.VoidTy);
2345+
return TheCall;
2346+
}
2347+
22692348
static ExprResult BuiltinInvoke(Sema &S, CallExpr *TheCall) {
22702349
SourceLocation Loc = TheCall->getBeginLoc();
22712350
MutableArrayRef Args(TheCall->getArgs(), TheCall->getNumArgs());
@@ -2518,6 +2597,10 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
25182597
return BuiltinShuffleVector(TheCall);
25192598
// TheCall will be freed by the smart pointer here, but that's fine, since
25202599
// BuiltinShuffleVector guts it, but then doesn't release it.
2600+
case Builtin::BI__builtin_masked_load:
2601+
return BuiltinMaskedLoad(*this, TheCall);
2602+
case Builtin::BI__builtin_masked_store:
2603+
return BuiltinMaskedStore(*this, TheCall);
25212604
case Builtin::BI__builtin_invoke:
25222605
return BuiltinInvoke(*this, TheCall);
25232606
case Builtin::BI__builtin_prefetch:

clang/test/CodeGen/builtin-masked.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
3+
4+
typedef int v8i __attribute__((ext_vector_type(8)));
5+
typedef _Bool v8b __attribute__((ext_vector_type(8)));
6+
7+
// CHECK-LABEL: define dso_local <8 x i32> @test_load(
8+
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] {
9+
// CHECK-NEXT: [[ENTRY:.*:]]
10+
// CHECK-NEXT: [[M:%.*]] = alloca i8, align 1
11+
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i8, align 1
12+
// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
13+
// CHECK-NEXT: store i8 [[M_COERCE]], ptr [[M]], align 1
14+
// CHECK-NEXT: [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
15+
// CHECK-NEXT: [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
16+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i1> [[M1]] to i8
17+
// CHECK-NEXT: store i8 [[TMP0]], ptr [[M_ADDR]], align 1
18+
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 8
19+
// CHECK-NEXT: [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
20+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
21+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[P_ADDR]], align 8
22+
// CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 32, <8 x i1> [[TMP1]], <8 x i32> poison)
23+
// CHECK-NEXT: ret <8 x i32> [[MASKED_LOAD]]
24+
//
25+
v8i test_load(v8b m, v8i *p) {
26+
// Per the CHECK lines above: a single @llvm.masked.load.v8i32.p0 call with
// alignment 32 and a poison pass-through.
return __builtin_masked_load(m, p);
27+
}
28+
29+
// CHECK-LABEL: define dso_local void @test_store(
30+
// CHECK-SAME: i8 noundef [[M_COERCE:%.*]], ptr noundef byval(<8 x i32>) align 32 [[TMP0:%.*]], ptr noundef [[P:%.*]]) #[[ATTR2:[0-9]+]] {
31+
// CHECK-NEXT: [[ENTRY:.*:]]
32+
// CHECK-NEXT: [[M:%.*]] = alloca i8, align 1
33+
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca i8, align 1
34+
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <8 x i32>, align 32
35+
// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
36+
// CHECK-NEXT: store i8 [[M_COERCE]], ptr [[M]], align 1
37+
// CHECK-NEXT: [[LOAD_BITS:%.*]] = load i8, ptr [[M]], align 1
38+
// CHECK-NEXT: [[M1:%.*]] = bitcast i8 [[LOAD_BITS]] to <8 x i1>
39+
// CHECK-NEXT: [[V:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32
40+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[M1]] to i8
41+
// CHECK-NEXT: store i8 [[TMP1]], ptr [[M_ADDR]], align 1
42+
// CHECK-NEXT: store <8 x i32> [[V]], ptr [[V_ADDR]], align 32
43+
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 8
44+
// CHECK-NEXT: [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
45+
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
46+
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
47+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[P_ADDR]], align 8
48+
// CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP3]], ptr [[TMP4]], i32 32, <8 x i1> [[TMP2]])
49+
// CHECK-NEXT: ret void
50+
//
51+
void test_store(v8b m, v8i v, v8i *p) {
52+
// Per the CHECK lines above: a single @llvm.masked.store.v8i32.p0 call with
// alignment 32.
__builtin_masked_store(m, v, p);
53+
}

clang/test/Sema/builtin-masked.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
2+
3+
typedef int v8i __attribute__((ext_vector_type(8)));
4+
typedef _Bool v8b __attribute__((ext_vector_type(8)));
5+
typedef _Bool v2b __attribute__((ext_vector_type(2)));
6+
typedef float v8f __attribute__((ext_vector_type(8)));
7+
8+
// Each call below is ill-formed; the trailing directive gives the diagnostic
// it is required to produce.
void test_masked_load(v8i *pf, v8b mask, v2b mask2) {
9+
(void)__builtin_masked_load(mask); // expected-error {{too few arguments to function call, expected 2, have 1}}
10+
(void)__builtin_masked_load(mask, pf, pf); // expected-error {{too many arguments to function call, expected 2, have 3}}
11+
(void)__builtin_masked_load(mask2, pf); // expected-error {{all arguments to __builtin_masked_load must have the same number of elements}}
12+
(void)__builtin_masked_load(mask, mask); // expected-error {{2nd argument must be a pointer to vector}}
13+
(void)__builtin_masked_load(mask, (void *)0); // expected-error {{2nd argument must be a pointer to vector}}
14+
(void)__builtin_masked_load(0, pf); // expected-error {{1st argument must be a vector of boolean types (was 'int')}}
15+
}
16+
17+
// Each call below is ill-formed; the trailing directive gives the diagnostic
// it is required to produce.
void test_masked_store(v8i *pf, v8f *pf2, v8b mask, v2b mask2) {
18+
__builtin_masked_store(mask); // expected-error {{too few arguments to function call, expected 3, have 1}}
19+
__builtin_masked_store(mask, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}}
20+
__builtin_masked_store(0, 0, pf); // expected-error {{1st argument must be a vector of boolean types (was 'int')}}
21+
__builtin_masked_store(mask, 0, pf); // expected-error {{2nd argument must be a vector}}
22+
__builtin_masked_store(mask, *pf, 0); // expected-error {{3rd argument must be a pointer to vector}}
23+
__builtin_masked_store(mask2, *pf, pf); // expected-error {{all arguments to __builtin_masked_store must have the same number of elements}}
24+
__builtin_masked_store(mask, *pf, pf2); // expected-error {{last two arguments to '__builtin_masked_store' must have the same type}}
25+
}

0 commit comments

Comments
 (0)