Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,34 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
addr.getAlignment().getAsAlign().value());
}

/// Emit a call to the llvm.masked.load intrinsic.
///
/// \param loc        source location attached to the emitted operations
/// \param ty         vector type of the loaded value
/// \param ptr        base pointer for the load
/// \param alignment  alignment of the source memory location
/// \param mask       vector of booleans selecting which lanes are read
///                   from memory
/// \param passThru   value used to fill the masked-off lanes of the
///                   result; when null, those lanes are poison
mlir::Value createMaskedLoad(mlir::Location loc, mlir::Type ty,
                             mlir::Value ptr, llvm::Align alignment,
                             mlir::Value mask, mlir::Value passThru) {
  assert(mlir::isa<cir::VectorType>(ty) && "Type should be vector");
  assert(mask && "Mask should not be all-ones (null)");

  // With no explicit pass-through, masked-off lanes yield poison.
  if (!passThru)
    passThru = this->getConstant(loc, cir::PoisonAttr::get(ty));

  mlir::Value alignOp = this->getUInt32(int32_t(alignment.value()), loc);
  mlir::Value operands[] = {ptr, alignOp, mask, passThru};

  return cir::LLVMIntrinsicCallOp::create(*this, loc,
                                          getStringAttr("masked.load"), ty,
                                          operands)
      .getResult();
}

cir::VecShuffleOp
createVecShuffle(mlir::Location loc, mlir::Value vec1, mlir::Value vec2,
llvm::ArrayRef<mlir::Attribute> maskAttrs) {
Expand Down
42 changes: 42 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,40 @@ static mlir::Value emitIntrinsicCallOp(CIRGenFunction &cgf, const CallExpr *e,
.getResult();
}

// Convert the mask from an integer type to a vector of i1, one element per
// mask bit.
static mlir::Value getMaskVecValue(CIRGenFunction &cgf, mlir::Value mask,
                                   unsigned numElts, mlir::Location loc) {
  auto &builder = cgf.getBuilder();
  unsigned maskBits = cast<cir::IntType>(mask.getType()).getWidth();
  cir::VectorType i1VecTy =
      cir::VectorType::get(builder.getSIntNTy(1), maskBits);

  mlir::Value maskVec = builder.createBitcast(mask, i1VecTy);

  // If we have less than 8 elements, then the starting mask was an i8 and
  // we need to extract down to the right number of elements: shuffle out
  // just the low lanes of the wider bitcast result.
  if (numElts < 8) {
    llvm::SmallVector<int64_t, 4> lowLanes;
    for (unsigned lane = 0; lane != numElts; ++lane)
      lowLanes.push_back(lane);
    maskVec = builder.createVecShuffle(loc, maskVec, maskVec, lowLanes);
  }

  return maskVec;
}

// Emit a masked load for an X86 masked-load builtin.
// ops = {base pointer, pass-through vector, integer mask}.
static mlir::Value emitX86MaskedLoad(CIRGenFunction &cgf,
                                     ArrayRef<mlir::Value> ops,
                                     llvm::Align alignment,
                                     mlir::Location loc) {
  mlir::Value basePtr = ops[0];
  mlir::Value passThru = ops[1];
  // The result type matches the pass-through vector; its lane count sizes
  // the i1 mask vector.
  auto vecTy = cast<cir::VectorType>(passThru.getType());
  mlir::Value maskVec = getMaskVecValue(cgf, ops[2], vecTy.getSize(), loc);

  return cgf.getBuilder().createMaskedLoad(loc, vecTy, basePtr, alignment,
                                           maskVec, passThru);
}

// OG has unordered comparison as a form of optimization in addition to
// ordered comparison, while CIR doesn't.
//
Expand Down Expand Up @@ -327,6 +361,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_movdqa64store512_mask:
case X86::BI__builtin_ia32_storeaps512_mask:
case X86::BI__builtin_ia32_storeapd512_mask:
cgm.errorNYI(expr->getSourceRange(),
std::string("unimplemented X86 builtin call: ") +
getContext().BuiltinInfo.getName(builtinID));
return {};

case X86::BI__builtin_ia32_loadups128_mask:
case X86::BI__builtin_ia32_loadups256_mask:
case X86::BI__builtin_ia32_loadups512_mask:
Expand All @@ -345,6 +384,9 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_loaddqudi128_mask:
case X86::BI__builtin_ia32_loaddqudi256_mask:
case X86::BI__builtin_ia32_loaddqudi512_mask:
return emitX86MaskedLoad(*this, ops, llvm::Align(1),
getLoc(expr->getExprLoc()));

case X86::BI__builtin_ia32_loadsbf16128_mask:
case X86::BI__builtin_ia32_loadsh128_mask:
case X86::BI__builtin_ia32_loadss128_mask:
Expand Down
18 changes: 18 additions & 0 deletions clang/test/CIR/CodeGen/X86/avx512vl-builtins.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s


#include <immintrin.h>

// Verify that _mm_mask_loadu_ps lowers through CIR to the four-operand
// llvm.masked.load intrinsic (pointer, alignment, i1 mask, pass-through),
// with alignment 1 since the builtin is an unaligned load.
__m128 test_mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
// CIR-LABEL: _mm_mask_loadu_ps
// CIR: {{%.*}} = cir.call_llvm_intrinsic "masked.load" %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (!cir.ptr<!cir.vector<4 x !cir.float>>, !u32i, !cir.vector<4 x !cir.int<s, 1>>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float>

// LLVM-LABEL: @test_mm_mask_loadu_ps
// LLVM: @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
return _mm_mask_loadu_ps(__W, __U, __P);
}


5 changes: 3 additions & 2 deletions llvm/include/llvm/IR/Intrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -2524,9 +2524,10 @@ def int_vp_is_fpclass:
//
def int_masked_load:
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_anyptr_ty,
[llvm_anyptr_ty, llvm_i32_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
[IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
[IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>,
NoCapture<ArgIndex<0>>]>;

def int_masked_store:
DefaultAttrsIntrinsic<[],
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Analysis/InstructionSimplify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7151,6 +7151,7 @@ static Value *simplifyIntrinsic(CallBase *Call, Value *Callee,
switch (IID) {
case Intrinsic::masked_load:
case Intrinsic::masked_gather: {

Value *MaskArg = Args[1];
Value *PassthruArg = Args[2];
// If the mask is all zeros or undef, the "passthru" argument is the result.
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/IR/Verifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6275,10 +6275,13 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
Check(Call.getType()->isVectorTy(), "masked_load: must return a vector",
Call);

Value *Mask = Call.getArgOperand(1);
Value *PassThru = Call.getArgOperand(2);
ConstantInt *Alignment = cast<ConstantInt>(Call.getArgOperand(1));
Value *Mask = Call.getArgOperand(2);
Value *PassThru = Call.getArgOperand(3);
Check(Mask->getType()->isVectorTy(), "masked_load: mask must be vector",
Call);
Check(Alignment->getValue().isPowerOf2(),
"masked_load: alignment must be a power of 2", Call);
Check(PassThru->getType() == Call.getType(),
"masked_load: pass through and return type must match", Call);
Check(cast<VectorType>(Mask->getType())->getElementCount() ==
Expand Down
Loading