Merged
38 commits
aa4e757
[AMDGPU] add support for i64 readlane
vikramRH Apr 10, 2024
b895dd5
add support for i64 readfirstlane and writelane intrinsics
vikramRH Apr 12, 2024
dfa3219
Fix issues with writelane expansion
vikramRH Apr 15, 2024
fcc0a1a
code refactor and add patterns for f64
vikramRH Apr 18, 2024
4e71a06
clang format
vikramRH Apr 18, 2024
c7ff0e5
fix corner case with regkill and add readlane tests
vikramRH Apr 18, 2024
d6a8ce4
update builtin handling for readlane and readfirstlane
vikramRH Apr 19, 2024
15cbd90
add and update tests, fixes to writelane src0 imm handling
vikramRH Apr 19, 2024
776a4c6
address review comments
Apr 22, 2024
82da530
Implement lowering in legalizer for legal types
vikramRH May 2, 2024
14fcf44
refactor/improve GIsel lowering, added new tests
vikramRH May 6, 2024
d0610c4
Review comments, refactor GISel Impl
vikramRH May 9, 2024
9233833
clang-format
vikramRH May 9, 2024
5feef44
Merge branch 'main' into rw_lane_64
vikramRH May 13, 2024
993a630
Review comments, improve pointer handling with GISel
vikramRH May 13, 2024
556dda2
align comments
vikramRH May 13, 2024
b59873e
Review comments
vikramRH May 15, 2024
edd3179
fix type profile
vikramRH May 16, 2024
a75eb6b
remove spurious comma
vikramRH May 17, 2024
52d7020
review comments, move pointer tests to new files
vikramRH May 18, 2024
66ca57c
remove bitcasts, avoid special handling of pointers in gisel
vikramRH May 23, 2024
c3e512c
Review comments, updated AMDGPUUsage.rst
vikramRH May 27, 2024
72af37c
preserve legal 32-bit pieces, update usage doc
vikramRH May 30, 2024
2e4c5bc
Refactor GIsel lowering
vikramRH May 30, 2024
67e19e5
fix documentation mess
vikramRH May 30, 2024
cba2b1d
review comments
vikramRH May 30, 2024
26223c8
handle comment
vikramRH May 31, 2024
429fb0f
Review comments
vikramRH May 31, 2024
ec7b5c1
test for convergence related crash
vikramRH Jun 3, 2024
3d9cf2e
Update convergence-laneops-xfail.ll
vikramRH Jun 3, 2024
c015040
Merge branch 'main' into rw_lane_64
vikramRH Jun 6, 2024
482f380
update convergence related failure tests
vikramRH Jun 12, 2024
2b4cabb
Merge branch 'main' into rw_lane_64
vikramRH Jun 12, 2024
1a33cbc
revert target specific SDNodes, handle convergence tokens in SDAG
vikramRH Jun 14, 2024
cfa659d
remove spurious new lines
vikramRH Jun 14, 2024
be90ba6
review comments
vikramRH Jun 17, 2024
104121f
Merge branch 'main' into rw_lane_64
vikramRH Jun 22, 2024
31b8838
update builtin CodeGen
vikramRH Jun 23, 2024
18 changes: 18 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -18410,6 +18410,24 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_readlane:
case AMDGPU::BI__builtin_amdgcn_readfirstlane: {
llvm::SmallVector<llvm::Value *, 6> Args;
unsigned ICEArguments = 0;
ASTContext::GetBuiltinTypeError Error;
Intrinsic::ID IID = (BuiltinID == AMDGPU::BI__builtin_amdgcn_readlane)
? Intrinsic::amdgcn_readlane
: Intrinsic::amdgcn_readfirstlane;

getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
assert(Error == ASTContext::GE_None && "Should not codegen an error");
for (unsigned I = 0; I != E->getNumArgs(); ++I) {
Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, I, E));
}

Function *F = CGM.getIntrinsic(IID, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_div_fixup:
case AMDGPU::BI__builtin_amdgcn_div_fixupf:
case AMDGPU::BI__builtin_amdgcn_div_fixuph:
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -306,14 +306,14 @@ void test_ds_bpermute(global int* out, int a, int b)
}

// CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
void test_readfirstlane(global int* out, int a)
{
*out = __builtin_amdgcn_readfirstlane(a);
}

// CHECK-LABEL: @test_readlane
-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
void test_readlane(global int* out, int a, int b)
{
*out = __builtin_amdgcn_readlane(a, b);
15 changes: 6 additions & 9 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2176,26 +2176,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

def int_amdgcn_readfirstlane :
-ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
-Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
-ClangBuiltin<"__builtin_amdgcn_readlane">,
-Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
-ClangBuiltin<"__builtin_amdgcn_writelane">,
-Intrinsic<[llvm_i32_ty], [
-llvm_i32_ty, // uniform value to write: returned by the selected lane
-llvm_i32_ty, // uniform lane select
-llvm_i32_ty // returned by all lanes other than the selected one
+Intrinsic<[llvm_any_ty], [
+LLVMMatchType<0>, // uniform value to write: returned by the selected lane
+llvm_i32_ty, // uniform lane select
+LLVMMatchType<0> // returned by all lanes other than the selected one
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
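With llvm_any_ty on the result, the three lane intrinsics become overloaded and are mangled with the value type, as the updated tests show. A minimal IR sketch of the forms this enables (function and value names here are illustrative, not taken from the patch):

declare i32 @llvm.amdgcn.readlane.i32(i32, i32)
declare i64 @llvm.amdgcn.readlane.i64(i64, i32)
declare double @llvm.amdgcn.readfirstlane.f64(double)
declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64)

define i64 @read_from_lane(i64 %val, i32 %lane) {
  ; %lane must be uniform across the active threads of the wave,
  ; otherwise the result is undefined (see the comment above).
  %r = call i64 @llvm.amdgcn.readlane.i64(i64 %val, i32 %lane)
  ret i64 %r
}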
34 changes: 17 additions & 17 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -433,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
-Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
V = B.CreateBitCast(V, IntNTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -493,8 +493,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
V = B.CreateBitCast(V, IntNTy);
-Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-{V, B.getInt32(31)});
+Value *const Lane31 = B.CreateIntrinsic(
+Intrinsic::amdgcn_readlane, B.getInt32Ty(), {V, B.getInt32(31)});

Value *UpdateDPPCall = B.CreateCall(
UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
@@ -523,10 +523,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
-Function *ReadLane =
-Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
-Function *WriteLane =
-Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+Function *ReadLane = Intrinsic::getDeclaration(
+M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+Function *WriteLane = Intrinsic::getDeclaration(
+M, Intrinsic::amdgcn_writelane, B.getInt32Ty());

// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
@@ -598,16 +598,16 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(

// Get the value required for atomic operation
V = B.CreateBitCast(V, IntNTy);
-Value *LaneValue =
-B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+Value *LaneValue = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
+B.getInt32Ty(), {V, LaneIdxInt});
LaneValue = B.CreateBitCast(LaneValue, Ty);

// Perform writelane if intermediate scan results are required later in the
// kernel computations
Value *OldValue = nullptr;
if (NeedResult) {
OldValue =
-B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+B.CreateIntrinsic(Intrinsic::amdgcn_writelane, B.getInt32Ty(),
{B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
B.CreateBitCast(OldValuePhi, IntNTy)});
OldValue = B.CreateBitCast(OldValue, Ty);
@@ -789,7 +789,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
assert(TyBitWidth == 32);
NewV = B.CreateBitCast(NewV, IntNTy);
-NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, B.getInt32Ty(),
{NewV, LastLaneIdx});
NewV = B.CreateBitCast(NewV, Ty);
}
@@ -925,19 +925,19 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
Value *const ExtractHi =
B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-CallInst *const ReadFirstLaneLo =
-B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-CallInst *const ReadFirstLaneHi =
-B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
+Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractLo);
+CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
+Intrinsic::amdgcn_readfirstlane, Int32Ty, ExtractHi);
Value *const PartialInsert = B.CreateInsertElement(
PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
Value *const Insert =
B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
BroadcastI = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-BroadcastI =
-B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
+BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, Int32Ty,
+CastedPhi);
BroadcastI = B.CreateBitCast(BroadcastI, Ty);

} else {
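For the 64-bit broadcast path above, the optimizer still goes through two 32-bit readfirstlane calls on the low and high halves. Roughly the IR it builds looks like this (a sketch with illustrative value names, not output from the pass):

define double @broadcast64_sketch(double %phi) {
  %cast = bitcast double %phi to i64
  %lo = trunc i64 %cast to i32
  %hi.shift = lshr i64 %cast, 32
  %hi = trunc i64 %hi.shift to i32
  %lo.bcast = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %lo)
  %hi.bcast = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %hi)
  %vec0 = insertelement <2 x i32> poison, i32 %lo.bcast, i32 0
  %vec1 = insertelement <2 x i32> %vec0, i32 %hi.bcast, i32 1
  %result = bitcast <2 x i32> %vec1 to double
  ret double %result
}
declare i32 @llvm.amdgcn.readfirstlane.i32(i32)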
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -691,7 +691,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {

break;
}
-case AMDGPU::V_WRITELANE_B32: {
+case AMDGPU::V_WRITELANE_B32:
+case AMDGPU::V_WRITELANE_PSEUDO_B64: {
// Some architectures allow more than one constant bus access without
// SGPR restriction
if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
109 changes: 109 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4822,6 +4822,111 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
return RetBB;
}

static MachineBasicBlock *lowerPseudoLaneOp(MachineInstr &MI,
MachineBasicBlock *BB,
const GCNSubtarget &ST,
unsigned Opc) {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const DebugLoc &DL = MI.getDebugLoc();
const SIInstrInfo *TII = ST.getInstrInfo();

MachineOperand &Dest = MI.getOperand(0);
MachineOperand &Src0 = MI.getOperand(1);

const TargetRegisterClass *Src0RC =
Src0.isReg() ? MRI.getRegClass(Src0.getReg()) : &AMDGPU::SReg_64RegClass;
const TargetRegisterClass *Src0SubRC =
TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);

Register DestSub0 = MRI.createVirtualRegister(
(Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass
: &AMDGPU::SGPR_32RegClass);
Register DestSub1 = MRI.createVirtualRegister(
(Opc == AMDGPU::V_WRITELANE_PSEUDO_B64) ? &AMDGPU::VGPR_32RegClass
: &AMDGPU::SGPR_32RegClass);

MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);

MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);

MachineInstr *LoHalf, *HighHalf;
switch (Opc) {
case AMDGPU::V_READLANE_PSEUDO_B64: {
MachineOperand &Src1 = MI.getOperand(2);
auto IsKill = (Src1.isReg() && Src1.isKill());
if (IsKill)
Src1.setIsKill(false);
LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub0)
.add(SrcReg0Sub0)
.add(Src1);

if (IsKill)
Src1.setIsKill(true);
HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), DestSub1)
.add(SrcReg0Sub1)
.add(Src1);
break;
}
case AMDGPU::V_READFIRSTLANE_PSEUDO_B64: {
LoHalf =
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub0)
.add(SrcReg0Sub0);
HighHalf =
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DestSub1)
.add(SrcReg0Sub1);
break;
}
case AMDGPU::V_WRITELANE_PSEUDO_B64: {
MachineOperand &Src1 = MI.getOperand(2);
MachineOperand &Src2 = MI.getOperand(3);
auto IsKill = (Src1.isReg() && Src1.isKill());

const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
const TargetRegisterClass *Src2SubRC =
TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);

MachineOperand SrcReg2Sub0 = TII->buildExtractSubRegOrImm(
MI, MRI, Src2, Src2RC, AMDGPU::sub0, Src2SubRC);

MachineOperand SrcReg2Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src2, Src2RC, AMDGPU::sub1, Src2SubRC);

if (IsKill)
Src1.setIsKill(false);

LoHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub0)
.add(SrcReg0Sub0)
.add(Src1)
.add(SrcReg2Sub0);

if (IsKill)
Src1.setIsKill(true);
HighHalf = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), DestSub1)
.add(SrcReg0Sub1)
.add(Src1)
.add(SrcReg2Sub1);
break;
}
default:
llvm_unreachable("should not occur");
}

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
.addReg(DestSub1)
.addImm(AMDGPU::sub1);

TII->legalizeOperands(*LoHalf);
TII->legalizeOperands(*HighHalf);

MI.eraseFromParent();
return BB;
}

MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {

@@ -5065,6 +5170,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
case AMDGPU::V_READLANE_PSEUDO_B64:
case AMDGPU::V_READFIRSTLANE_PSEUDO_B64:
case AMDGPU::V_WRITELANE_PSEUDO_B64:
return lowerPseudoLaneOp(MI, BB, *getSubtarget(), MI.getOpcode());
case AMDGPU::SI_INIT_M0: {
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
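The custom inserter above splits each 64-bit pseudo into two 32-bit lane instructions over the sub0/sub1 halves plus a REG_SEQUENCE. So a 64-bit readlane like the sketch below (illustrative names, not a test from the patch) is expected to end up as a pair of v_readlane_b32 instructions, one per half:

define i64 @readlane_i64_sketch(i64 %src, i32 %lane) {
  ; expected to select to V_READLANE_PSEUDO_B64, then be expanded into
  ; two V_READLANE_B32 instructions by the custom inserter.
  %r = call i64 @llvm.amdgcn.readlane.i64(i64 %src, i32 %lane)
  ret i64 %r
}
declare i64 @llvm.amdgcn.readlane.i64(i64, i32)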
37 changes: 36 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -288,6 +288,41 @@ def V_SUB_U64_PSEUDO : VPseudoInstSI <
>;
} // End usesCustomInserter = 1, Defs = [VCC]


let usesCustomInserter = 1 in {
def V_READLANE_PSEUDO_B64 : VPseudoInstSI <
(outs SReg_64:$sdst), (ins VReg_64:$src0, SSrc_b32:$src1)>;

def V_READFIRSTLANE_PSEUDO_B64 : VPseudoInstSI <
(outs SReg_64:$sdst), (ins VReg_64:$src0)>;

def V_WRITELANE_PSEUDO_B64 : VPseudoInstSI <
(outs VReg_64:$sdst), (ins SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2)> {
let UseNamedOperandTable = 1;
}
} // End usesCustomInserter = 1

class ReadLanePseudoPat <ValueType vt> : GCNPat <
(vt (int_amdgcn_readlane vt:$src0, i32:$src1)),
(V_READLANE_PSEUDO_B64 VReg_64:$src0, SSrc_b32:$src1)>;

def : ReadLanePseudoPat<i64>;
def : ReadLanePseudoPat<f64>;

class WriteLanePseudoPat <ValueType vt> : GCNPat <
(vt (int_amdgcn_writelane vt:$src0, i32:$src1, vt:$src2)),
(V_WRITELANE_PSEUDO_B64 SReg_64:$src0, SSrc_b32:$src1, VReg_64:$src2)>;

def : WriteLanePseudoPat<i64>;
def : WriteLanePseudoPat<f64>;

class ReadFirstLanePseudoPat <ValueType vt> : GCNPat <
(vt (int_amdgcn_readfirstlane vt:$src0)),
(V_READFIRSTLANE_PSEUDO_B64 VReg_64:$src0)>;

def : ReadFirstLanePseudoPat<i64>;
def : ReadFirstLanePseudoPat<f64>;

let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
(outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
@@ -3405,7 +3440,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
-(int_amdgcn_readfirstlane (i32 imm:$src)),
+(i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
(S_MOV_B32 SReg_32:$src)
>;

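The new GCNPats only cover i64 and f64, so, for example, an f64 writelane as sketched below (illustrative names) should select to V_WRITELANE_PSEUDO_B64 and be split later by the custom inserter, while 32-bit types keep their existing selection path:

define double @writelane_f64_sketch(double %val, i32 %lane, double %old) {
  ; %val and %lane must be uniform across the active threads of the wave;
  ; %old is what every lane other than the selected one returns.
  %r = call double @llvm.amdgcn.writelane.f64(double %val, i32 %lane, double %old)
  ret double %r
}
declare double @llvm.amdgcn.writelane.f64(double, i32, double)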
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -112,7 +112,7 @@ class getVOP1Pat <SDPatternOperator node, VOPProfile P> : LetDummies {
!if(P.HasOMod,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
i1:$clamp, i32:$omod))))],
-[(set P.DstVT:$vdst, (node P.Src0RC32:$src0))]
+[(set P.DstVT:$vdst, (node (P.Src0VT P.Src0RC32:$src0)))]
)
);
}
6 changes: 3 additions & 3 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -56,9 +56,9 @@ define amdgpu_kernel void @mov_dpp8(ptr addrspace(1) %out, i32 %in) #0 {
ret void
}

-; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
-%tmp0 = call i32 @llvm.amdgcn.writelane(i32 0, i32 1, i32 2)
+%tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
store i32 %tmp0, ptr addrspace(1) %out
ret void
}
@@ -237,7 +237,7 @@ declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
-declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1
+declare i32 @llvm.amdgcn.writelane.i32(i32, i32, i32) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1