5 changes: 5 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -640,6 +640,11 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//

TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")

TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
26 changes: 26 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -621,6 +621,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds:
case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds_d2:
case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds:
case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds_d2: {
Intrinsic::ID IID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds:
IID = Intrinsic::amdgcn_tensor_load_to_lds;
break;
case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds_d2:
IID = Intrinsic::amdgcn_tensor_load_to_lds_d2;
break;
case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds:
IID = Intrinsic::amdgcn_tensor_store_from_lds;
break;
case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds_d2:
IID = Intrinsic::amdgcn_tensor_store_from_lds_d2;
break;
}

SmallVector<Value *, 5> Args;
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
Args.push_back(EmitScalarExpr(E->getArg(i)));
llvm::Function *F = CGM.getIntrinsic(IID, {});
return Builder.CreateCall(F, {Args});
}
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
// Should this have asan instrumentation?
return emitBuiltinWithOneOverloadedType<5>(*this, E,
@@ -0,0 +1,46 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250

typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));

// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
{
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
{
__builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
{
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
{
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
}
11 changes: 11 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,6 +1,9 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s

typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));

void test_setprio_inc_wg(short a) {
__builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
}
@@ -16,3 +19,11 @@ void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
}

void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
{
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
__builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
}
31 changes: 31 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3580,6 +3580,37 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
[IntrNoMem, IntrSpeculatable]
>;

class AMDGPUTensorLoadStore:
Intrinsic<
[],
[llvm_v4i32_ty, // D# group 0
llvm_v8i32_ty, // D# group 1
llvm_v4i32_ty, // D# group 2
llvm_v4i32_ty, // D# group 3
llvm_i32_ty], // cachepolicy:
// bits [0-2] = th
// bits [3-4] = scope
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;

class AMDGPUTensorLoadStoreD2:
Intrinsic<
[],
[llvm_v4i32_ty, // D# group 0
llvm_v8i32_ty, // D# group 1
llvm_i32_ty], // cachepolicy:
// bits [0-2] = th
// bits [3-4] = scope
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;

def int_amdgcn_tensor_load_to_lds : AMDGPUTensorLoadStore;
def int_amdgcn_tensor_store_from_lds : AMDGPUTensorLoadStore;
def int_amdgcn_tensor_load_to_lds_d2 : AMDGPUTensorLoadStoreD2;
def int_amdgcn_tensor_store_from_lds_d2 : AMDGPUTensorLoadStoreD2;
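As a point of reference (not part of this change): the trailing i32 cachepolicy operand packs the temporal hint into bits [2:0] and the scope into bits [4:3]. A minimal C++ sketch of composing that immediate follows; the concrete field values are illustrative assumptions, only the bit layout comes from the comments above.

// Compose the cachepolicy immediate from its two fields, per the bit layout
// documented above; the specific TH/Scope encodings here are placeholders.
constexpr unsigned TH = 1;                           // hypothetical temporal-hint value, bits [2:0]
constexpr unsigned Scope = 2;                        // hypothetical scope value, bits [4:3]
constexpr unsigned CachePolicy = TH | (Scope << 3);  // value passed as the final immediate operand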

/// Emit an addrspacecast without null pointer checking.
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
30 changes: 30 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3348,6 +3348,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
case Intrinsic::amdgcn_tensor_load_to_lds:
case Intrinsic::amdgcn_tensor_store_from_lds: {
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
constrainOpWithReadfirstlane(B, MI, 3);
constrainOpWithReadfirstlane(B, MI, 4);
return;
}
case Intrinsic::amdgcn_tensor_load_to_lds_d2:
case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
return;
}
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5354,6 +5368,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_pops_exiting_wave_id:
return getDefaultMappingSOP(MI);
case Intrinsic::amdgcn_tensor_load_to_lds_d2:
case Intrinsic::amdgcn_tensor_store_from_lds_d2:
case Intrinsic::amdgcn_tensor_load_to_lds:
case Intrinsic::amdgcn_tensor_store_from_lds: {
// Lie and claim everything is legal, even though all operands need to be
// SGPRs; applyMapping will have to handle that with readfirstlane.
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
if (MI.getOperand(I).isReg()) {
Register Reg = MI.getOperand(I).getReg();
auto OpBank = getRegBankID(Reg, MRI);
unsigned Size = getSizeInBits(Reg, MRI, *TRI);
OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
}
}
Comment on lines +5377 to +5384

@tgymnich (Member) commented on Jul 2, 2025:

Suggested change
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
if (MI.getOperand(I).isReg()) {
Register Reg = MI.getOperand(I).getReg();
auto OpBank = getRegBankID(Reg, MRI);
unsigned Size = getSizeInBits(Reg, MRI, *TRI);
OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
}
}
for (MachineOperand &MO : MI.all_uses()) {
Register Reg = MO.getReg();
auto OpBank = getRegBankID(Reg, MRI);
unsigned Size = getSizeInBits(Reg, MRI, *TRI);
OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
}

@changpeng (Contributor, Author) commented on Jul 2, 2025:

The suggested code does not compile. Thanks

@tgymnich (Member) replied:

sorry. Updated the code accordingly.

@changpeng (Contributor, Author) replied:

> sorry. Updated the code accordingly.

What is the value of "I" then? We need a way to map the operand back to the index.
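
A minimal sketch (an assumption, not code from this patch) of how the suggested all_uses() form could still recover the operand index the mapping needs; it presumes MachineOperand::getOperandNo() is available in this tree, otherwise the index can be derived from the operand's position relative to MI.operands_begin():

for (MachineOperand &MO : MI.all_uses()) {
  // Map the use operand back to its index in the instruction so the result
  // lands in the matching OpdsMapping slot.
  unsigned I = MO.getOperandNo();
  Register Reg = MO.getReg();
  auto OpBank = getRegBankID(Reg, MRI);
  unsigned Size = getSizeInBits(Reg, MRI, *TRI);
  OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
}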

break;
}
case Intrinsic::amdgcn_s_prefetch_data: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
29 changes: 23 additions & 6 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1784,6 +1784,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst, const OperandVector &Operands);
bool validateTensorR128(const MCInst &Inst);
bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
bool validateTrue16OpSel(const MCInst &Inst);
@@ -4280,6 +4281,20 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
return true;
}

bool AMDGPUAsmParser::validateTensorR128(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);

if ((Desc.TSFlags & SIInstrFlags::TENSOR_CNT) == 0)
return true;

int R128Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
if (R128Idx >= 0 && Inst.getOperand(R128Idx).getImm())
return false;

return true;
}

static bool IsRevOpcode(const unsigned Opcode)
{
switch (Opcode) {
@@ -5113,14 +5128,11 @@ bool AMDGPUAsmParser::validateTHAndScopeBits(const MCInst &Inst,
return PrintError("scope and th combination is not valid");
}

bool IsStore = TID.mayStore();
bool IsAtomic =
TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);

if (IsAtomic) {
unsigned THType = AMDGPU::getTemporalHintType(TID);
if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
if (!(CPol & AMDGPU::CPol::TH_TYPE_ATOMIC))
return PrintError("invalid th value for atomic instructions");
} else if (IsStore) {
} else if (THType == AMDGPU::CPol::TH_TYPE_STORE) {
if (!(CPol & AMDGPU::CPol::TH_TYPE_STORE))
return PrintError("invalid th value for store instructions");
} else {
@@ -5205,6 +5217,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
Error(IDLoc, "missing dim operand");
return false;
}
if (!validateTensorR128(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
"instruction must set modifier r128=0");
return false;
}
if (!validateMIMGMSAA(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
"invalid dim; must be MSAA type");
10 changes: 3 additions & 7 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -173,13 +173,12 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,

const unsigned Opcode = MI->getOpcode();
const MCInstrDesc &TID = MII.get(Opcode);
bool IsStore = TID.mayStore();
bool IsAtomic =
TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
unsigned THType = AMDGPU::getTemporalHintType(TID);
bool IsStore = (THType == AMDGPU::CPol::TH_TYPE_STORE);

O << " th:";

if (IsAtomic) {
if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
O << "TH_ATOMIC_";
if (TH & AMDGPU::CPol::TH_ATOMIC_CASCADE) {
if (Scope >= AMDGPU::CPol::SCOPE_DEV)
@@ -196,9 +195,6 @@
if (!IsStore && TH == AMDGPU::CPol::TH_RESERVED)
O << formatHex(TH);
else {
// This will default to printing load variants when neither MayStore nor
// MayLoad flag is present which is the case with instructions like
// image_get_resinfo.
O << (IsStore ? "TH_STORE_" : "TH_LOAD_");
switch (TH) {
case AMDGPU::CPol::TH_NT:
94 changes: 94 additions & 0 deletions llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2019,3 +2019,97 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O_nortn, IMAGE_SAMPLE_CD_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O_nortn, IMAGE_SAMPLE_CD_CL_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O_nortn, IMAGE_SAMPLE_C_CD_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_nortn>;

//===----------------------------------------------------------------------===//
// VIMAGE Tensor Instructions
//===----------------------------------------------------------------------===//

class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
InstSI<(outs ), (ins ), "", []>,
SIMCInstr<opName#!if(_UpTo2D, "_D2", ""), SIEncodingFamily.NONE> {

let isPseudo = 1;
let isCodeGenOnly = 1;
string Mnemonic = opName;

let VALU = 1;
let maybeAtomic = 0;
let TENSOR_CNT = 1;
let mayLoad = 1;
let mayStore = 1;
let Uses = [EXEC, TENSORcnt];
let Defs = [TENSORcnt];
let SchedRW = [WriteVMEM, WriteLDS];
let UseNamedOperandTable = 1;
let hasSideEffects = 0;

bit UpTo2D = _UpTo2D;
let InOperandList = !if(UpTo2D, (ins SReg_128:$vaddr0, SReg_256:$vaddr1, R128A16:$r128, CPol:$cpol),
(ins SReg_128:$vaddr0, SReg_256:$vaddr1, SReg_128:$vaddr2,
SReg_128:$vaddr3, R128A16:$r128, CPol:$cpol));
string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol";
}

let SubtargetPredicate = isGFX1250Plus in {
def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
} // End SubtargetPredicate = isGFX1250Plus.

class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
(node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)),
(inst $vaddr0, $vaddr1, $vaddr2, $vaddr3, 0, $cpol)
>;

class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
(node v4i32:$vaddr0, v8i32:$vaddr1, (i32 timm:$cpol)),
(inst $vaddr0, $vaddr1, 0, $cpol)
>;

let SubtargetPredicate = isGFX1250Plus in {
def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>;
def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>;
def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>;
def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>;
}

class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
VIMAGEe<op> {

// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let TSFlags = ps.TSFlags;
let mayLoad = ps.mayLoad;
let mayStore = ps.mayStore;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let SchedRW = ps.SchedRW;

// D# group 2 and 3 set to NULL for 2D or less.
let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);

// set to 0 based on SPG.
let vaddr4 = 0;
let rsrc = 0;
let vdata = 0;
let d16 = 0;
let a16 = 0;
let tfe = 0;
let dmask = 1; // sp3
let dim = 1; // sp3
}

multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in {
foreach DSuffix = ["_D2", ""] in {
defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>;
}
}
}

defm TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc4>;
defm TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc5>;
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIDefines.h
@@ -107,8 +107,7 @@ enum : uint64_t {
DisableWQM = UINT64_C(1) << 36,
Gather4 = UINT64_C(1) << 37,

// Reserved, must be 0.
Reserved0 = UINT64_C(1) << 38,
TENSOR_CNT = UINT64_C(1) << 38,

SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,