Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32"

// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"

Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1094,6 +1094,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
"Has v_bitop3_b32/v_bitop3_b16 instructions"
>;

// Subtarget feature "transpose-load-f4f6-insts". When enabled it sets the
// subtarget field HasTransposeLoadF4F6Insts to "true"; the last string is the
// human-readable description of what the feature provides.
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
"HasTransposeLoadF4F6Insts",
"true",
"Has ds_load_tr4/tr6 and global_load_tr4/tr6 instructions"
>;

def FeaturePrngInst : SubtargetFeature<"prng-inst",
"HasPrngInst",
"true",
Expand Down Expand Up @@ -1933,6 +1939,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16ConversionInsts,
FeatureCvtPkF16F32Inst,
FeatureMinimum3Maximum3PKF16,
Expand Down Expand Up @@ -2627,6 +2634,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;

// Predicate for the tr4/tr6 transpose-load instructions. The C++ condition
// queries Subtarget->hasTransposeLoadF4F6Insts(); the assembler-side
// condition requires FeatureTransposeLoadF4F6Insts.
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;

def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">,
AssemblerPredicate<(all_of FeaturePrngInst)>;

Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,19 @@ multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus

// DS transpose-load pseudos for GFX1250+. All four are restricted to wave32
// and are marked mayStore = 0.
let SubtargetPredicate = isGFX1250Plus in {

let WaveSizePredicate = isWave32, mayStore = 0 in {
// The tr4/tr6 variants are additionally gated on HasTransposeLoadF4F6Insts;
// the tr8/tr16 variants below only need the enclosing predicates.
let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>;
defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>;
} // let OtherPredicates = [HasTransposeLoadF4F6Insts]
defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>;
defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
} // let WaveSizePredicate = isWave32, mayStore = 0

} // let SubtargetPredicate = isGFX1250Plus

let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
Expand Down Expand Up @@ -1332,6 +1345,11 @@ defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>;
defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>;
defm DS_BPERMUTE_FI_B32 : DS_Real_gfx12<0x0cd>;

// GFX12 real encodings for the ds_load_tr* pseudos. Note that the opcodes are
// not in element-size order: TR16_B128 is 0x0fc and TR8_B64 is 0x0fd.
defm DS_LOAD_TR4_B64 : DS_Real_gfx12<0x0fa>;
defm DS_LOAD_TR6_B96 : DS_Real_gfx12<0x0fb>;
defm DS_LOAD_TR16_B128 : DS_Real_gfx12<0x0fc>;
defm DS_LOAD_TR8_B64 : DS_Real_gfx12<0x0fd>;

defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
"ds_bvh_stack_push4_pop1_rtn_b32", true>;
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
Expand All @@ -1345,6 +1363,10 @@ let AssemblerPredicate = isGFX12Plus in {
def : AMDGPUMnemonicAlias<"ds_subrev_rtn_u64", "ds_rsub_rtn_u64">;
}

// Aliases that have existed since these instructions were introduced.
// Each maps a mnemonic without the element size (ds_load_tr_b64,
// ds_load_tr_b128) onto the tr8/tr16 spelling, and is only active when
// isGFX1250Plus holds.
def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>;
def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>;

//===----------------------------------------------------------------------===//
// GFX11.
//===----------------------------------------------------------------------===//
Expand Down
89 changes: 77 additions & 12 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1092,19 +1092,23 @@ let SubtargetPredicate = isGFX12Plus in {
}

let WaveSizePredicate = isWave32 in {
let Mnemonic = "global_load_tr_b128" in
defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
let Mnemonic = "global_load_tr_b64" in
defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w32", VReg_64>;
}
let WaveSizePredicate = isWave64 in {
let Mnemonic = "global_load_tr_b128" in
defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
let Mnemonic = "global_load_tr_b64" in
defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128", VReg_128>;
defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64", VReg_64>;
}
} // End SubtargetPredicate = isGFX12Plus

// Wave64 forms of global_load_tr_b128 / global_load_tr_b64, defined only
// under isGFX12PlusNot12_50. The pseudo names carry a _w64 suffix, while the
// Mnemonic field is overridden to the suffix-free spelling. The destination
// register classes here are VReg_64 and VGPR_32 respectively.
let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
let Mnemonic = "global_load_tr_b128" in
defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
let Mnemonic = "global_load_tr_b64" in
defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
}

let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use HasTransposeLoadF4F6Insts instead of isGFX1250Plus?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was later changed to HasTransposeLoadF4F6Insts. When I made that change, I also submitted it downstream. Unfortunately, you are merging an older point from upstream into the ToT downstream branch.

defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>;
defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>;
}

let SubtargetPredicate = isGFX10Plus in {
defm GLOBAL_ATOMIC_FCMPSWAP :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
Expand Down Expand Up @@ -2809,6 +2813,13 @@ multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
defm _SADDR : VFLAT_Real_gfx12<op, name>;
}

// Emits the GFX12 real encodings for both the base pseudo and its _SADDR
// variant at opcode `op`, with the assembler predicate narrowed to
// isGFX12Not12_50.
multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
let AssemblerPredicate = isGFX12Not12_50 in {
defm "" : VFLAT_Real_gfx12<op>;
defm _SADDR : VFLAT_Real_gfx12<op>;
}
}

multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic> :
VFLAT_Aliases_gfx12<name> {
Expand Down Expand Up @@ -2951,8 +2962,8 @@ defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_a
defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>;

defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx12<0x057>;
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx12<0x058>;
defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>;
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>;

defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>;
defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>;
Expand Down Expand Up @@ -2992,6 +3003,60 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_
defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;

//===----------------------------------------------------------------------===//
// GFX1250
//===----------------------------------------------------------------------===//

// Defines the `_gfx1250` real instruction for the FLAT pseudo named NAME.
//   op   - 8-bit opcode passed to VFLAT_Real.
//   name - mnemonic to use; defaults to the pseudo's own Mnemonic.
// The record is tagged with the GFX1250 encoding family and decoder
// namespace, and its assembler predicate is isGFX125xOnly.
multiclass VFLAT_Real_gfx1250<bits<8> op,
string name = get_FLAT_ps<NAME>.Mnemonic> {
defvar ps = !cast<FLAT_Pseudo>(NAME);
def _gfx1250 : VFLAT_Real<op, ps, name>,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250> {
let AssemblerPredicate = isGFX125xOnly;
let DecoderNamespace = "GFX1250";

// Bits 25-24 are taken from the pseudo's is_flat_global / is_flat_scratch
// flags, in that order.
let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
}
}

// When `name` differs from the pseudo's own Mnemonic, emits a MnemonicAlias
// from the pseudo's Mnemonic to `name`, active only under isGFX125xOnly.
// Emits nothing when the two strings are equal.
multiclass VFLAT_Aliases_gfx1250<string name> {
defvar ps = get_FLAT_ps<NAME>;
if !ne(ps.Mnemonic, name) then
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX125xOnly]>;
}

// Real encoding for the base (unsuffixed) pseudo, plus the mnemonic alias
// inherited from VFLAT_Aliases_gfx1250 when `name` renames the instruction.
multiclass VFLAT_Real_Base_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
VFLAT_Aliases_gfx1250<name> {
defm "" : VFLAT_Real_gfx1250<op, name>;
}

// Real encoding for the _RTN-suffixed pseudo. `name` has no default here and
// must be supplied by the caller.
multiclass VFLAT_Real_RTN_gfx1250<bits<8> op, string name> {
defm _RTN : VFLAT_Real_gfx1250<op, name>;
}

// Real encoding for the _SADDR-suffixed pseudo. `name` has no default here
// and must be supplied by the caller.
multiclass VFLAT_Real_SADDR_gfx1250<bits<8> op, string name> {
defm _SADDR : VFLAT_Real_gfx1250<op, name>;
}

// Real encoding for the _SADDR_RTN-suffixed pseudo. `name` has no default
// here and must be supplied by the caller.
multiclass VFLAT_Real_SADDR_RTN_gfx1250<bits<8> op, string name> {
defm _SADDR_RTN : VFLAT_Real_gfx1250<op, name>;
}

// Combines the base encoding (including any rename alias) with the _SADDR
// encoding, both at the same opcode and mnemonic.
multiclass VFLAT_Real_AllAddr_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
VFLAT_Real_Base_gfx1250<op, name>,
VFLAT_Real_SADDR_gfx1250<op, name>;

// Combines all four encodings for one opcode: base, _SADDR, _RTN and
// _SADDR_RTN. No instantiation of this multiclass is visible in this section.
multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
VFLAT_Real_AllAddr_gfx1250<op, name>,
VFLAT_Real_RTN_gfx1250<op, name>,
VFLAT_Real_SADDR_RTN_gfx1250<op, name>;

// GFX1250 real encodings for the global transpose loads. The two existing
// w32 pseudos are given explicit tr16/tr8 mnemonics; VFLAT_Aliases_gfx1250
// emits an alias from the pseudo's own mnemonic whenever it differs from the
// name passed here.
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;

// The tr4/tr6 variants keep their pseudo mnemonics (no name override).
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;

def True16D16Table : GenericTable {
let FilterClass = "True16D16Table";
let CppTypeName = "True16D16Info";
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasPseudoScalarTrans = false;
bool HasRestrictedSOffset = false;
bool HasBitOp3Insts = false;
bool HasTransposeLoadF4F6Insts = false;
bool HasPrngInst = false;
bool HasBVHDualAndBVH8Insts = false;
bool HasPermlane16Swap = false;
Expand Down Expand Up @@ -1372,6 +1373,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasMinimum3Maximum3PKF16;
}

/// \returns the value of HasTransposeLoadF4F6Insts, i.e. whether the
/// "transpose-load-f4f6-insts" feature (ds_load_tr4/tr6 and
/// global_load_tr4/tr6 instructions) is enabled for this subtarget.
bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }

/// \returns true if the target has s_wait_xcnt insertion. Supported for
/// GFX1250.
bool hasWaitXCnt() const { return HasWaitXcnt; }
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["gfx1250-insts"] = true;
Features["bitop3-insts"] = true;
Features["prng-inst"] = true;
Features["transpose-load-f4f6-insts"] = true;
Features["fp8-conversion-insts"] = true;
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
Expand Down
Loading