Skip to content

Commit ad19db1

Browse files
rampitec authored and mahesh-attarde committed
[AMDGPU] Add V_ADD|SUB|MUL_U64 gfx1250 opcodes (llvm#150291)
1 parent 49f8f69 commit ad19db1

23 files changed

+2506
-250
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1355,6 +1355,10 @@ def FeatureLshlAddU64Inst
13551355
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
13561356
"Has v_lshl_add_u64 instruction">;
13571357

1358+
def FeatureAddSubU64Insts
1359+
: SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
1360+
"Has v_add_u64 and v_sub_u64 instructions">;
1361+
13581362
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
13591363
"HasVMemToLDSLoad",
13601364
"true",
@@ -2010,6 +2014,7 @@ def FeatureISAVersion12_50 : FeatureSet<
20102014
FeatureMemoryAtomicFAddF32DenormalSupport,
20112015
FeatureKernargPreload,
20122016
FeatureLshlAddU64Inst,
2017+
FeatureAddSubU64Insts,
20132018
FeatureLdsBarrierArriveAtomic,
20142019
FeatureSetPrioIncWgInst,
20152020
]>;
@@ -2787,6 +2792,9 @@ def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
27872792
def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
27882793
AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
27892794

2795+
def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
2796+
AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
2797+
27902798
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
27912799
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
27922800

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
42084208
assert(Ty.isScalar());
42094209

42104210
unsigned Size = Ty.getSizeInBits();
4211+
if (ST.hasVectorMulU64() && Size == 64)
4212+
return true;
4213+
42114214
unsigned NumParts = Size / 32;
42124215
assert((Size % 32) == 0);
42134216
assert(NumParts >= 2);

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
25282528
// Special case for s_mul_u64. There is not a vector equivalent of
25292529
// s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
25302530
// multiplications.
2531-
if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
2531+
if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL &&
2532+
DstTy.getSizeInBits() == 64) {
25322533
applyMappingSMULU64(B, OpdMapper);
25332534
return;
25342535
}
@@ -3973,7 +3974,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
39733974
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
39743975
OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
39753976
} else {
3976-
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3977+
if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64())
3978+
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3979+
else
3980+
OpdsMapping[0] =
3981+
getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
39773982
unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
39783983
OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
39793984

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
267267
bool HasMinimum3Maximum3F16 = false;
268268
bool HasMinimum3Maximum3PKF16 = false;
269269
bool HasLshlAddU64Inst = false;
270+
bool HasAddSubU64Insts = false;
270271
bool HasPointSampleAccel = false;
271272
bool HasLdsBarrierArriveAtomic = false;
272273
bool HasSetPrioIncWgInst = false;
@@ -1500,6 +1501,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
15001501

15011502
bool hasVOPD3() const { return GFX1250Insts; }
15021503

1504+
// \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
1505+
bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
1506+
1507+
// \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
1508+
bool hasVectorMulU64() const { return GFX1250Insts; }
1509+
15031510
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
15041511
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
15051512

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -874,7 +874,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
874874

875875
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
876876

877-
if (Subtarget->hasScalarSMulU64())
877+
if (Subtarget->hasVectorMulU64())
878+
setOperationAction(ISD::MUL, MVT::i64, Legal);
879+
else if (Subtarget->hasScalarSMulU64())
878880
setOperationAction(ISD::MUL, MVT::i64, Custom);
879881

880882
if (Subtarget->hasMad64_32())
@@ -5421,6 +5423,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
54215423
MachineOperand &Src0 = MI.getOperand(1);
54225424
MachineOperand &Src1 = MI.getOperand(2);
54235425

5426+
if (ST.hasAddSubU64Insts()) {
5427+
auto I = BuildMI(*BB, MI, DL,
5428+
TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5429+
: AMDGPU::V_SUB_U64_e64),
5430+
Dest.getReg())
5431+
.add(Src0)
5432+
.add(Src1)
5433+
.addImm(0); // clamp
5434+
TII->legalizeOperands(*I);
5435+
MI.eraseFromParent();
5436+
return BB;
5437+
}
5438+
54245439
if (IsAdd && ST.hasLshlAddU64Inst()) {
54255440
auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
54265441
Dest.getReg())

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7361,6 +7361,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
73617361
}
73627362

73637363
case AMDGPU::S_MUL_U64:
7364+
if (ST.hasVectorMulU64()) {
7365+
NewOpcode = AMDGPU::V_MUL_U64_e64;
7366+
break;
7367+
}
73647368
// Split s_mul_u64 in 32-bit vector multiplications.
73657369
splitScalarSMulU64(Worklist, Inst, MDT);
73667370
Inst.eraseFromParent();

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2914,6 +2914,7 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
29142914
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
29152915
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
29162916
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
2917+
def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
29172918
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
29182919
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
29192920
def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,17 @@ let isAdd = 1 in {
925925
defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">;
926926
}
927927

928+
let isReMaterializable = 1 in {
929+
let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in {
930+
defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>;
931+
// We don't actually have something like V_SUBREV_U64 so V_SUB_U64 can't be treated as commutable.
932+
let isCommutable = 0 in
933+
defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>;
934+
} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit]
935+
let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in
936+
defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>;
937+
} // End isReMaterializable = 1
938+
928939
} // End isCommutable = 1
929940

930941
// These are special and do not read the exec mask.
@@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
17541765
VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>,
17551766
VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>;
17561767

1768+
multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> :
1769+
VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>;
1770+
17571771
multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
17581772
string asmName> {
17591773
defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
@@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>;
18431857

18441858
defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>;
18451859
defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>;
1860+
defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>;
1861+
defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>;
1862+
defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>;
18461863

18471864
//===----------------------------------------------------------------------===//
18481865
// GFX11.

0 commit comments

Comments
 (0)