Skip to content

Commit d992382

Browse files
authored
[AMDGPU] Implement v_mad_u32/v_mad_nc_u|i64_u32 on gfx1250 (#151226)
1 parent 3212704 commit d992382

File tree

12 files changed, with 417 additions and 79 deletions.

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1389,6 +1389,9 @@ def FeatureAddSubU64Insts
13891389
: SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true",
13901390
"Has v_add_u64 and v_sub_u64 instructions">;
13911391

1392+
def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst",
1393+
"true", "Has v_mad_u32 instruction">;
1394+
13921395
def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
13931396
"HasVMemToLDSLoad",
13941397
"true",
@@ -2049,6 +2052,7 @@ def FeatureISAVersion12_50 : FeatureSet<
20492052
FeatureVmemPrefInsts,
20502053
FeatureLshlAddU64Inst,
20512054
FeatureAddSubU64Insts,
2055+
FeatureMadU32Inst,
20522056
FeatureLdsBarrierArriveAtomic,
20532057
FeatureSetPrioIncWgInst,
20542058
]>;
@@ -2839,6 +2843,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
28392843
def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">,
28402844
AssemblerPredicate<(all_of FeatureAddSubU64Insts)>;
28412845

2846+
def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">,
2847+
AssemblerPredicate<(all_of FeatureMadU32Inst)>;
2848+
28422849
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
28432850
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
28442851

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
11341134
SDLoc SL(N);
11351135
bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
11361136
unsigned Opc;
1137+
bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
11371138
if (Subtarget->hasMADIntraFwdBug())
11381139
Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
11391140
: AMDGPU::V_MAD_U64_U32_gfx11_e64;
1141+
else if (UseNoCarry)
1142+
Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
11401143
else
11411144
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
11421145

11431146
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
11441147
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
11451148
Clamp };
1149+
1150+
if (UseNoCarry) {
1151+
MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1152+
ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1153+
CurDAG->RemoveDeadNode(N);
1154+
return;
1155+
}
1156+
11461157
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
11471158
}
11481159

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
574574
MachineBasicBlock *BB = I.getParent();
575575
MachineFunction *MF = BB->getParent();
576576
const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
577+
bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
578+
MRI->use_nodbg_empty(I.getOperand(1).getReg());
577579

578580
unsigned Opc;
579581
if (Subtarget->hasMADIntraFwdBug())
580582
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
581583
: AMDGPU::V_MAD_I64_I32_gfx11_e64;
584+
else if (UseNoCarry)
585+
Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
586+
: AMDGPU::V_MAD_NC_I64_I32_e64;
582587
else
583588
Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
589+
590+
if (UseNoCarry)
591+
I.removeOperand(1);
592+
584593
I.setDesc(TII.get(Opc));
585594
I.addOperand(*MF, MachineOperand::CreateImm(0));
586595
I.addImplicitDefUseOperands(*MF);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
273273
bool HasMinimum3Maximum3PKF16 = false;
274274
bool HasLshlAddU64Inst = false;
275275
bool HasAddSubU64Insts = false;
276+
bool HasMadU32Inst = false;
276277
bool HasPointSampleAccel = false;
277278
bool HasLdsBarrierArriveAtomic = false;
278279
bool HasSetPrioIncWgInst = false;
@@ -1521,9 +1522,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
15211522
// \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
15221523
bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
15231524

1525+
// \returns true if the target has V_MAD_U32 instruction.
1526+
bool hasMadU32Inst() const { return HasMadU32Inst; }
1527+
15241528
// \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
15251529
bool hasVectorMulU64() const { return GFX1250Insts; }
15261530

1531+
// \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
1532+
// instructions.
1533+
bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
1534+
15271535
// \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
15281536
bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
15291537

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,14 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
5757
def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>;
5858

5959
def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>;
60+
def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> {
61+
let HasExtVOP3DPP = 0;
62+
let HasExt64BitDPP = 1;
63+
}
64+
def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>;
65+
def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> {
66+
let HasClamp = 1;
67+
}
6068
} // End HasExt64BitDPP = 1;
6169

6270
//===----------------------------------------------------------------------===//
@@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32
152160
defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">;
153161
defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
154162

163+
let SchedRW = [WriteIntMul] in {
164+
let SubtargetPredicate = HasMadU32Inst in
165+
defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>;
166+
let SubtargetPredicate = isGFX1250Plus in {
167+
defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>;
168+
defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>;
169+
}
170+
}
171+
155172
let SchedRW = [WriteDoubleAdd] in {
156173
let FPDPRounding = 1 in {
157174
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">;
@@ -848,6 +865,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
848865
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
849866
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
850867

868+
let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in
869+
def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>;
870+
851871
def : GCNPat<
852872
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
853873
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
@@ -1746,6 +1766,10 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
17461766
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
17471767
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
17481768

1769+
defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>;
1770+
defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>;
1771+
defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>;
1772+
17491773
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
17501774
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
17511775
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Lines changed: 58 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -801,15 +801,15 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
801801
; GFX1250-NEXT: s_wait_kmcnt 0x0
802802
; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
803803
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
804-
; GFX1250-NEXT: v_mul_lo_u32 v0, v6, v5
805-
; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
806-
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
807-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
808-
; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
809-
; GFX1250-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v11, v8
804+
; GFX1250-NEXT: v_mul_lo_u32 v0, v7, v4
805+
; GFX1250-NEXT: v_mad_u32 v5, v6, v5, v0
806+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v3, 0
807+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
808+
; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
809+
; GFX1250-NEXT: v_mov_b32_e32 v8, v1
810810
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
811-
; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11]
812-
; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v3, v[4:5]
811+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9]
812+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5]
813813
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
814814
; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
815815
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -1206,11 +1206,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
12061206
; GFX1250-NEXT: s_wait_kmcnt 0x0
12071207
; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
12081208
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1209-
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
1210-
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v9, v5, v[0:1]
1211-
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
1209+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v6, 0
1210+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
1211+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
12121212
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1213-
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v2, v4, v[10:11]
1213+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11]
12141214
; GFX1250-NEXT: v_mov_b32_e32 v12, v1
12151215
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
12161216
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1220,15 +1220,13 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
12201220
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
12211221
; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
12221222
; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
1223-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
1224-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
1225-
; GFX1250-NEXT: v_mov_b32_e32 v1, v6
1226-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1227-
; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9]
1223+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1224+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
1225+
; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
1226+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
12281227
; GFX1250-NEXT: v_mov_b32_e32 v2, v7
1229-
; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9]
1230-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
1231-
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
1228+
; GFX1250-NEXT: v_mad_u32 v3, v3, v4, v1
1229+
; GFX1250-NEXT: v_mov_b32_e32 v1, v6
12321230
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
12331231
%result = mul i128 %num, %den
12341232
ret i128 %result
@@ -2856,90 +2854,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
28562854
; GFX1250: ; %bb.0:
28572855
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
28582856
; GFX1250-NEXT: s_wait_kmcnt 0x0
2859-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v14, 0
2860-
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], null, v0, v12, 0
2861-
; GFX1250-NEXT: v_mul_lo_u32 v26, v6, v9
2857+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
2858+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
2859+
; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
28622860
; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
28632861
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2864-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17]
2862+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
28652863
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
28662864
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
28672865
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
2868-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17]
2866+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
28692867
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
28702868
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
28712869
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
2872-
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null, v0, v10, 0
2870+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
28732871
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2874-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17]
2872+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
28752873
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
28762874
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
28772875
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
2878-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17]
2879-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2876+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
2877+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
28802878
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
2881-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17]
2882-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
2883-
; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
2879+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
2880+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2881+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
2882+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
28842883
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
2885-
; GFX1250-NEXT: v_mov_b32_e32 v20, v19
2886-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
2887-
; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0
2888-
; GFX1250-NEXT: v_mov_b32_e32 v21, v22
2889-
; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10
2890-
; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
2891-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
2892-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo
2893-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
2894-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2895-
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
2884+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2885+
; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
2886+
; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
2887+
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
2888+
; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
2889+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2890+
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
2891+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
2892+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2893+
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
2894+
; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
28962895
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
2897-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
2898-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2896+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
28992897
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
29002898
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
2901-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29022899
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
2900+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
2901+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
29032902
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
2904-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
2905-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
29062903
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
29072904
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
29082905
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
2906+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
29092907
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
29102908
; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
2911-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
29122909
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
2910+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
29132911
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
29142912
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
29152913
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
29162914
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
29172915
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
29182916
; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
2919-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
29202917
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
2918+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
29212919
; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
29222920
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29232921
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
2924-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27, v13, s2
2922+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
29252923
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29262924
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
2927-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6, v11, s2
2928-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2925+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
2926+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
29292927
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
2928+
; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
29302929
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
29312930
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29322931
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
29332932
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
29342933
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29352934
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
2936-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
2935+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
29372936
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2938-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0
2939-
; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1]
2940-
; GFX1250-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
2941-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
2942-
; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8
2937+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
2938+
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
2939+
; GFX1250-NEXT: v_mov_b32_e32 v0, v16
29432940
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
29442941
%result = mul i256 %num, %den
29452942
ret i256 %result
@@ -3004,7 +3001,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
30043001
; GFX1250: ; %bb.0:
30053002
; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
30063003
; GFX1250-NEXT: s_wait_loadcnt 0x0
3007-
; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
3004+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0
30083005
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
30093006
; GFX1250-NEXT: s_endpgm
30103007
%val = load i32, ptr addrspace(1) %in, align 4
@@ -3195,7 +3192,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
31953192
; GFX1250: ; %bb.0:
31963193
; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
31973194
; GFX1250-NEXT: s_wait_loadcnt 0x0
3198-
; GFX1250-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
3195+
; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0
31993196
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
32003197
; GFX1250-NEXT: s_endpgm
32013198
%val = load i32, ptr addrspace(1) %in, align 4

0 commit comments

Comments
 (0)