Skip to content

Commit 57c1e01

Browse files
authored
[AMDGPU] Don't allow wgp mode on gfx1250 (#153680)
- gfx1250 only supports cu mode
1 parent a1529cd commit 57c1e01

File tree

13 files changed

+1623
-439
lines changed

13 files changed

+1623
-439
lines changed

clang/test/CodeGenHIP/hip-cumode.hip

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,20 @@
55
// RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
66
// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
77
// RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
8-
// RUN: %s 2>&1 | FileCheck --check-prefixes=NOWGP,WARN-CUMODE %s
8+
// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=NOWGP,WARN-CUMODE %s
99
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
1010
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
1111
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
1212
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
1313
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
1414
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
15-
// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
15+
// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
16+
// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
17+
// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
18+
// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
19+
// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
20+
// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=NOWGP,WARN-CUMODE %s
21+
// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
1622
// NOWGP-NOT: .amdhsa_workgroup_processor_mode
1723
// CUMODE-ON: .amdhsa_workgroup_processor_mode 0
1824
// CUMODE-OFF: .amdhsa_workgroup_processor_mode 1

clang/test/Driver/hip-macros.hip

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,27 @@
2727
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
2828
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
2929
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
30-
// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
30+
// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
3131
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
3232
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
3333
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
3434
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
3535
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
3636
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
37+
// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
38+
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
39+
// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
40+
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
41+
// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
42+
// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
3743

3844
// Check no duplicate warnings.
3945
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
4046
// RUN: -mno-cumode -mno-cumode \
41-
// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
47+
// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
4248

43-
// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
44-
// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
49+
// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
50+
// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
4551
// CUMODE-ON-DAG: #define __AMDGCN_CUMODE__ 1
4652
// CUMODE-OFF-DAG: #define __AMDGCN_CUMODE__ 0
4753

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1144,8 +1144,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
11441144
CreateExpr(STM.getWavefrontSize()), Ctx),
11451145
CreateExpr(1ULL << ScratchAlignShift));
11461146

1147-
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1147+
if (STM.supportsWGP()) {
11481148
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1149+
}
1150+
1151+
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
11491152
ProgInfo.MemOrdered = 1;
11501153
ProgInfo.FwdProgress = 1;
11511154
}

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6270,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
62706270
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
62716271
ExprVal, ValRange);
62726272
} else if (ID == ".amdhsa_workgroup_processor_mode") {
6273-
if (IVersion.Major < 10)
6274-
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
6273+
if (!supportsWGP(getSTI()))
6274+
return Error(IDRange.Start,
6275+
"directive unsupported on " + getSTI().getCPU(), IDRange);
62756276
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
62766277
COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal,
62776278
ValRange);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
390390
/// the original value.
391391
bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
392392

393-
bool supportsWGP() const { return getGeneration() >= GFX10; }
393+
bool supportsWGP() const {
394+
if (GFX1250Insts)
395+
return false;
396+
return getGeneration() >= GFX10;
397+
}
394398

395399
bool hasIntClamp() const {
396400
return HasIntClamp;

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
563563
PrintField(KD.compute_pgm_rsrc3,
564564
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
565565
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split");
566-
if (IVersion.Major >= 10) {
566+
if (AMDGPU::supportsWGP(STI))
567567
PrintField(KD.compute_pgm_rsrc1,
568568
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
569569
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
570570
".amdhsa_workgroup_processor_mode");
571+
if (IVersion.Major >= 10) {
571572
PrintField(KD.compute_pgm_rsrc1,
572573
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
573574
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED,

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1167,12 +1167,21 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
11671167

11681168
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
11691169
// "Per CU" really means "per whatever functional block the waves of a
1170-
// workgroup must share". For gfx10 in CU mode this is the CU, which contains
1170+
// workgroup must share".
1171+
1172+
// GFX12.5 only supports CU mode, in which each CU contains four SIMDs.
1173+
if (isGFX1250(*STI)) {
1174+
assert(STI->getFeatureBits().test(FeatureCuMode));
1175+
return 4;
1176+
}
1177+
1178+
// For gfx10 in CU mode the functional block is the CU, which contains
11711179
// two SIMDs.
11721180
if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode))
11731181
return 2;
1174-
// Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
1175-
// two CUs, so a total of four SIMDs.
1182+
1183+
// Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP
1184+
// contains two CUs, so a total of four SIMDs.
11761185
return 4;
11771186
}
11781187

@@ -2480,6 +2489,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) {
24802489
return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
24812490
}
24822491

2492+
bool supportsWGP(const MCSubtargetInfo &STI) {
2493+
if (isGFX1250(STI))
2494+
return false;
2495+
return isGFX10Plus(STI);
2496+
}
2497+
24832498
bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); }
24842499

24852500
bool isNotGFX10Plus(const MCSubtargetInfo &STI) {

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,6 +1549,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
15491549
bool isGFX12(const MCSubtargetInfo &STI);
15501550
bool isGFX12Plus(const MCSubtargetInfo &STI);
15511551
bool isGFX1250(const MCSubtargetInfo &STI);
1552+
bool supportsWGP(const MCSubtargetInfo &STI);
15521553
bool isNotGFX12Plus(const MCSubtargetInfo &STI);
15531554
bool isNotGFX11Plus(const MCSubtargetInfo &STI);
15541555
bool isGCN3Encoding(const MCSubtargetInfo &STI);

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Lines changed: 51 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2854,89 +2854,90 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
28542854
; GFX1250: ; %bb.0:
28552855
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
28562856
; GFX1250-NEXT: s_wait_kmcnt 0x0
2857-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
2858-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
2857+
; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
28592858
; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
28602859
; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
2861-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2862-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
2863-
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
2864-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2860+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2861+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
2862+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
2863+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
2864+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2865+
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
28652866
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
2866-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
2867-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2867+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2868+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
28682869
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
2870+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
28692871
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
2870-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
2871-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2872-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
2872+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
2873+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
2874+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
28732875
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
2874-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
28752876
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
2876-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
2877-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2877+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2878+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
28782879
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
2880+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
28792881
; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
2880-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2881-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
2882-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
2883-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
2884-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2882+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
2883+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2884+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
2885+
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
28852886
; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
28862887
; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
28872888
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
2888-
; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
2889-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2890-
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
2889+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2890+
; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
2891+
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
2892+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
28912893
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
2892-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2893-
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
2894+
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
2895+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
28942896
; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
28952897
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
2896-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2897-
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
2898+
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
2899+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
28982900
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
2899-
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
2900-
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
2901-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2902-
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
2901+
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
2902+
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
2903+
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
2904+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
29032905
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
29042906
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
29052907
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
2906-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
29072908
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
2908-
; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
2909+
; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
29092910
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
2910-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
2911-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
2912-
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
2913-
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
2911+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2912+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
2913+
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
2914+
; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
29142915
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
29152916
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
2916-
; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
2917+
; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
29172918
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
2918-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
2919-
; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
2919+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
2920+
; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
29202921
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29212922
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
29222923
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
29232924
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2924-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
2925+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
29252926
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
29262927
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2927-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
2928-
; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
2929-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
2928+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2
2929+
; GFX1250-NEXT: v_mov_b32_e32 v2, v15
2930+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
29302931
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2931-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
2932-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
2932+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4
2933+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3
29332934
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2934-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
2935-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
2935+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1
2936+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0
29362937
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2937-
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
2938-
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
2939-
; GFX1250-NEXT: v_mov_b32_e32 v0, v16
2938+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
2939+
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
2940+
; GFX1250-NEXT: v_mov_b32_e32 v1, v14
29402941
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
29412942
%result = mul i256 %num, %den
29422943
ret i256 %result

0 commit comments

Comments
 (0)