From 3574456e98da8b83b2da95e5cbac49d3c3e915ee Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Sat, 13 Sep 2025 22:04:51 +0000 Subject: [PATCH 1/9] [GlobalISel] add KnownBits tracking for G_UMULH and G_SMULH --- .../CodeGen/GlobalISel/GISelValueTracking.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 0cf44e02254de..1b8d192e53fcd 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -366,6 +366,22 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, Known = KnownBits::mul(Known, Known2); break; } + case TargetOpcode::G_UMULH: { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + Known = KnownBits::mulhu(Known, Known2); + break; + } + case TargetOpcode::G_SMULH: { + computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, + Depth + 1); + computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, + Depth + 1); + Known = KnownBits::mulhs(Known, Known2); + break; + } case TargetOpcode::G_SELECT: { computeKnownBitsMin(MI.getOperand(2).getReg(), MI.getOperand(3).getReg(), Known, DemandedElts, Depth + 1); From 468f4e48dc203f836cba188889ee7b214d3d4023 Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Sun, 14 Sep 2025 20:59:09 +0000 Subject: [PATCH 2/9] [GlobalISel] add GISelValueTracking tests for G_UMULH and G_SMULH --- .../AArch64/GlobalISel/knownbits-smulh.mir | 137 ++++++++++++++++++ .../AArch64/GlobalISel/knownbits-umulh.mir | 137 ++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir diff --git 
a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir new file mode 100644 index 0000000000000..b9cde9587c78c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir @@ -0,0 +1,137 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64 -passes='print<gisel-value-tracking>' -filetype=null %s 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.0: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:00010011 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:11111101 SignBits:6 + %0:_(s8) = G_CONSTANT i8 19 + %1:_(s8) = G_CONSTANT i8 224 + %2:_(s8) = G_SMULH %0, %1 +... +--- +name: CstZero +body: | + bb.0: + ; CHECK-LABEL: name: @CstZero + ; CHECK-NEXT: %0:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 255 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_SMULH %0, %1 +... +--- +name: ScalarVar +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_SMULH %0, %1 +... +--- +name: ScalarZero +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarZero + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_SMULH %0, %1 +... +--- +name: ScalarVarAbs +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarVarAbs + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:????????????????
SignBits:9 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_ABS %0 + %2:_(s16) = G_SEXT %1 + %3:_(s16) = G_CONSTANT i16 1 + %4:_(s16) = G_SMULH %2, %3 +... +--- +name: SplatVecCst +body: | + bb.0: + ; CHECK-LABEL: name: @SplatVecCst + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %3:_(<vscale x 16 x s8>) = G_SMULH %1, %2 +... +--- +name: SplatVecPartScalar +body: | + bb.0: + ; CHECK-LABEL: name: @SplatVecPartScalar + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %6:_ KnownBits:???????? SignBits:1 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %2:_(s8) = G_IMPLICIT_DEF + %3:_(s8) = G_CONSTANT i8 15 + %4:_(s8) = G_AND %2, %3 + %5:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %4(s8) + %6:_(<vscale x 16 x s8>) = G_SMULH %1, %5 +... +--- +name: VecCst +body: | + bb.0: + ; CHECK-LABEL: name: @VecCst + ; CHECK-NEXT: %0:_ KnownBits:00011001 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:11100001 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:?????001 SignBits:3 + ; CHECK-NEXT: %3:_ KnownBits:?????001 SignBits:3 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:1 + %0:_(s8) = G_CONSTANT i8 25 + %1:_(s8) = G_CONSTANT i8 225 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8) + %3:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8) + %4:_(<2 x s8>) = G_SMULH %2, %3 +...
+--- +name: VecPartScalar +body: | + bb.0: + ; CHECK-LABEL: name: @VecPartScalar + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %6:_ KnownBits:???????? SignBits:1 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %0:_(s8) + %2:_(s8) = G_IMPLICIT_DEF + %3:_(s8) = G_CONSTANT i8 15 + %4:_(s8) = G_AND %2, %3 + %5:_(<2 x s8>) = G_BUILD_VECTOR %4:_(s8), %4:_(s8) + %6:_(<2 x s8>) = G_SMULH %1, %5 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir new file mode 100644 index 0000000000000..debdbaaeecf5c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir @@ -0,0 +1,137 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64 -passes='print<gisel-value-tracking>' -filetype=null %s 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.0: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:00010011 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:00010000 SignBits:3 + %0:_(s8) = G_CONSTANT i8 19 + %1:_(s8) = G_CONSTANT i8 224 + %2:_(s8) = G_UMULH %0, %1 +... +--- +name: CstZero +body: | + bb.0: + ; CHECK-LABEL: name: @CstZero + ; CHECK-NEXT: %0:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 255 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_UMULH %0, %1 +... +--- +name: ScalarVar +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:????????
SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_UMULH %0, %1 +... +--- +name: ScalarZero +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarZero + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_UMULH %0, %1 +... +--- +name: ScalarVarAbs +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarVarAbs + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:9 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_ABS %0 + %2:_(s16) = G_SEXT %1 + %3:_(s16) = G_CONSTANT i16 1 + %4:_(s16) = G_UMULH %2, %3 +... +--- +name: SplatVecCst +body: | + bb.0: + ; CHECK-LABEL: name: @SplatVecCst + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %3:_ KnownBits:11110100 SignBits:4 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %3:_(<vscale x 16 x s8>) = G_UMULH %1, %2 +... +--- +name: SplatVecPartScalar +body: | + bb.0: + ; CHECK-LABEL: name: @SplatVecPartScalar + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %6:_ KnownBits:0000????
SignBits:4 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %2:_(s8) = G_IMPLICIT_DEF + %3:_(s8) = G_CONSTANT i8 15 + %4:_(s8) = G_AND %2, %3 + %5:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %4(s8) + %6:_(<vscale x 16 x s8>) = G_UMULH %1, %5 +... +--- +name: VecCst +body: | + bb.0: + ; CHECK-LABEL: name: @VecCst + ; CHECK-NEXT: %0:_ KnownBits:00011001 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:11100001 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:?????001 SignBits:3 + ; CHECK-NEXT: %3:_ KnownBits:?????001 SignBits:3 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:1 + %0:_(s8) = G_CONSTANT i8 25 + %1:_(s8) = G_CONSTANT i8 225 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8) + %3:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8) + %4:_(<2 x s8>) = G_UMULH %2, %3 +... +--- +name: VecPartScalar +body: | + bb.0: + ; CHECK-LABEL: name: @VecPartScalar + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %6:_ KnownBits:0000???? SignBits:4 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %0:_(s8) + %2:_(s8) = G_IMPLICIT_DEF + %3:_(s8) = G_CONSTANT i8 15 + %4:_(s8) = G_AND %2, %3 + %5:_(<2 x s8>) = G_BUILD_VECTOR %4:_(s8), %4:_(s8) + %6:_(<2 x s8>) = G_UMULH %1, %5 +...
From 27526494f9ab2b4e7a7b4b44ad85c2b6cd2e0143 Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Sun, 14 Sep 2025 21:22:37 +0000 Subject: [PATCH 3/9] [GlobalISel] update CodeGen/AArch64/pr58431.ll to agree with KnownBits changes --- llvm/test/CodeGen/AArch64/pr58431.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll index 467ceb062f249..a37300432bca0 100644 --- a/llvm/test/CodeGen/AArch64/pr58431.ll +++ b/llvm/test/CodeGen/AArch64/pr58431.ll @@ -9,7 +9,7 @@ define i32 @f(i64 %0) { ; CHECK-NEXT: mov w10, #10 // =0xa ; CHECK-NEXT: eor x8, x8, #0x8000000000000003 ; CHECK-NEXT: umulh x8, x9, x8 -; CHECK-NEXT: msub x0, x8, x10, x9 +; CHECK-NEXT: umsubl x0, w8, w10, x9 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %2 = trunc i64 %0 to i32 From 25c37de60e54370ee948c59cf86abf8666e55ef4 Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Fri, 19 Sep 2025 12:03:14 +0000 Subject: [PATCH 4/9] [GlobalISel] Update AMDGPU tests after G_UMULH/G_SMULH knownbits change --- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 325 ++++++++--------- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 344 +++++++++--------- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 108 +++--- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 32 +- 4 files changed, 388 insertions(+), 421 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index f57fc005b994b..c688db7c7bff3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2534,202 +2534,195 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1 -; GISEL-NEXT: 
v_mac_f32_e32 v3, 0x4f800000, v5 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v7, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_trunc_f32_e32 v5, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; 
GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 ; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, 
v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 +; GISEL-NEXT: v_trunc_f32_e32 v8, v6 +; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: 
v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_mov_b32_e32 v2, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, 
v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1 -; GISEL-NEXT: v_add_i32_e64 
v5, s[4:5], v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; 
GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v1, 
v4, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc @@ -2740,8 +2733,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2752,8 +2745,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 19dc20c510041..682c5bccbe844 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3029,203 +3029,193 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 
0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v3, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, 
vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v3, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 +; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, 
v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v0, 
vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1] -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7] -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: 
v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 
1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v11, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 ; GISEL-NEXT: v_add_i32_e64 
v4, s[4:5], v5, v4 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0 -; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; 
GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: 
v_mul_hi_u32 v6, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 4de10788a6bd7..ded985ec3a1ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2053,90 +2053,82 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 ; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v15, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v2, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 ; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v15 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; 
GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v1, v2 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v17, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v2 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 
v8, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v17 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], 0, v9, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18 -; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11] -; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9] -; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v14 +; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], 1, v18 +; GISEL-NEXT: v_addc_u32_e64 v14, s[10:11], 0, v19, s[10:11] +; GISEL-NEXT: v_sub_i32_e64 v8, s[10:11], 0, v8 +; GISEL-NEXT: v_sub_i32_e64 v9, s[10:11], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v17, s[8:9] +; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc 
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v7, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v14, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index a41ec8e7ce3ea..be5543b9b5b7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2058,42 +2058,34 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; 
GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 -; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8 ; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, 
v2, vcc ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v10 ; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 From 7020b7892fe6e880e4315e5d1d82a935cbb057ae Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Fri, 19 Sep 2025 12:04:09 +0000 Subject: [PATCH 5/9] [GlobalISel] Early exit if the first arg is fully unknown in G_UMULH/G_SMULH --- llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 1b8d192e53fcd..ca6c7c56d6a01 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -369,6 +369,7 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, case TargetOpcode::G_UMULH: { computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, Depth + 1); + if (Known.isUnknown()) break; computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, Depth + 1); Known = KnownBits::mulhu(Known, Known2); @@ -377,6 +378,7 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, case TargetOpcode::G_SMULH: { computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, Depth + 1); + if (Known.isUnknown()) break; computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, Depth + 1); Known = KnownBits::mulhs(Known, Known2); From 003acadca22a89841abf202a5f2c4bb20454fa2c Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Fri, 19 Sep 2025 12:22:24 +0000 Subject: [PATCH 6/9] [GlobalISel] clang-format --- llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index ca6c7c56d6a01..dd02df817df03 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -369,7 +369,8 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, case TargetOpcode::G_UMULH: { computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, Depth + 1); - if (Known.isUnknown()) break; + if (Known.isUnknown()) + break; computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, Depth + 1); Known = KnownBits::mulhu(Known, Known2); @@ -378,7 +379,8 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, case TargetOpcode::G_SMULH: { computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, Depth + 1); - if (Known.isUnknown()) break; + if (Known.isUnknown()) + break; computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, Depth + 1); Known = KnownBits::mulhs(Known, Known2); From 5263f16d26e90b5b492926b058c763892cab4c2d Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Sat, 20 Sep 2025 20:22:00 +0000 Subject: [PATCH 7/9] [GlobalISel] Update AMDGPU tests --- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 325 +++++++++-------- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 344 +++++++++--------- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 108 +++--- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 32 +- 4 files changed, 421 insertions(+), 388 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index c688db7c7bff3..f57fc005b994b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2534,195 +2534,202 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_trunc_f32_e32 v7, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, 
v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 ; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, 
v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, 
v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8] +; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 -; GISEL-NEXT: v_trunc_f32_e32 v8, v6 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 
v[6:7], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 -; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc +; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; 
GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: 
v_mul_hi_u32 v6, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 +; GISEL-NEXT: v_mov_b32_e32 v1, v6 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1 +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 
v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7 ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2] +; GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc @@ -2733,8 +2740,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2745,8 +2752,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 682c5bccbe844..19dc20c510041 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3029,193 +3029,203 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; 
GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; 
GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v3, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; 
GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 +; GISEL-NEXT: v_add_i32_e32 v7, 
vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7] +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v9, v4 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc +; GISEL-NEXT: 
v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, 
v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0 ; GISEL-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_add_i32_e32 
v4, vcc, v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0 +; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index ded985ec3a1ec..4de10788a6bd7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2053,82 +2053,90 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 ; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v15, 0, v2 -; GISEL-NEXT: v_mul_lo_u32 v2, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 ; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v15 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: 
v_mul_lo_u32 v10, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v1, v2 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v17, v1, v2 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc -; GISEL-NEXT: 
v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v17 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 -; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], 0, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v14 -; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], 1, v18 -; GISEL-NEXT: v_addc_u32_e64 v14, s[10:11], 0, v19, s[10:11] -; GISEL-NEXT: v_sub_i32_e64 v8, s[10:11], 0, v8 -; GISEL-NEXT: v_sub_i32_e64 v9, s[10:11], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v17, s[8:9] -; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18 +; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11] +; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2 +; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, 
vcc +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9] +; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v9, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v7, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v14, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index be5543b9b5b7f..a41ec8e7ce3ea 100644 
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2058,34 +2058,42 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; GISEL-NEXT: 
v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9 ; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v10 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 ; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 From b9ac7ff04faa188f4a1f0e66668dd730c114445e Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Sun, 21 Sep 2025 15:44:20 +0000 Subject: [PATCH 8/9] [GlobalISel] Revert "[GlobalISel] Update AMDGPU tests" This reverts commit 5263f16d26e90b5b492926b058c763892cab4c2d. --- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 325 ++++++++--------- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 344 +++++++++--------- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 108 +++--- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 32 +- 4 files changed, 388 insertions(+), 421 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index f57fc005b994b..c688db7c7bff3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2534,202 +2534,195 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: 
v_subb_u32_e64 v11, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v7, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_trunc_f32_e32 v5, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, 
v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 ; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; 
GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 
1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 +; GISEL-NEXT: v_trunc_f32_e32 v8, v6 +; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 
v[6:7], s[4:5], v14, v12, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_mov_b32_e32 v2, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 ; 
GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; 
GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc +; 
GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 ; 
GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc @@ -2740,8 +2733,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2752,8 +2745,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 19dc20c510041..682c5bccbe844 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3029,203 +3029,193 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: 
v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v3, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: 
v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v3, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 +; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: 
v_mul_lo_u32 v3, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0 -; 
GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1] -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7] -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, 
vcc, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 
-; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v11, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0 -; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, 
v4 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 +; GISEL-NEXT: 
v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[5:6], 
s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 4de10788a6bd7..ded985ec3a1ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2053,90 +2053,82 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 ; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v15, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v2, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 ; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v15 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; 
GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v1, v2 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v17, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v2 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v8, 
vcc, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v17 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], 0, v9, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18 -; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11] -; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9] -; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v14 +; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], 1, v18 +; GISEL-NEXT: v_addc_u32_e64 v14, s[10:11], 0, v19, s[10:11] +; GISEL-NEXT: v_sub_i32_e64 v8, s[10:11], 0, v8 +; GISEL-NEXT: v_sub_i32_e64 v9, s[10:11], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v17, s[8:9] +; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v7, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v14, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index a41ec8e7ce3ea..be5543b9b5b7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2058,42 +2058,34 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 ; GISEL-NEXT: v_add_i32_e32 
v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 -; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8 ; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 
v0, v11 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v10 ; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 From 31fa2f645c776e2abcca60381e71cc4360664711 Mon Sep 17 00:00:00 2001 From: Pragyansh Chaturvedi Date: Sun, 21 Sep 2025 15:47:12 +0000 Subject: [PATCH 9/9] [GlobalIsel] Revert "[GlobalISel] Early exit if the first arg is fully unknown in G_UMULH/G_SMULH" This reverts commit 7020b7892fe6e880e4315e5d1d82a935cbb057ae. --- llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index dd02df817df03..1b8d192e53fcd 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -369,8 +369,6 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, case TargetOpcode::G_UMULH: { computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, Depth + 1); - if (Known.isUnknown()) - break; computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, Depth + 1); Known = KnownBits::mulhu(Known, Known2); @@ -379,8 +377,6 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known, case TargetOpcode::G_SMULH: { computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts, Depth + 1); - if (Known.isUnknown()) - break; computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts, Depth + 1); Known = KnownBits::mulhs(Known, Known2);