From 13c2599e3991d82bfce685af13777af7b0ff8d1a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 30 Oct 2024 14:49:55 -0700 Subject: [PATCH 1/5] [GISel][AArch64][AMDGPU][RISCV] Canonicalize (sub X, C) -> (add X, -C) This matches InstCombine and DAGCombine. RISC-V only has an ADDI instruction so without this we need additional patterns to do the conversion. Some of the AMDGPU tests look like possible regressions. --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + .../include/llvm/Target/GlobalISel/Combine.td | 12 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 22 + llvm/lib/Target/RISCV/RISCVGISel.td | 9 - .../AArch64/GlobalISel/combine-integer.mir | 4 +- .../GlobalISel/combine-narrow-binop.mir | 8 +- ...ercombiner-extending-loads-cornercases.mir | 4 +- .../prelegalizercombiner-trivial-arith.mir | 4 +- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 10 +- .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 7 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 560 ++++----- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 627 +++++----- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 18 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll | 70 +- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 10 +- .../CodeGen/AMDGPU/GlobalISel/srem.i32.ll | 96 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 1066 +++++++++-------- .../CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll | 34 +- .../CodeGen/AMDGPU/GlobalISel/urem.i32.ll | 48 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 8 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 28 +- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 280 ++--- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 72 +- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 144 +-- .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 2 +- .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 44 +- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 1018 +++++++++------- .../CodeGen/RISCV/GlobalISel/alu-roundtrip.ll | 18 +- .../instruction-select/alu-rv32.mir | 5 +- 
.../instruction-select/alu-rv64.mir | 10 +- .../jump-table-brjt-medium-rv64.mir | 4 +- .../jump-table-brjt-pic-rv32.mir | 4 +- .../jump-table-brjt-pic-rv64.mir | 4 +- .../jump-table-brjt-rv32.mir | 4 +- .../jump-table-brjt-small-rv64.mir | 4 +- 36 files changed, 2214 insertions(+), 2051 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 9240a3c3127eb..b09981eaef506 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -321,6 +321,9 @@ class CombinerHelper { bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); + // Transform a G_SUB with constant on the RHS to G_ADD. + bool matchCombineSubToAdd(MachineInstr &MI, BuildFnTy &MatchInfo); + // Transform a G_SHL with an extended source into a narrower shift if // possible. bool matchCombineShlOfExtend(MachineInstr &MI, RegisterImmPair &MatchData); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index ead4149fc1106..9891db5ceb6fa 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -335,6 +335,13 @@ def mul_to_shl : GICombineRule< [{ return Helper.matchCombineMulToShl(*${mi}, ${matchinfo}); }]), (apply [{ Helper.applyCombineMulToShl(*${mi}, ${matchinfo}); }])>; +// (sub x, C) -> (add x, -C) +def sub_to_add : GICombineRule< + (defs root:$d, build_fn_matchinfo:$matchinfo), + (match (G_SUB $d, $op1, $op2):$mi, + [{ return Helper.matchCombineSubToAdd(*${mi}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFnNoErase(*${mi}, ${matchinfo}); }])>; + // shl ([asz]ext x), y => zext (shl x, y), if shift does not overflow int def reduce_shl_of_extend_matchdata : GIDefMatchData<"RegisterImmPair">; def reduce_shl_of_extend : GICombineRule< @@ -1903,8 +1910,9 @@ def 
bitreverse_shift : GICombineGroup<[bitreverse_shl, bitreverse_lshr]>; def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp, select_to_iminmax, match_selects]>; -def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd, - mul_by_neg_one, idempotent_prop]>; +def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, sub_to_add, + add_p2i_to_ptradd, mul_by_neg_one, + idempotent_prop]>; def fma_combines : GICombineGroup<[combine_fadd_fmul_to_fmad_or_fma, combine_fadd_fpext_fmul_to_fmad_or_fma, combine_fadd_fma_fmul_to_fmad_or_fma, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b7ddf9f479ef8..91e5af9dfd8e2 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2041,6 +2041,28 @@ void CombinerHelper::applyCombineMulToShl(MachineInstr &MI, Observer.changedInstr(MI); } +bool CombinerHelper::matchCombineSubToAdd(MachineInstr &MI, + BuildFnTy &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SUB && "Expected a G_SUB"); + auto MaybeImmVal = + getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + if (!MaybeImmVal) + return false; + + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + + APInt NegImm = -MaybeImmVal->Value; + MatchInfo = [=, &MI](MachineIRBuilder &B) { + auto NegCst = B.buildConstant(Ty, NegImm); + Observer.changingInstr(MI); + MI.setDesc(B.getTII().get(TargetOpcode::G_ADD)); + MI.getOperand(2).setReg(NegCst.getReg(0)); + MI.clearFlag(MachineInstr::MIFlag::NoUWrap); + Observer.changedInstr(MI); + }; + return true; +} + // shl ([sza]ext x), y => zext (shl x, y), if shift does not overflow source bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI, RegisterImmPair &MatchData) { diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 67e93b812421b..40aae220fbd47 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ 
b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -96,15 +96,6 @@ def gi_sh2add_uw_op : GIComplexOperandMatcher">, def gi_sh3add_uw_op : GIComplexOperandMatcher">, GIComplexPatternEquiv; -// FIXME: Canonicalize (sub X, C) -> (add X, -C) earlier. -def : Pat<(XLenVT (sub GPR:$rs1, simm12Plus1:$imm)), - (ADDI GPR:$rs1, (NegImm simm12Plus1:$imm))>; - -let Predicates = [IsRV64] in { -def : Pat<(i32 (sub GPR:$rs1, simm12Plus1i32:$imm)), - (ADDIW GPR:$rs1, (i64 (NegImm $imm)))>; -} - // Ptr type used in patterns with GlobalISelEmitter def PtrVT : PtrValueTypeByHwMode; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir index 2f10a497fa74c..5cbff0f0c74cb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir @@ -308,8 +308,8 @@ body: | ; CHECK: liveins: $w0, $w1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %a:_(s64) = COPY $x0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 71 - ; CHECK-NEXT: %sub:_(s64) = G_SUB %a, [[C]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -71 + ; CHECK-NEXT: %sub:_(s64) = G_ADD %a, [[C]] ; CHECK-NEXT: $x0 = COPY %sub(s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 %a:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir index f207e9c149a47..e9d4af7da5d06 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir @@ -88,8 +88,8 @@ body: | ; CHECK-LABEL: name: test_combine_trunc_sub_i128 ; CHECK: %lhs:_(s128) = COPY $q0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s128) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 - ; CHECK-NEXT: %small:_(s32) = G_SUB [[TRUNC]], [[C]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5 + ; CHECK-NEXT: %small:_(s32) = G_ADD [[TRUNC]], [[C]] ; CHECK-NEXT: $w0 = 
COPY %small(s32) %lhs:_(s128) = COPY $q0 %rhs:_(s128) = G_CONSTANT i128 5 @@ -103,8 +103,8 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_trunc_sub_i128_multi_use ; CHECK: %lhs:_(s128) = COPY $q0 - ; CHECK-NEXT: %rhs:_(s128) = G_CONSTANT i128 5 - ; CHECK-NEXT: %res:_(s128) = G_SUB %lhs, %rhs + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 -5 + ; CHECK-NEXT: %res:_(s128) = G_ADD %lhs, [[C]] ; CHECK-NEXT: %small:_(s32) = G_TRUNC %res(s128) ; CHECK-NEXT: $q0 = COPY %res(s128) ; CHECK-NEXT: $w0 = COPY %small(s32) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir index 04968dab3a37c..591b6a17928cb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir @@ -95,7 +95,7 @@ body: | %11:_(s8) = G_CONSTANT i8 1 ; CHECK: [[T3:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32) %7:_(s8) = G_SUB %2, %11 - ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_SUB [[T3]], {{.*}} + ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_ADD [[T3]], {{.*}} G_BR %bb.3.exit bb.3.exit: ; CHECK: bb.3.exit: @@ -197,7 +197,7 @@ body: | %7:_(s8) = G_CONSTANT i8 1 ; CHECK: [[T3:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32) %8:_(s8) = G_SUB %2, %7 - ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_SUB [[T3]], {{.*}} + ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_ADD [[T3]], {{.*}} G_BR %bb.3.exit bb.3.exit: ; CHECK: bb.3.exit: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir index 0900dd4267a2e..bc3be691bd25a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir @@ -289,8 +289,8 @@ body: | ; CHECK: liveins: $w0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %x:_(s32) 
= COPY $w0 - ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: %op:_(s32) = G_SUB %x, %cst + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]] ; CHECK-NEXT: $w0 = COPY %op(s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %x:_(s32) = COPY $w0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 63f5464371cc6..493e8cef63890 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1669,7 +1669,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_ashr_i64 v[10:11], v[4:5], v3 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v5 @@ -1692,7 +1692,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_ashrrev_i64 v[10:11], v3, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v5 @@ -1715,7 +1715,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_ashrrev_i64 v[10:11], v3, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v5 @@ -1735,7 +1735,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v4, v2, 0, 1 ; GFX10-NEXT: 
v_sub_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 @@ -1758,7 +1758,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_bfe_i32 v4, v2, 0, 1 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 405b1e8f3a250..46d6b86789c77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -1438,7 +1438,7 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) { ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; SI-NEXT: v_ffbh_i32_e32 v3, 0 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 -; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v3 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_min_u32_e32 v2, v3, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 @@ -1456,7 +1456,7 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) { ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; VI-NEXT: v_ffbh_i32_e32 v3, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_min_u32_e32 v2, v3, v2 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll index 146f344930a4e..6e55d7fdb5e95 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -4101,7 +4101,7 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 { ; GFX10-NEXT: v_rcp_f32_e32 v1, 0x3f40e400 ; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v0 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 14, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, -14, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX10-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4112,10 +4112,9 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 { ; GFX11-NEXT: v_rcp_f32_e32 v1, 0x3f40e400 ; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v0 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 14, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: v_dual_mul_f32 v1, v2, v1 :: v_dual_add_nc_u32 v0, -14, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 3bd3486ec261d..5d76b542fad89 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -20,10 +20,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -7, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -7, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 @@ 
-51,10 +51,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -7, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -7, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 @@ -82,10 +82,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_add_u32_e32 v1, -7, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_add_u32_e32 v1, -7, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 @@ -113,10 +113,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 @@ -150,11 +150,11 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -189,10 +189,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -7, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -7, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 @@ -219,10 +219,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -7, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -7, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 @@ -249,10 +249,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_subrev_u32_e32 
v3, 7, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -7, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -7, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 @@ -279,10 +279,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 @@ -315,11 +315,11 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo @@ -1550,16 +1550,16 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX6-NEXT: 
v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 @@ -1580,16 +1580,16 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 @@ -1616,10 +1616,10 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: 
v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 @@ -1644,10 +1644,10 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 @@ -1678,11 +1678,11 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1710,16 +1710,16 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX6-NEXT: 
v_bfe_u32 v1, v1, 1, 23 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffffffe8, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 @@ -1740,16 +1740,16 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffffffe8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 @@ -1776,10 +1776,10 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; 
GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 @@ -1804,10 +1804,10 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 @@ -1838,11 +1838,11 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX11-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1887,7 +1887,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_lshr_b32 s0, s2, 
16 ; GFX6-NEXT: s_lshr_b32 s1, s3, 8 ; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 ; GFX6-NEXT: s_and_b32 s7, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s8, s8, 8 ; GFX6-NEXT: s_and_b32 s0, s0, 0xff @@ -1906,7 +1906,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: s_lshr_b32 s1, s4, 16 ; GFX6-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX6-NEXT: s_and_b32 s3, s4, 0xff ; GFX6-NEXT: s_lshl_b32 s7, s7, 8 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff @@ -1915,53 +1915,53 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s1, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 ; GFX6-NEXT: s_lshr_b32 s2, s5, 8 ; GFX6-NEXT: s_and_b32 s3, s5, 0xff -; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: v_mov_b32_e32 v5, s4 ; GFX6-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NEXT: v_alignbit_b32 v4, s3, v4, 24 +; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24 ; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX6-NEXT: v_mul_hi_u32 v2, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 24, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v3 +; GFX6-NEXT: v_or_b32_e32 v5, s2, v5 +; GFX6-NEXT: v_mul_hi_u32 v2, v5, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s1, v4 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, 24 -; GFX6-NEXT: 
v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 24, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 23, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_lshl_b32_e32 v3, s6, v3 -; GFX6-NEXT: v_lshr_b32_e32 v5, s0, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_lshl_b32_e32 v4, s6, v4 +; GFX6-NEXT: v_lshr_b32_e32 v6, s0, v6 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v2, v3, 8, 8 +; GFX6-NEXT: v_bfe_u32 v2, v4, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_bfe_u32 v2, v3, 16, 8 +; GFX6-NEXT: v_bfe_u32 v2, v4, 16, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 @@ -2021,7 +2021,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_not_b32_e32 v1, 23 ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s6 @@ -2031,67 +2031,67 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s4, s4, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s6 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_lshr_b32 s9, s5, 8 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s5, s5, s6 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s4, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX8-NEXT: v_cndmask_b32_e32 
v1, v1, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX8-NEXT: v_lshrrev_b32_e64 v3, v3, s0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: 
v_lshrrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v2 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -2172,10 +2172,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; 
GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2186,10 +2186,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, s0, v1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 @@ -2282,9 +2282,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: s_lshr_b32 s4, s3, 8 ; GFX10-NEXT: s_and_b32 s5, s9, 0xff ; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff @@ -2293,13 +2293,13 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: s_lshl_b32 s5, s5, 16 ; GFX10-NEXT: s_or_b32 s3, s10, s3 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: 
v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_or_b32 s3, s3, s4 @@ -2399,9 +2399,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_and_b32 s5, s8, 0xff ; GFX11-NEXT: s_lshr_b32 s4, s3, 8 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8 @@ -2410,7 +2410,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s5 ; GFX11-NEXT: s_or_b32 s3, s9, s3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 @@ -2423,7 +2423,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_lshr_b32 s3, s3, 1 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 @@ -2479,31 +2479,31 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX6-NEXT: v_mul_lo_u32 v7, v6, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX6-NEXT: v_mul_lo_u32 v8, v6, v7 +; 
GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX6-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX6-NEXT: v_mul_lo_u32 v8, v8, 24 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 23, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe8, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 @@ -2526,31 +2526,31 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX8-NEXT: v_mul_lo_u32 v7, v6, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v4, v6 +; 
GFX8-NEXT: v_mul_lo_u32 v8, v6, v7 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX8-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX8-NEXT: v_mul_lo_u32 v8, v8, 24 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 23, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v7 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 @@ -2583,21 +2583,21 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7 ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffe8, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; 
GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffe8, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, 24, v5 +; GFX9-NEXT: v_add_u32_e32 v7, 0xffffffe8, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v6, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v4, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 @@ -2627,15 +2627,15 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 @@ -2679,34 +2679,32 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v6 -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_nc_u32 v7, 0xffffffe8, v5 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 23, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4 ; 
GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) ret <2 x i24> %result @@ -6061,11 +6059,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, 0xffffffc0, v15 ; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 ; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 ; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v17 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 @@ -6082,8 +6080,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 ; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX6-NEXT: v_not_b32_e32 v16, 63 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14 -; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v16 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14 ; GFX6-NEXT: 
v_lshl_b64 v[6:7], v[2:3], v6 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14 @@ -6109,11 +6108,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffffc0, v15 ; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 @@ -6130,8 +6129,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] ; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX8-NEXT: v_not_b32_e32 v16, 63 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v16 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] @@ -6157,7 +6157,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15 -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v15 ; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] @@ -6178,7 +6178,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 ; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14 -; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: 
v_add_u32_e32 v15, 0xffffffc0, v14 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] @@ -6210,7 +6210,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX10-NEXT: v_add_nc_u32_e32 v20, 0xffffffc0, v18 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] @@ -6218,7 +6218,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX10-NEXT: v_or_b32_e32 v10, v10, v8 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v19 ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] ; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 @@ -6258,34 +6258,34 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] -; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0xffffffc0, v18 +; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo +; GFX11-NEXT: 
v_add_nc_u32_e32 v8, 0xffffffc0, v19 ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13] -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 -; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_cndmask_b32 v7, 0, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v5, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s0 @@ -6307,15 +6307,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 ; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, 
0xffffffc0, v7 ; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v8 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v9 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -6324,33 +6324,34 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v10 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v11 +; GFX6-NEXT: v_not_b32_e32 v8, 63 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v11, v8 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v8 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; 
GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_ssv: @@ -6359,15 +6360,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 ; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffffc0, v7 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -6376,33 +6377,34 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX8-NEXT: v_sub_u32_e32 v2, 
vcc, 64, v10 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v11 +; GFX8-NEXT: v_not_b32_e32 v8, 63 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v8, s[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v11, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_ssv: @@ -6411,7 +6413,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 @@ -6436,7 +6438,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: 
v_cndmask_b32_e32 v7, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 +; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] @@ -6471,12 +6473,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v12 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v13 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 @@ -6522,7 +6524,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v12 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v12 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] ; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 @@ -6531,7 +6533,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v13 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX11-NEXT: v_or_b32_e32 v7, 
v7, v9 @@ -7677,12 +7679,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX6-NEXT: v_not_b32_e32 v25, 63 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 ; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v25 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v26 ; GFX6-NEXT: v_or_b32_e32 v17, v17, v21 ; GFX6-NEXT: v_or_b32_e32 v18, v18, v22 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 @@ -7700,7 +7703,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 ; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23 -; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_add_i32_e32 v24, vcc, v23, v25 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 ; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23 @@ -7719,7 +7722,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 ; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 @@ -7741,7 +7744,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 ; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v14 -; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, 
v14, v25 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 ; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14 @@ -7768,12 +7771,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX8-NEXT: v_not_b32_e32 v25, 63 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v19, v25 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] ; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v17, v17, v21 ; GFX8-NEXT: v_or_b32_e32 v18, v18, v22 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 @@ -7791,7 +7795,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] ; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23 -; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v23, v25 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] @@ -7810,7 +7814,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 ; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] @@ -7832,7 +7836,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, 
v[14:15] ; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v25 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] ; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] @@ -7860,7 +7864,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] ; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] @@ -7881,7 +7885,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1 ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23 -; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v23 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] @@ -7900,7 +7904,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 ; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] @@ -7921,7 +7925,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] ; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14 -; GFX9-NEXT: 
v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] @@ -7956,13 +7960,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 +; GFX10-NEXT: v_add_nc_u32_e32 v29, 0xffffffc0, v27 ; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 +; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v28 ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 @@ -7999,10 +8003,10 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v22 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffc0, v24 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v22 ; GFX10-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] ; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] @@ -8049,19 +8053,19 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX11-NEXT: 
v_subrev_nc_u32_e32 v29, 64, v27 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v19, v1, v19 :: v_dual_cndmask_b32 v18, v0, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 0xffffffc0, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v18, v0, v18 :: v_dual_cndmask_b32 v19, v1, v19 ; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v28 ; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] @@ -8095,26 +8099,26 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v22 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffc0, v24 ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 ; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v22 ; GFX11-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] ; GFX11-NEXT: 
v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] ; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v22 ; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 ; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 ; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 58304d2072d7f..dbc8f12c2c25c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -21,10 +21,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -7, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -7, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 @@ -51,10 +51,10 @@ define amdgpu_ps i7 
@s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -7, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -7, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 @@ -81,10 +81,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_add_u32_e32 v1, -7, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_add_u32_e32 v1, -7, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 @@ -111,10 +111,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 @@ -147,11 +147,11 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -186,10 +186,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -7, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -7, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 @@ -216,10 +216,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -7, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -7, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 @@ -246,10 +246,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_add_u32_e32 
v3, -7, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -7, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 @@ -276,10 +276,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 @@ -312,11 +312,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1552,16 +1552,16 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffffff -; GFX6-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 @@ -1583,16 +1583,16 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 @@ -1620,10 +1620,10 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: 
v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 @@ -1649,10 +1649,10 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 @@ -1684,11 +1684,11 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1717,16 +1717,16 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; 
GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffffffe8, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 @@ -1748,16 +1748,16 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX8-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffffffe8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 @@ -1785,10 +1785,10 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: 
v_sub_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 @@ -1814,10 +1814,10 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 @@ -1849,11 +1849,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX11-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1888,7 +1888,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_lshr_b32 s1, s2, 16 ; 
GFX6-NEXT: s_lshr_b32 s7, s3, 8 ; GFX6-NEXT: s_bfe_u32 s10, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 ; GFX6-NEXT: s_or_b32 s8, s8, s9 ; GFX6-NEXT: s_and_b32 s9, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s10, s10, 8 @@ -1908,7 +1908,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_or_b32_e32 v1, s2, v1 ; GFX6-NEXT: s_lshr_b32 s2, s4, 16 ; GFX6-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX6-NEXT: s_and_b32 s7, s4, 0xff ; GFX6-NEXT: s_lshl_b32 s9, s9, 8 ; GFX6-NEXT: s_and_b32 s2, s2, 0xff @@ -1917,62 +1917,62 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s2, s7, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 ; GFX6-NEXT: s_lshr_b32 s3, s5, 8 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: v_mov_b32_e32 v5, s4 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NEXT: v_alignbit_b32 v4, s5, v4, 24 +; GFX6-NEXT: v_alignbit_b32 v5, s5, v5, 24 ; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: v_or_b32_e32 v4, s3, v4 -; GFX6-NEXT: v_mul_hi_u32 v2, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 24, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v3 +; GFX6-NEXT: v_or_b32_e32 v5, s3, v5 +; GFX6-NEXT: v_mul_hi_u32 v2, v5, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, 24 -; GFX6-NEXT: 
v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 24, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 23, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3 ; GFX6-NEXT: s_lshl_b32 s2, s6, 17 ; GFX6-NEXT: s_lshl_b32 s3, s8, 1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_lshl_b32_e32 v5, s2, v5 -; GFX6-NEXT: v_lshr_b32_e32 v3, s1, v3 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_lshl_b32_e32 v6, s2, v6 +; GFX6-NEXT: v_lshr_b32_e32 v4, s1, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 ; GFX6-NEXT: s_lshl_b32 s0, s0, 17 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; 
GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v2, v3, 8, 8 +; GFX6-NEXT: v_bfe_u32 v2, v4, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_bfe_u32 v2, v3, 16, 8 +; GFX6-NEXT: v_bfe_u32 v2, v4, 16, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 @@ -2024,7 +2024,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_not_b32_e32 v1, 23 ; GFX8-NEXT: s_or_b32 s3, s10, s3 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s8 @@ -2034,75 +2034,75 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: s_lshr_b32 s10, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_lshr_b32 s11, s5, 8 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24 ; GFX8-NEXT: s_and_b32 s8, s11, 0xff ; GFX8-NEXT: s_or_b32 s5, s10, s5 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: 
s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s5, s5, s8 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s4, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 ; GFX8-NEXT: s_lshl_b32 s4, s6, 17 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: 
v_sub_u32_e32 v2, vcc, 23, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 ; GFX8-NEXT: s_lshl_b32 s0, s7, 17 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s3 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v2 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -2175,11 +2175,11 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 @@ -2193,10 +2193,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, s0, v2, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 @@ -2294,23 +2294,23 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: s_and_b32 s4, s11, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: s_and_b32 s4, s13, 0xff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 
24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v1 ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: s_lshl_b32 s4, s7, 17 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_or_b32 s0, s4, s0 @@ -2393,69 +2393,67 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX11-NEXT: s_lshr_b32 s13, s3, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff +; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-NEXT: s_or_b32 s3, s12, s3 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_lshr_b32 s13, s3, 8 -; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_and_b32 s13, s13, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_or_b32 s3, s12, s3 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0 ; GFX11-NEXT: s_lshl_b32 s4, s10, 8 ; GFX11-NEXT: s_and_b32 s10, 0xffff, s13 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 
0xffffffe8, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: s_lshl_b32 s4, s9, 16 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-NEXT: s_lshl_b32 s5, s10, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: s_lshl_b32 s4, s7, 17 -; GFX11-NEXT: s_lshl_b32 s5, s10, 16 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v1 ; GFX11-NEXT: s_or_b32 s0, s4, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_add_nc_u32 v2, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 -; GFX11-NEXT: s_or_b32 s2, s3, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX11-NEXT: s_or_b32 s2, s3, s5 ; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s2 ; GFX11-NEXT: s_lshl_b32 s0, s8, 17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v0, s0, v2, v0 ; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -2491,32 +2489,32 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX6-NEXT: 
v_mul_lo_u32 v8, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX6-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX6-NEXT: v_mul_lo_u32 v8, v8, 24 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffffff, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 23, v4 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffffff, v8 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe8, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 @@ -2540,32 +2538,32 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX8-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX8-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX8-NEXT: v_mul_lo_u32 v8, v8, 24 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffffff, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 23, v4 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffffff, v8 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, v7, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 @@ -2599,10 
+2597,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7 ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffe8, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffe8, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 @@ -2610,10 +2608,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v5 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v5 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v4, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 @@ -2645,15 +2643,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: 
v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 @@ -2675,12 +2673,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX11-NEXT: v_dual_mul_f32 v6, 0x4f7ffffe, v6 :: v_dual_lshlrev_b32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX11-NEXT: v_mul_lo_u32 v7, 0xffffffe8, v6 @@ -2697,34 +2694,33 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v6 -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_nc_u32 v7, 0xffffffe8, v5 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 23, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v0, v0, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v1, v1, v4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) @@ 
-6087,13 +6083,14 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_not_b32_e32 v0, v8 ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15 +; GFX6-NEXT: v_not_b32_e32 v16, 63 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0 ; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v15, v16 ; GFX6-NEXT: v_lshl_b64 v[13:14], v[9:10], v15 ; GFX6-NEXT: v_or_b32_e32 v11, v0, v11 ; GFX6-NEXT: v_or_b32_e32 v12, v1, v12 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v17 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc @@ -6106,7 +6103,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v14 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 -; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v16 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], v15 @@ -6135,13 +6132,14 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_not_b32_e32 v0, v8 ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15 +; GFX8-NEXT: v_not_b32_e32 v16, 63 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] ; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v15, v16 ; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] ; GFX8-NEXT: v_or_b32_e32 v11, v0, v11 ; GFX8-NEXT: v_or_b32_e32 v12, v1, v12 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[9:10] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v14, 
vcc @@ -6154,7 +6152,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v14 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v16 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v15, v[6:7] @@ -6185,7 +6183,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] ; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v15 ; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] ; GFX9-NEXT: v_or_b32_e32 v11, v0, v11 ; GFX9-NEXT: v_or_b32_e32 v12, v1, v12 @@ -6202,7 +6200,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v15, v[6:7] @@ -6232,9 +6230,9 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v9 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 +; GFX10-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v19 ; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX10-NEXT: v_add_nc_u32_e32 v20, 0xffffffc0, v18 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] @@ -6273,47 +6271,48 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { 
; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_not_b32_e32 v9, v8 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v9 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 ; GFX11-NEXT: v_lshlrev_b64 v[14:15], v18, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0xffffffc0, v18 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX11-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 -; GFX11-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v19 +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] ; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] ; GFX11-NEXT: v_or_b32_e32 v12, v12, v16 ; GFX11-NEXT: v_or_b32_e32 v13, 
v13, v17 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v15, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v13, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v15, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v14, v4 ; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -6335,46 +6334,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_not_b32_e32 v8, 63 ; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1 ; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v8 ; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v8 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v9 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: 
v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_mov_b32_e32 v3, s0 ; GFX6-NEXT: v_mov_b32_e32 v4, s1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v10 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v11 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v11 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[6:7], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v11, v8 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v8 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_ssv: @@ -6387,46 +6387,47 @@ define 
amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_not_b32_e32 v8, 63 ; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v8 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v9, s[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v11 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v8, s[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v11, s[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_ssv: @@ -6441,7 +6442,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 @@ -6460,7 +6461,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 +; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] @@ -6492,10 +6493,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 
v[0:1], v12, s[8:9] -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v12 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] @@ -6544,11 +6545,11 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] @@ -7718,13 +7719,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_not_b32_e32 v0, v16 ; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 +; GFX6-NEXT: v_not_b32_e32 v25, 63 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25 ; GFX6-NEXT: v_lshl_b64 v[23:24], v[17:18], v19 ; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 ; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v25 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v26 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 ; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc @@ -7737,7 +7739,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22 ; GFX6-NEXT: 
v_lshl_b64 v[2:3], v[10:11], v2 -; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v22 +; GFX6-NEXT: v_add_i32_e32 v24, vcc, v22, v25 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 @@ -7761,7 +7763,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 ; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 @@ -7778,7 +7780,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v18 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 -; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 ; GFX6-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v5, v7 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v19 @@ -7809,13 +7811,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_not_b32_e32 v0, v16 ; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 +; GFX8-NEXT: v_not_b32_e32 v25, 63 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v19, v25 ; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] ; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 ; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[17:18] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 ; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc @@ -7828,7 +7831,7 @@ 
define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v22 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v22, v25 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] @@ -7852,7 +7855,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 @@ -7869,7 +7872,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 ; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] @@ -7902,7 +7905,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 ; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] ; GFX9-NEXT: v_or_b32_e32 v21, v0, v21 ; GFX9-NEXT: v_or_b32_e32 v22, v1, v22 @@ -7919,7 +7922,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 
v[0:1], v22, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v22 +; GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v22 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] @@ -7942,7 +7945,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 @@ -7960,7 +7963,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 ; GFX9-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] @@ -7991,11 +7994,11 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 +; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26 ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX10-NEXT: v_subrev_nc_u32_e32 v19, 64, v25 +; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25 ; GFX10-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 ; GFX10-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] @@ -8035,12 +8038,12 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, 
<2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v23 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v25 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffc0, v25 ; GFX10-NEXT: v_or_b32_e32 v2, v18, v2 ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v25, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] ; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v23 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v23 ; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 ; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] @@ -8091,41 +8094,41 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v25 ; GFX11-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v19, 64, v25 -; GFX11-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v26 ; GFX11-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25 ; GFX11-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26 ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v22, v1, v22 :: v_dual_cndmask_b32 v21, v0, v21 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 ; GFX11-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX11-NEXT: v_or_b32_e32 v17, v17, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v16, s0 ; GFX11-NEXT: v_not_b32_e32 v16, v20 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s0 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v9 :: v_dual_cndmask_b32 v0, v0, v8 +; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 31, v5 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v25 ; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, v3, s0 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffc0, v25 ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 ; 
GFX11-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] @@ -8143,7 +8146,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_cndmask_b32 v11, 0, v16 :: v_dual_cndmask_b32 v10, v3, v10 ; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v23 -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v23 ; GFX11-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v23 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 5dd4fa0809131..cc185aff9eff2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1577,7 +1577,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v3 @@ -1599,7 +1599,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] @@ -1621,7 +1621,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, 
v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] @@ -1643,7 +1643,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v3 @@ -1664,20 +1664,20 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] ; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 ; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] ; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, %amount ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index 2c2f8e914447d..88eb0e4b848c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -325,7 +325,7 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 @@ -353,29 +353,29 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v3 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v0, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] -; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 -; GISEL-NEXT: 
v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v11, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 0xffed2705, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 @@ -398,29 +398,29 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v7, v4, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, v3, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v3, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v10, 
s[6:7] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 0xffed2705, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v7 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 4cf1c92539c36..b12e915c7d21b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1583,7 +1583,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v3 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v4 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[2:3], v3 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v3 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], v3 ; GFX6-NEXT: v_or_b32_e32 v9, v4, v5 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], v8 @@ -1601,7 +1601,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v3 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v9, v4, v5 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1] @@ -1619,7 +1619,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX9-NEXT: v_sub_u32_e32 v4, 64, v3 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3] -; 
GFX9-NEXT: v_subrev_u32_e32 v8, 64, v3 +; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v3 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v9, v4, v5 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1] @@ -1636,7 +1636,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 64, v3 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3] -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v3 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX10-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] @@ -1654,7 +1654,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 64, v3 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v3 ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 1bb606f36e48d..2b12e4b973acb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -268,10 +268,10 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x1000, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x1000, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, 
v1 @@ -297,23 +297,23 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 12, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v7 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 12, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v3 -; GISEL-NEXT: v_subrev_i32_e32 v5, vcc, 0x1000, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v3 -; GISEL-NEXT: v_subrev_i32_e32 v5, vcc, 0x1000, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xfffff000, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 @@ -338,23 +338,23 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 +; CGP-NEXT: 
v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v4, 12, v4 +; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v7 ; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xfffff000, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 @@ -386,10 +386,10 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_mul_lo_u32 v2, v2, v4 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -415,23 +415,23 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; 
GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v3 ; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v3 -; GISEL-NEXT: v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v3 -; GISEL-NEXT: v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 @@ -456,23 +456,23 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v5 -; CGP-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 2572f8581f0ed..7214f4ab581d5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -12,9 +12,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -27,9 +27,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 
0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -63,9 +63,9 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 25 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -79,11 +79,11 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 9 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -122,9 +122,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -137,9 +137,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, 
v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -173,9 +173,9 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -189,11 +189,11 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -234,18 +234,19 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -266,16 +267,16 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_e32 v1, v4, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8000, v4 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 @@ -355,18 +356,18 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s4, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: 
s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -387,11 +388,11 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_max_i32 s6, s4, s5 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -401,11 +402,11 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s2, s3, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 ; GFX8-NEXT: s_max_i32 s4, s3, s5 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s3, 0x8000 ; GFX8-NEXT: s_max_i32 s2, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -509,29 +510,29 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v11 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 ; GFX6-NEXT: 
v_min_i32_e32 v1, v1, v10 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v11 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v3, v5, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -539,10 +540,10 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 @@ -573,34 +574,34 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 -; GFX8-NEXT: 
v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v9 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8000, v8 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 +; GFX8-NEXT: v_add_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_max_i16_e32 v5, -1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x7fff, v5 +; GFX8-NEXT: v_add_u16_e32 v5, 0x8001, v5 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 +; GFX8-NEXT: v_add_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v6 ; GFX8-NEXT: v_sub_u16_e32 v3, v3, v4 @@ -727,27 +728,27 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s8, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, 
s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s5, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s5, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 @@ -755,10 +756,10 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_max_i32 s5, s3, -1 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_ashr_i32 s2, s2, 24 @@ -789,11 +790,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff +; GFX8-NEXT: s_addk_i32 s10, 0x8001 ; GFX8-NEXT: s_min_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s10, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -803,11 +804,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, 
i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s2, s5, 8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 ; GFX8-NEXT: s_max_i32 s8, s5, s9 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s2, s8, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -817,11 +818,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshl_b32 s3, s6, 8 ; GFX8-NEXT: s_max_i32 s6, s5, s9 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -831,12 +832,12 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_sext_i32_i16 s5, s3 ; GFX8-NEXT: s_max_i32 s6, s5, s9 ; GFX8-NEXT: s_lshl_b32 s4, s7, 8 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, 8 @@ -1004,9 +1005,9 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; 
GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -1055,9 +1056,9 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -1109,9 +1110,9 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -1121,9 +1122,9 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000001, v2 ; GFX8-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x80000000, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v3 ; GFX8-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 @@ -1148,9 +1149,9 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s2, s0, -1 -; 
GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -1159,9 +1160,9 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s2, s0, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX8-NEXT: s_min_i32 s3, s0, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX8-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX8-NEXT: s_max_i32 s1, s2, s1 ; GFX8-NEXT: s_min_i32 s1, s1, s3 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 @@ -1187,9 +1188,9 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; GFX6-LABEL: ssubsat_i32_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s1, s0, -1 -; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 ; GFX6-NEXT: s_min_i32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 @@ -1198,9 +1199,9 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; GFX8-LABEL: ssubsat_i32_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s1, s0, -1 -; GFX8-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX8-NEXT: s_add_i32 s1, s1, 0x80000001 ; GFX8-NEXT: s_min_i32 s2, s0, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX8-NEXT: s_add_i32 s2, s2, 0x80000000 ; GFX8-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 @@ -1224,9 +1225,9 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; GFX6-LABEL: ssubsat_i32_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 
0x7fffffff, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000001, v1 ; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -1235,9 +1236,9 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; GFX8-LABEL: ssubsat_i32_vs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 0x7fffffff, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000001, v1 ; GFX8-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x80000000, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000000, v2 ; GFX8-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 @@ -1262,16 +1263,16 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -1281,16 +1282,16 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000001, v2 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x80000000, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000000, v4 ; GFX8-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 @@ -1317,16 +1318,16 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX6-LABEL: s_ssubsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s4, s0, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_max_i32 s2, s1, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s2, s3 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -1335,16 +1336,16 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX8-LABEL: s_ssubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s4, s0, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX8-NEXT: s_min_i32 s5, s0, -1 -; GFX8-NEXT: 
s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s2, s4, s2 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_max_i32 s2, s1, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX8-NEXT: s_min_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s4 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 @@ -1376,24 +1377,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x7fffffff, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0x80000001, v6 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x80000000, v8 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v8 -; GFX6-NEXT: v_bfrev_b32_e32 v7, -2 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -1403,24 
+1405,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x7fffffff, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x80000001, v6 ; GFX8-NEXT: v_min_i32_e32 v8, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x80000000, v8 +; GFX8-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v8 -; GFX8-NEXT: v_bfrev_b32_e32 v7, -2 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x80000001 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x7fffffff, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000001, v3 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x80000000, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000000, v4 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v5 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 @@ -1449,23 +1452,23 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX6-LABEL: s_ssubsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s6, s0, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000001 ; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s6, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s7 ; GFX6-NEXT: s_sub_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; 
GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s1, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s4 ; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX6-NEXT: s_min_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s5 ; GFX6-NEXT: s_min_i32 s3, s3, s4 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 @@ -1474,23 +1477,23 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX8-LABEL: s_ssubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s6, s0, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000001 ; GFX8-NEXT: s_min_i32 s7, s0, -1 -; GFX8-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX8-NEXT: s_add_i32 s7, s7, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_min_i32 s3, s3, s7 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX8-NEXT: s_min_i32 s6, s1, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX8-NEXT: s_min_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s2, s2, s3 @@ -1527,32 +1530,32 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX6-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v11 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v9 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x80000000, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v9 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v7 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 @@ -1562,32 +1565,32 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x7fffffff, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x80000001, v8 ; 
GFX8-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v11 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v10 -; GFX8-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX8-NEXT: v_mov_b32_e32 v9, 0x80000001 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 ; GFX8-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x80000000, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v11 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v6 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v7 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4 @@ -1618,30 +1621,30 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX6-LABEL: s_ssubsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s8, s0, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s8, s4 ; GFX6-NEXT: s_min_i32 s4, s4, 
s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s5 ; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s6 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s7 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 @@ -1650,30 +1653,30 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX8-LABEL: s_ssubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s8, s0, -1 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX8-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX8-NEXT: s_min_i32 s9, s0, -1 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX8-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s8, s4 ; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX8-NEXT: s_min_i32 s8, s1, -1 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX8-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: 
s_add_i32 s4, s4, 0x80000001 ; GFX8-NEXT: s_min_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX8-NEXT: s_min_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s7 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s3, s3, s4 @@ -1715,39 +1718,39 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0x80000001, v10 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v13 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 -; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v13 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000001, v5 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 @@ -1757,39 +1760,39 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, 0x7fffffff, v10 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x80000001, v10 ; GFX8-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v13 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v12 -; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX8-NEXT: v_mov_b32_e32 v11, 0x80000001 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v13 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000001, v5 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 @@ -1822,37 +1825,37 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX6-LABEL: s_ssubsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s10, s0, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000001 ; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX6-NEXT: s_add_i32 s11, s11, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s10, s5 ; GFX6-NEXT: s_min_i32 s5, s5, s11 ; GFX6-NEXT: s_sub_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s6 ; GFX6-NEXT: s_min_i32 s5, s5, s10 ; GFX6-NEXT: s_sub_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, 
s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s8 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s4, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s9 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s4, s4, s5 @@ -1861,37 +1864,37 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX8-LABEL: s_ssubsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s10, s0, -1 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX8-NEXT: s_add_i32 s10, s10, 0x80000001 ; GFX8-NEXT: s_min_i32 s11, s0, -1 -; GFX8-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX8-NEXT: s_add_i32 s11, s11, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s10, s5 ; GFX8-NEXT: s_min_i32 s5, s5, s11 ; GFX8-NEXT: s_sub_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX8-NEXT: s_min_i32 s10, s1, -1 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX8-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s6 ; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX8-NEXT: s_min_i32 s6, s2, -1 -; GFX8-NEXT: s_sub_i32 
s6, s6, 0x80000000 +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX8-NEXT: s_min_i32 s6, s3, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s8 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX8-NEXT: s_min_i32 s6, s4, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s9 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s4, s4, s5 @@ -1938,117 +1941,117 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v31, -2 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v32, v31 +; GFX6-NEXT: v_mov_b32_e32 v31, 0x80000001 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v31 ; GFX6-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX6-NEXT: v_min_i32_e32 v33, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 -; GFX6-NEXT: v_sub_i32_e32 v33, vcc, v33, v16 +; GFX6-NEXT: v_add_i32_e32 v33, vcc, v33, v16 ; GFX6-NEXT: v_min_i32_e32 v32, v32, v33 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v32 ; GFX6-NEXT: v_max_i32_e32 v32, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v32, v31 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v31 ; GFX6-NEXT: v_max_i32_e32 v17, v32, v17 ; GFX6-NEXT: v_min_i32_e32 v32, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v32, v16 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v32 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, 
v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v4 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v5 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v6 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX6-NEXT: v_sub_i32_e32 
v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v7 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v8 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v9 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v10 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v11 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; 
GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v15 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 @@ -2059,117 +2062,117 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v31, -2 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v32, v31 +; GFX8-NEXT: v_mov_b32_e32 v31, 0x80000001 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX8-NEXT: v_min_i32_e32 v33, -1, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 -; GFX8-NEXT: 
v_sub_u32_e32 v33, vcc, v33, v16 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; GFX8-NEXT: v_min_i32_e32 v32, v32, v33 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v32 ; GFX8-NEXT: v_max_i32_e32 v32, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v32, v31 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_max_i32_e32 v17, v32, v17 ; GFX8-NEXT: v_min_i32_e32 v32, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v32, v16 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v32 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v2 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v2 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v4 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v5 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v5 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; 
GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v6 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v6 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v7 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v8 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v9 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v10 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 
v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v11 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v15 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v19, v16 
; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 @@ -2252,114 +2255,114 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX6-LABEL: s_ssubsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s32, s0, -1 -; GFX6-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX6-NEXT: s_add_i32 s32, s32, 0x80000001 ; GFX6-NEXT: s_min_i32 s33, s0, -1 -; GFX6-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX6-NEXT: s_add_i32 s33, s33, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s32, s16 ; GFX6-NEXT: s_min_i32 s16, s16, s33 ; GFX6-NEXT: s_sub_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s32, s1, -1 -; GFX6-NEXT: s_sub_i32 s32, s32, 0x80000000 +; GFX6-NEXT: s_add_i32 s32, s32, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s17 ; GFX6-NEXT: s_min_i32 s16, s16, s32 ; GFX6-NEXT: s_sub_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s2, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s18 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s3, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s19 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s4, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s20 ; 
GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s5, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s21 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s6, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s22 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s7, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s23 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s8, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s24 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s9, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s25 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; 
GFX6-NEXT: s_min_i32 s17, s10, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s26 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s11, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s27 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s12, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s28 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s13, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s29 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s14, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s30 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s15, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s31 ; GFX6-NEXT: s_min_i32 
s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s15, s15, s16 @@ -2368,114 +2371,114 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX8-LABEL: s_ssubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s32, s0, -1 -; GFX8-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX8-NEXT: s_add_i32 s32, s32, 0x80000001 ; GFX8-NEXT: s_min_i32 s33, s0, -1 -; GFX8-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX8-NEXT: s_add_i32 s33, s33, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s32, s16 ; GFX8-NEXT: s_min_i32 s16, s16, s33 ; GFX8-NEXT: s_sub_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s32, s1, -1 -; GFX8-NEXT: s_sub_i32 s32, s32, 0x80000000 +; GFX8-NEXT: s_add_i32 s32, s32, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s17 ; GFX8-NEXT: s_min_i32 s16, s16, s32 ; GFX8-NEXT: s_sub_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s2, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s18 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s3, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s19 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s4, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s20 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s4, s4, 
s16 ; GFX8-NEXT: s_max_i32 s16, s5, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s5, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s21 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s6, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s22 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s7, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s23 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s8, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s24 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s9, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s25 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s10, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 
0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s26 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s11, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s27 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s12, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s28 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s13, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s29 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s14, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s30 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s15, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s31 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s15, s15, s16 @@ -2579,9 
+2582,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -2592,9 +2595,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -2621,9 +2624,9 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -2635,11 +2638,11 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s2, s0 ; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: 
s_addk_i32 s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -2669,9 +2672,9 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 ; GFX6-NEXT: s_min_i32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 @@ -2683,9 +2686,9 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s1, s0 ; GFX8-NEXT: s_sext_i32_i16 s2, -1 ; GFX8-NEXT: s_max_i32 s3, s1, s2 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fff +; GFX8-NEXT: s_addk_i32 s3, 0x8001 ; GFX8-NEXT: s_min_i32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s1, 0x8000 ; GFX8-NEXT: v_max_i16_e32 v0, s3, v0 ; GFX8-NEXT: v_min_i16_e32 v0, s1, v0 ; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0 @@ -2711,9 +2714,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000001, v1 ; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -2723,9 +2726,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX8-LABEL: ssubsat_i16_vs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 ; GFX8-NEXT: 
v_min_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -2752,18 +2755,19 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -2775,16 +2779,16 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v2, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, -1 ; GFX8-NEXT: v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 
v4, 0x7fff, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 ; GFX8-NEXT: v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 @@ -2813,18 +2817,18 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -2841,12 +2845,12 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_sext_i32_i16 s4, s0 ; GFX8-NEXT: s_sext_i32_i16 s5, -1 ; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -2855,11 +2859,11 @@ 
define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s2 ; GFX8-NEXT: s_max_i32 s4, s1, s5 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s1, 0x8000 ; GFX8-NEXT: s_max_i32 s3, s4, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 @@ -2894,18 +2898,18 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 ; GFX6-NEXT: s_min_i32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -2922,18 +2926,18 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s2, s0 ; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 ; 
GFX8-NEXT: v_min_i16_e32 v1, s2, v1 ; GFX8-NEXT: s_sext_i32_i16 s2, s1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 @@ -2962,18 +2966,20 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 +; GFX6-NEXT: v_min_i32_e32 v4, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 -; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 +; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -2988,17 +2994,17 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX8-LABEL: ssubsat_v2i16_vs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_add_u16_e32 v1, 
0x8001, v1 ; GFX8-NEXT: v_min_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, -1 ; GFX8-NEXT: v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8001, v3 ; GFX8-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v3, s1, v3 ; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 @@ -3038,38 +3044,38 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v11 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v11 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 
v1, vcc, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 @@ -3091,28 +3097,28 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v2 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, -1 ; GFX8-NEXT: v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 ; GFX8-NEXT: v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v6, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; 
GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 ; GFX8-NEXT: v_min_i16_e32 v7, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 ; GFX8-NEXT: v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8001, v7 ; GFX8-NEXT: v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v5 ; GFX8-NEXT: v_sub_u16_e32 v4, v0, v4 @@ -3147,36 +3153,36 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s8, s0, -1 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s8, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s9 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 
0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_max_i32 s5, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 @@ -3199,12 +3205,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s8, s0 ; GFX8-NEXT: s_sext_i32_i16 s9, -1 ; GFX8-NEXT: s_max_i32 s10, s8, s9 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff +; GFX8-NEXT: s_addk_i32 s10, 0x8001 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_min_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s2, s10, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -3213,11 +3219,11 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s4 ; GFX8-NEXT: s_max_i32 s8, s2, s9 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -3225,12 +3231,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_sub_i32 s2, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 ; GFX8-NEXT: s_max_i32 s6, s4, s9 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff 
+; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3239,11 +3245,11 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 ; GFX8-NEXT: s_max_i32 s4, s3, s9 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s3, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3299,57 +3305,57 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, 0x7fffffff, v12 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, 0x80000001, v12 ; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v15 +; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 +; GFX6-NEXT: v_mov_b32_e32 v13, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v15 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GFX6-NEXT: v_max_i32_e32 
v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 @@ -3376,40 +3382,40 @@ 
define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 ; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, -1 ; GFX8-NEXT: v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 ; GFX8-NEXT: v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v8, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 ; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 ; GFX8-NEXT: v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8001, v9 ; GFX8-NEXT: v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8000, v10 ; GFX8-NEXT: v_max_i16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v9, -1, v2 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v10 -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 +; 
GFX8-NEXT: v_add_u16_e32 v9, 0x8001, v9 ; GFX8-NEXT: v_min_i16_e32 v10, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8000, v10 ; GFX8-NEXT: v_max_i16_e32 v9, v9, v5 ; GFX8-NEXT: v_min_i16_e32 v9, v9, v10 ; GFX8-NEXT: v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 ; GFX8-NEXT: v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v7 ; GFX8-NEXT: v_sub_u16_e32 v6, v0, v6 @@ -3449,55 +3455,55 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s12, s0, -1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_sub_i32 s12, s12, 0x7fffffff +; GFX6-NEXT: s_add_i32 s12, s12, 0x80000001 ; GFX6-NEXT: s_min_i32 s13, s0, -1 -; GFX6-NEXT: s_sub_i32 s13, s13, 0x80000000 +; GFX6-NEXT: s_add_i32 s13, s13, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s12, s6 ; GFX6-NEXT: s_min_i32 s6, s6, s13 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s12, s1, -1 -; GFX6-NEXT: s_sub_i32 s12, s12, 0x80000000 +; GFX6-NEXT: s_add_i32 s12, s12, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s12 ; GFX6-NEXT: s_max_i32 s7, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s2, -1 
-; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s3, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s4, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s5, -1 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 @@ -3525,12 +3531,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s12, s0 ; GFX8-NEXT: s_sext_i32_i16 s13, -1 ; GFX8-NEXT: s_max_i32 s14, s12, s13 -; GFX8-NEXT: s_sub_i32 s14, s14, 0x7fff +; GFX8-NEXT: s_addk_i32 s14, 0x8001 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_min_i32 s12, s12, s13 ; GFX8-NEXT: s_sext_i32_i16 s14, s14 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s12, s12, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s12, 0x8000 ; GFX8-NEXT: 
s_max_i32 s3, s14, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 @@ -3539,11 +3545,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 ; GFX8-NEXT: s_max_i32 s12, s3, s13 -; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff +; GFX8-NEXT: s_addk_i32 s12, 0x8001 ; GFX8-NEXT: s_min_i32 s3, s3, s13 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s3, 0x8000 ; GFX8-NEXT: s_max_i32 s9, s12, s9 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3551,12 +3557,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sub_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s6, s1 ; GFX8-NEXT: s_max_i32 s9, s6, s13 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_addk_i32 s9, 0x8001 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s6, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3565,11 +3571,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s7 ; GFX8-NEXT: s_max_i32 s6, s4, s13 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s4, s4, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3577,12 +3583,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s6, s2 ; 
GFX8-NEXT: s_sub_i32 s4, s7, s4 ; GFX8-NEXT: s_max_i32 s7, s6, s13 -; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff +; GFX8-NEXT: s_addk_i32 s7, 0x8001 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s6, 0x8000 ; GFX8-NEXT: s_max_i32 s5, s7, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3591,11 +3597,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s8 ; GFX8-NEXT: s_max_i32 s6, s5, s13 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3648,66 +3654,66 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 +; GFX6-NEXT: v_mov_b32_e32 v17, 0x80000001 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; 
GFX6-NEXT: v_min_i32_e32 v16, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v19 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v16, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; 
GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 @@ -3715,10 +3721,10 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -3750,52 +3756,52 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 ; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 ; GFX8-NEXT: v_mov_b32_e32 v9, -1 ; GFX8-NEXT: v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 ; GFX8-NEXT: v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_add_u16_e32 v11, 0x8000, v11 ; GFX8-NEXT: v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 ; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_add_u16_e32 v11, 0x8000, v11 ; GFX8-NEXT: v_max_i16_e32 v10, v10, v5 ; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 ; GFX8-NEXT: v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_add_u16_e32 v11, 0x8001, v11 ; GFX8-NEXT: v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_add_u16_e32 v12, 0x8000, v12 ; GFX8-NEXT: v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_add_u16_e32 v11, 0x8001, v11 ; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_add_u16_e32 v12, 0x8000, v12 ; GFX8-NEXT: v_max_i16_e32 v11, v11, v6 ; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 ; GFX8-NEXT: v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_add_u16_e32 v12, 0x8001, v12 ; GFX8-NEXT: v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_add_u16_e32 v13, 0x8000, v13 ; GFX8-NEXT: v_max_i16_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v12, -1, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v13 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_add_u16_e32 v12, 0x8001, v12 ; GFX8-NEXT: v_min_i16_e32 v13, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_add_u16_e32 v13, 0x8000, v13 ; GFX8-NEXT: v_max_i16_e32 v12, v12, v7 ; GFX8-NEXT: v_min_i16_e32 v12, v12, v13 ; GFX8-NEXT: v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_add_u16_e32 v13, 0x8001, v13 ; GFX8-NEXT: v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v8, v0, v8 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3840,63 +3846,63 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s16, s0, -1 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s0, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s16, s8 ; GFX6-NEXT: s_min_i32 s8, s8, s17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s16, s1, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x80000000 +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s16 ; 
GFX6-NEXT: s_max_i32 s9, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s2, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s3, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s4, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s5, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s6, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s6, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; 
GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 @@ -3904,10 +3910,10 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s7, -1 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 @@ -3940,12 +3946,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s16, s0 ; GFX8-NEXT: s_sext_i32_i16 s17, -1 ; GFX8-NEXT: s_max_i32 s18, s16, s17 -; GFX8-NEXT: s_sub_i32 s18, s18, 0x7fff +; GFX8-NEXT: s_addk_i32 s18, 0x8001 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sext_i32_i16 s18, s18 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s16, s16, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s16, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s18, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s16, s16 @@ -3954,11 +3960,11 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s8 ; GFX8-NEXT: s_max_i32 s16, s4, s17 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fff +; GFX8-NEXT: s_addk_i32 s16, 0x8001 ; GFX8-NEXT: s_min_i32 s4, s4, s17 ; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s12, s16, s12 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3966,12 +3972,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: 
s_sub_i32 s4, s8, s4 ; GFX8-NEXT: s_sext_i32_i16 s8, s1 ; GFX8-NEXT: s_max_i32 s12, s8, s17 -; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff +; GFX8-NEXT: s_addk_i32 s12, 0x8001 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -3980,11 +3986,11 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s9 ; GFX8-NEXT: s_max_i32 s8, s5, s17 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s12, s13 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s8, s8, s12 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -3992,12 +3998,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s8, s2 ; GFX8-NEXT: s_sub_i32 s5, s9, s5 ; GFX8-NEXT: s_max_i32 s9, s8, s17 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_addk_i32 s9, 0x8001 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s9, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4006,23 +4012,23 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sub_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s10 ; GFX8-NEXT: s_max_i32 s8, s6, s17 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_min_i32 s6, s6, s17 
; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s14 -; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s6, 0x8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_min_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s3 ; GFX8-NEXT: s_max_i32 s9, s8, s17 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_addk_i32 s9, 0x8001 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s7, s9, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4031,14 +4037,14 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sub_i32 s3, s3, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 ; GFX8-NEXT: s_max_i32 s8, s7, s17 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_min_i32 s7, s7, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s15 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_sub_i32 s7, s7, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s7, 0x8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll index 855687281ce9a..6c104709f5ee3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll @@ -147,10 +147,10 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v1, 63 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0xffc0, v0 -; 
GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_add_u16_e32 v1, 64, v0 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat: @@ -179,9 +179,9 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 4 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0xffc0, v0 -; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, -4 +; GFX8-NEXT: v_add_u16_e32 v1, 64, v0 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -211,10 +211,10 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v1, 63 -; GFX8-NEXT: v_subrev_u16_e32 v2, 4, v0 -; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_add_u16_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_hi: @@ -245,8 +245,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 
0xffff -; GFX8-NEXT: s_sub_i32 s0, s0, 0xffc0 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_add_i32 s0, s0, 0xffff0040 +; GFX8-NEXT: s_add_i32 s1, s1, 0xffff0040 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 @@ -285,8 +285,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sub_i32 s0, s0, 0xffc0 -; GFX8-NEXT: s_sub_i32 s1, s1, 4 +; GFX8-NEXT: s_add_i32 s0, s0, 0xffff0040 +; GFX8-NEXT: s_add_i32 s1, s1, -4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 @@ -325,8 +325,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sub_i32 s0, s0, 4 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_add_i32 s0, s0, -4 +; GFX8-NEXT: s_add_i32 s1, s1, 0xffff0040 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index 31f61b9968b8b..24ec4fa48f778 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -222,10 +222,10 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_mul_lo_u32 v1, v1, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, 
vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -243,23 +243,23 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v0, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v0, v3 ; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 ; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 @@ -274,23 +274,23 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v2 +; CGP-NEXT: 
v_mul_lo_u32 v5, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v2, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v0, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v0, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 0xffed2705, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index a7522ef761b8a..c63e9d471b6bf 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -567,7 +567,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v1 ; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: 
v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 24 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe8 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 25 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe7 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index d94ec56842ab8..147ddc4d4b75b 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1313,7 +1313,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 ; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] -; GFX9-G-NEXT: v_subrev_u32_e32 v9, 64, v8 +; GFX9-G-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8 ; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 @@ -1338,7 +1338,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; 
GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 ; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] -; GFX9-G-NEXT: v_subrev_u32_e32 v24, 64, v20 +; GFX9-G-NEXT: v_add_u32_e32 v24, 0xffffffc0, v20 ; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 @@ -2070,8 +2070,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v18, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0xffffffc0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v18, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v18 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 @@ -2203,8 +2204,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v8, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0xffffffc0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_add_u32_e64 v2, v8, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v14, v0, v8 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 @@ -3453,7 +3455,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_sub_u32_e32 v8, 64, v16 ; GFX9-G-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] ; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], v16, v[2:3] -; GFX9-G-NEXT: v_subrev_u32_e32 v14, 64, v16 +; GFX9-G-NEXT: v_add_u32_e32 v14, 0xffffffc0, v16 ; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v16, v[0:1] ; GFX9-G-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11 @@ -3476,7 +3478,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: 
s_cbranch_execz .LBB1_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v12, 64, v18 -; GFX9-G-NEXT: v_subrev_u32_e32 v22, 64, v18 +; GFX9-G-NEXT: v_add_u32_e32 v22, 0xffffffc0, v18 ; GFX9-G-NEXT: v_lshrrev_b64 v[10:11], v18, v[0:1] ; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] ; GFX9-G-NEXT: v_lshrrev_b64 v[16:17], v18, v[2:3] @@ -4175,8 +4177,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v12, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0xffffffc0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v12, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v12 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 @@ -4311,8 +4314,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v3, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0xffffffc0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_add_u32_e64 v2, v3, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v8, v0, v3 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index e04cd71125608..691f3d36bc736 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -476,18 +476,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 
0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[20:21] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v2 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v8, v8, v2 -; GISEL-NEXT: v_or_b32_e32 v9, v1, v3 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_or_b32_e32 v9, v3, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -505,12 +505,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v2, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc -; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v32 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v2 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v32, v2 ; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], 
v32 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32 @@ -536,7 +537,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_subrev_i32_e32 v34, vcc, 64, v28 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, 0xffffffc0, v28 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[18:19], v28 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28 @@ -665,18 +666,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v2 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v10, v10, v2 -; GISEL-NEXT: v_or_b32_e32 v11, v1, v3 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 +; GISEL-NEXT: v_or_b32_e32 v11, v3, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -694,12 +695,13 @@ define <2 x i128> 
@v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v2, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v3, vcc -; GISEL-NEXT: v_subrev_i32_e64 v14, s[4:5], 64, v30 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v2 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v30, v2 ; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 @@ -725,7 +727,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 +; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 @@ -1229,18 +1231,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, v2, v3 -; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[20:21], v[24:25] +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v2, v3 +; GISEL-NEXT: 
v_subb_u32_e64 v23, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v20 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 -; GISEL-NEXT: v_or_b32_e32 v3, v21, v23 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v20 +; GISEL-NEXT: v_or_b32_e32 v3, v23, v21 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -1258,12 +1260,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v20 -; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v21, vcc -; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v20 -; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v22, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v23, vcc -; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v30 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v22 +; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v23, vcc +; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v20, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v21, vcc +; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v30, v2 ; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v30 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v30 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v30 @@ -1289,7 +1292,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; 
GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 +; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v26 ; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 @@ -1401,18 +1404,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v16 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v8, v8, v16 -; GISEL-NEXT: v_or_b32_e32 v9, v1, v17 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_or_b32_e32 v9, v17, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1430,12 +1433,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_12 ; 
GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v16, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v17, vcc -; GISEL-NEXT: v_subrev_i32_e64 v9, s[4:5], 64, v26 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; GISEL-NEXT: v_not_b32_e32 v9, 63 +; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v26, v9 ; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 @@ -1461,7 +1465,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v8 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8 ; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v8 ; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 @@ -2072,18 +2076,18 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[18:19] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: 
v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v2 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v18, v18, v2 -; GISEL-NEXT: v_or_b32_e32 v19, v1, v3 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 +; GISEL-NEXT: v_or_b32_e32 v19, v3, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc @@ -2101,12 +2105,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v2, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v3, vcc -; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24 +; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v2 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2 ; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 64, v24 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v24 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v24 @@ -2132,7 +2137,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v31 +; 
GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v31 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[8:9], v31 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v31 @@ -2262,18 +2267,18 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v14, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v14 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 -; GISEL-NEXT: v_or_b32_e32 v3, v1, v15 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v0 +; GISEL-NEXT: v_or_b32_e32 v3, v15, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -2291,12 +2296,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0 -; 
GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v14, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v15, vcc -; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24 +; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v14 +; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v15, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v14 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2 ; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 64, v24 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v24 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], v24 @@ -2322,7 +2328,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v36 +; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v36 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[6:7], v36 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[12:13], v36 @@ -2903,18 +2909,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[20:21] +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v17 +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v16 -; GISEL-NEXT: 
v_cmp_lt_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v18 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v20, v20, v18 -; GISEL-NEXT: v_or_b32_e32 v21, v17, v19 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_or_b32_e32 v20, v20, v16 +; GISEL-NEXT: v_or_b32_e32 v21, v19, v17 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc @@ -2932,12 +2938,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v16 -; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v17, vcc -; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 -; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v18, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc -; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v26 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v18 +; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v19, vcc +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v18 +; GISEL-NEXT: v_not_b32_e32 v18, 63 +; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v16, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v26, v18 ; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v26 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[0:1], v26 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[2:3], v26 @@ -2963,7 +2970,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_subrev_i32_e32 v26, vcc, 64, v30 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 0xffffffc0, v30 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30 ; GISEL-NEXT: v_lshr_b64 
v[16:17], v[2:3], v30 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 @@ -3075,18 +3082,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[24:25] +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v16, v17 +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25] ; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v16 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v22 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 -; GISEL-NEXT: v_or_b32_e32 v19, v17, v23 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_or_b32_e32 v18, v18, v16 +; GISEL-NEXT: v_or_b32_e32 v19, v23, v17 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc @@ -3104,12 +3111,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v16 -; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v17, vcc -; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v16 -; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v22, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 
v37, vcc, 0, v23, vcc -; GISEL-NEXT: v_subrev_i32_e64 v24, s[4:5], 64, v28 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v22 +; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v23, vcc +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v22 +; GISEL-NEXT: v_not_b32_e32 v18, 63 +; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v16, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e64 v24, s[4:5], v28, v18 ; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], 64, v28 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[4:5], v28 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[6:7], v28 @@ -3135,7 +3143,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v34 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v34 ; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34 ; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v34 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 786fe03164690..6fa607f83f8af 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -236,17 +236,17 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 -; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 
0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] @@ -273,7 +273,7 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_cbranch_execz .LBB0_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 -; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffc0, v6 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v6, v[4:5] ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 0 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 @@ -608,17 +608,17 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 -; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, 
v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] @@ -645,7 +645,7 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_cbranch_execz .LBB1_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 -; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffc0, v6 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v6, v[4:5] ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 0 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 @@ -972,17 +972,17 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 -; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] @@ -1009,7 +1009,7 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_cbranch_execz .LBB2_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 -; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], 
v3, v[4:5] ; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 @@ -1330,17 +1330,17 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 -; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] @@ -1367,7 +1367,7 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_cbranch_execz .LBB3_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 -; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[4:5] ; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 @@ -1714,7 +1714,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[7:8] -; GISEL-NEXT: v_subrev_u32_e32 v4, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, 
v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_lshl_or_b32 v11, v11, 16, v11 @@ -1748,7 +1748,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_cbranch_execz .LBB6_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5 -; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[7:8] ; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 @@ -2066,7 +2066,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[7:8] -; GISEL-NEXT: v_subrev_u32_e32 v4, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_lshl_or_b32 v11, v11, 16, v11 @@ -2100,7 +2100,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_cbranch_execz .LBB7_6 ; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x86, v5 -; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[7:8] ; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index 2999ddb831588..f372a54894604 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -193,32 +193,32 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 ; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v13, 0xffffffc0, v4 ; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v12, v10, 
v12 ; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_add_u32_e32 v14, 26, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1 ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 -; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5 +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffffda, v5 ; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 -; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v5, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v12, v16, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 @@ -438,32 +438,32 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 ; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4 +; GISEL-NEXT: v_add_u32_e32 
v12, 0xffffffc0, v4 ; GISEL-NEXT: v_or_b32_e32 v10, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 ; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_add_u32_e32 v13, 26, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v13, -1 ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1 -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5 +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffffda, v5 ; GISEL-NEXT: v_or_b32_e32 v14, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 -; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v5, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v8, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v9, v3 ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 @@ -723,34 +723,34 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v14, v[2:3] ; GISEL-NEXT: v_lshlrev_b64 
v[10:11], v10, v[4:5] -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GISEL-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 ; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[4:5] ; GISEL-NEXT: v_or_b32_e32 v10, v0, v10 ; GISEL-NEXT: v_or_b32_e32 v11, v1, v11 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v15, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GISEL-NEXT: v_add_u32_e32 v9, 55, v9 +; GISEL-NEXT: v_add_u32_e32 v15, 55, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc -; GISEL-NEXT: v_sub_u32_e32 v12, 64, v9 +; GISEL-NEXT: v_sub_u32_e32 v12, 64, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v14, v0, v2, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, v1, v3, s[4:5] -; GISEL-NEXT: v_lshrrev_b64 v[0:1], v9, -1 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v15, -1 ; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v9 +; GISEL-NEXT: v_add_u32_e32 v9, -9, v9 ; GISEL-NEXT: v_or_b32_e32 v16, v0, v12 ; GISEL-NEXT: v_or_b32_e32 v17, v1, v13 -; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v9, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v9, v12, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GISEL-NEXT: v_and_or_b32 v0, 
v9, v2, v0 @@ -999,35 +999,35 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v9, 64, v13 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 +; GISEL-NEXT: v_add_u32_e32 v14, 0xffffffc0, v13 ; GISEL-NEXT: v_lshrrev_b64 v[11:12], v13, v[2:3] ; GISEL-NEXT: v_or_b32_e32 v9, v4, v9 ; GISEL-NEXT: v_or_b32_e32 v10, v5, v10 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 -; GISEL-NEXT: v_add_u32_e32 v8, 55, v8 +; GISEL-NEXT: v_add_u32_e32 v15, 55, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc -; GISEL-NEXT: v_sub_u32_e32 v12, 64, v8 +; GISEL-NEXT: v_sub_u32_e32 v12, 64, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v9, v5, v1, s[4:5] -; GISEL-NEXT: v_lshrrev_b64 v[4:5], v8, -1 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, -1 ; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v8 +; GISEL-NEXT: v_add_u32_e32 v8, -9, v8 ; GISEL-NEXT: v_or_b32_e32 v16, v4, v12 ; GISEL-NEXT: v_or_b32_e32 v17, v5, v13 -; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v8, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v12, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 
v12, v13, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v4, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v5, v3 ; GISEL-NEXT: v_and_or_b32 v0, v8, v0, v2 @@ -1284,32 +1284,32 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 ; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v13, 0xffffffc0, v4 ; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v12, v10, v12 ; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_add_u32_e32 v14, 26, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1 ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 -; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5 +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffffda, v5 ; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 -; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v5, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v12, v16, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, 
0, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 @@ -1531,32 +1531,32 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 ; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v12, 0xffffffc0, v4 ; GISEL-NEXT: v_or_b32_e32 v10, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 ; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_add_u32_e32 v13, 26, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v13, -1 ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1 -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5 +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffffda, v5 ; GISEL-NEXT: v_or_b32_e32 v14, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 -; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v5, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, 
v13 ; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v8, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v9, v3 ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 8d99ec2e1b709..b2bfc2ea4e0b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -139,7 +139,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT6-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 ; VARIANT6-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0 -; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 +; VARIANT6-NEXT: s_add_co_i32 s2, s2, -1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) ; VARIANT6-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4 ; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index a577fb3d190ab..d874418b99dd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -395,7 +395,7 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -459,7 +459,7 @@ define i1 @negnormal_f16(half %x) nounwind { ; 
GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v2 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -513,7 +513,7 @@ define i1 @possubnormal_f16(half %x) nounwind { ; GFX7GLISEL-LABEL: possubnormal_f16: ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -575,7 +575,7 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v2 -; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[4:5], 1, v1 +; GFX7GLISEL-NEXT: v_add_i32_e64 v0, s[4:5], -1, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 @@ -1587,7 +1587,7 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 @@ -1647,7 +1647,7 @@ define i1 @isnormal_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, 
v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1780,7 +1780,7 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v3, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1853,7 +1853,7 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v3, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1911,7 +1911,7 @@ define i1 @issubnormal_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1974,7 +1974,7 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: 
v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2081,7 +2081,7 @@ define i1 @not_iszero_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2091,7 +2091,7 @@ define i1 @not_iszero_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2568,7 +2568,7 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2576,7 +2576,7 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2635,7 +2635,7 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2643,7 +2643,7 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2702,7 +2702,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2710,7 +2710,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2902,7 +2902,7 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; 
GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2914,7 +2914,7 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x1ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2983,7 +2983,7 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2994,7 +2994,7 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 6ac04d8bc42bb..b3c06756a8987 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; 
SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -78,44 +78,79 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i32_x_sub_64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i32_x_sub_64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u32_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i32_x_sub_64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i32_x_sub_64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 0xffffffc0, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i32_x_sub_64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i32_x_sub_64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i32_x_sub_64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffc0, v1 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i32_x_sub_64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i32_x_sub_64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffc0, v1 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -163,8 +198,8 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 @@ -211,66 +246,119 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 -; VI-GISEL-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 +; VI-GISEL-NEXT: v_add_u32_e32 v3, vcc, 0xffffffc0, v4 ; 
VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_store_dword v[0:1], v3 ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u32_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: v_subrev_u32_e32 v2, 64, v2 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 -; 
GFX10-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[4:5] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 0xffffffc0, v1 +; GFX9-GISEL-NEXT: v_add_u32_e32 v2, 0xffffffc0, v2 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i32_x_sub_64_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-SDAG-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[6:7] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX10-SDAG-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[6:7] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffc0, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, 0xffffffc0, v2 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX11-SDAG-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, 
s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffc0, v1 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 0xffffffc0, v2 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -425,7 +513,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0x41, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffbf, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -462,79 +550,44 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 0x41, v3 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; 
VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_i32_x_sub_65: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_i32_x_sub_65: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0x41, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_i32_x_sub_65: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_i32_x_sub_65: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_i32_x_sub_65: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_i32_x_sub_65: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_i32_x_sub_65: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i32_x_sub_65: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_i32_x_sub_65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -686,7 +739,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, -16, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 16, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -723,79 +776,44 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, -16, v3 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 16, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, -16, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, 
v1, s[4:5] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_i32_x_sub_neg16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 16, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_i32_x_sub_neg16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v1, 16, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_i32_x_sub_neg16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 16, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -947,7 +965,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0xffffffef, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 17, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -983,80 +1001,45 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: 
v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 0xffffffef, v3 -; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 -; VI-GISEL-NEXT: s_endpgm -; -; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 17, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0xffffffef, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1 -; 
GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-GISEL-NEXT: s_endpgm +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 17, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm +; GFX9-LABEL: v_test_i32_x_sub_neg17: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 17, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX10-LABEL: v_test_i32_x_sub_neg17: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: 
v_add_nc_u32_e32 v1, 17, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_x_sub_neg17: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 17, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -1263,7 +1246,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -1300,44 +1283,79 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3 +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v3 ; VI-GISEL-NEXT: flat_store_short v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i16_x_sub_64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] -; GFX9-NEXT: s_endpgm +; 
GFX9-SDAG-LABEL: v_test_i16_x_sub_64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i16_x_sub_64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i16_x_sub_64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u16_e32 v1, 0xffc0, v1 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i16_x_sub_64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i16_x_sub_64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-SDAG-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX10-SDAG-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i16_x_sub_64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i16_x_sub_64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i16_x_sub_64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext @@ -1379,7 +1397,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; SI-GISEL-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 ; SI-GISEL-NEXT: 
v_lshlrev_b32_e32 v1, 2, v0 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 @@ -1419,50 +1437,91 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v2 +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_ushort v1, v1, s[6:7] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16 
v1, v1, 64 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ushort v1, v1, s[6:7] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u16_e32 v1, 0xffc0, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_ushort v1, v1, s[6:7] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX10-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_ushort v1, v1, s[6:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext @@ -1511,8 +1570,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; 
SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64 @@ -1559,66 +1618,119 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-GISEL-NEXT: v_subrev_u16_e32 v3, 64, v4 +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v3 +; VI-GISEL-NEXT: v_add_u16_e32 v3, 0xffc0, v4 ; VI-GISEL-NEXT: flat_store_short v[0:1], v2 ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_store_short v[0:1], v3 ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v2, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-SDAG-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_ushort v2, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v2 +; GFX9-SDAG-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_short v0, v2, s[4:5] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_short v0, v2, s[4:5] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u16_e32 v1, 0xffc0, v1 +; GFX9-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v2 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_short v0, v2, s[4:5] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i16_x_sub_64_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; 
GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX11-NEXT: v_sub_nc_u16 v2, v2, 64 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX10-SDAG-NEXT: v_sub_nc_u16 v2, v2, 64 +; GFX10-SDAG-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_short v0, v2, s[4:5] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0 +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX10-GISEL-NEXT: global_store_short v0, v2, s[4:5] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-NEXT: v_sub_nc_u16 v2, v2, 64 +; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0 +; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext @@ -1664,8 
+1776,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -1698,7 +1810,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 +; VI-GISEL-NEXT: v_not_b32_e32 v4, 63 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1710,8 +1822,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-GISEL-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v3 +; VI-GISEL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -1792,8 +1904,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 7, v2 -; SI-GISEL-NEXT: 
v_subrev_i32_e32 v3, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, -7, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -1826,7 +1938,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 +; VI-GISEL-NEXT: v_not_b32_e32 v4, 63 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1838,8 +1950,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 7, v3 -; VI-GISEL-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_e32 v2, -7, v3 +; VI-GISEL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -1933,8 +2045,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x7b, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffff85, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -1967,7 +2079,7 @@ define amdgpu_kernel 
void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1979,8 +2091,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-GISEL-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v3 +; VI-GISEL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -2074,7 +2186,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 7, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, -7, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 @@ -2117,7 +2229,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-GISEL-NEXT: v_subrev_u16_e32 v3, 7, v3 +; VI-GISEL-NEXT: v_add_u16_e32 v3, -7, v3 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-GISEL-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -2197,7 
+2309,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 16, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, -16, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2237,11 +2349,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, -16 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -2319,7 +2431,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0xffffc400, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0x3c00, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2359,11 +2471,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 
v0, s0 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffc400 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -2454,7 +2566,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x4400, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffbc00, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2494,11 +2606,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffbc00 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 
; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll index c503d6541b0a5..14ff9e01ab3bc 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll @@ -457,19 +457,19 @@ entry: define i64 @subi_i64(i64 %a) { ; RV32IM-LABEL: subi_i64: ; RV32IM: # %bb.0: # %entry -; RV32IM-NEXT: lui a2, 301 -; RV32IM-NEXT: addi a3, a2, 1548 -; RV32IM-NEXT: sub a2, a0, a3 -; RV32IM-NEXT: sltu a0, a0, a3 -; RV32IM-NEXT: sub a1, a1, a0 -; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: lui a2, 1048275 +; RV32IM-NEXT: addi a2, a2, -1548 +; RV32IM-NEXT: add a0, a0, a2 +; RV32IM-NEXT: sltu a2, a0, a2 +; RV32IM-NEXT: addi a1, a1, -1 +; RV32IM-NEXT: add a1, a1, a2 ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: subi_i64: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: lui a1, 301 -; RV64IM-NEXT: addiw a1, a1, 1548 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: lui a1, 1048275 +; RV64IM-NEXT: addiw a1, a1, -1548 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret entry: %0 = sub i64 %a, 1234444 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir index 2ef5de501ee71..39d0ee7c382df 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir @@ -200,8 +200,9 @@ body: | ; RV32I: liveins: $x10 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV32I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1234 - ; RV32I-NEXT: $x10 = COPY [[ADDI]] + ; RV32I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234 + ; RV32I-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[ADDI]] + ; RV32I-NEXT: $x10 = COPY [[SUB]] ; RV32I-NEXT: PseudoRET implicit $x10 %0:gprb(s32) = COPY $x10 %1:gprb(s32) = G_CONSTANT i32 -1234 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir 
b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir index be12333e1499b..527036d8b750f 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir @@ -188,8 +188,9 @@ body: | ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV64I-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 1234 - ; RV64I-NEXT: $x10 = COPY [[ADDIW]] + ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234 + ; RV64I-NEXT: [[SUBW:%[0-9]+]]:gpr = SUBW [[COPY]], [[ADDI]] + ; RV64I-NEXT: $x10 = COPY [[SUBW]] ; RV64I-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 %1:gprb(s32) = G_TRUNC %0(s64) @@ -440,8 +441,9 @@ body: | ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1234 - ; RV64I-NEXT: $x10 = COPY [[ADDI]] + ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234 + ; RV64I-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[ADDI]] + ; RV64I-NEXT: $x10 = COPY [[SUB]] ; RV64I-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 %1:gprb(s64) = G_CONSTANT i64 -1234 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir index 5d980e7721458..d0237892d132f 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir @@ -111,8 +111,8 @@ body: | %2:gprb(s64) = G_ASSERT_SEXT %1, 32 %7:gprb(s64) = G_CONSTANT i64 5 %3:gprb(s64) = G_SEXT_INREG %2, 32 - %4:gprb(s64) = G_CONSTANT i64 1 - %5:gprb(s64) = G_SUB %3, %4 + %4:gprb(s64) = G_CONSTANT i64 -1 + %5:gprb(s64) = G_ADD %3, %4 %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7 G_BRCOND %26(s64), %bb.8 diff --git 
a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir index 27fe465ccf696..396421a4ba739 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir @@ -115,8 +115,8 @@ body: | %12:gprb(s32) = G_CONSTANT i32 3 %13:gprb(s32) = G_CONSTANT i32 4 %14:gprb(s32) = G_CONSTANT i32 1000 - %1:gprb(s32) = G_CONSTANT i32 1 - %2:gprb(s32) = G_SUB %0, %1 + %1:gprb(s32) = G_CONSTANT i32 -1 + %2:gprb(s32) = G_ADD %0, %1 %16:gprb(s32) = G_ICMP intpred(ugt), %2(s32), %4 G_BRCOND %16(s32), %bb.8 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir index 77156b913c5e8..0a08586bc1af4 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir @@ -112,8 +112,8 @@ body: | %2:gprb(s64) = G_ASSERT_SEXT %1, 32 %7:gprb(s64) = G_CONSTANT i64 5 %3:gprb(s64) = G_SEXT_INREG %2, 32 - %4:gprb(s64) = G_CONSTANT i64 1 - %5:gprb(s64) = G_SUB %3, %4 + %4:gprb(s64) = G_CONSTANT i64 -1 + %5:gprb(s64) = G_ADD %3, %4 %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7 G_BRCOND %26(s64), %bb.8 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir index 388c238b86eb6..efa1a6c86027d 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir @@ -171,8 +171,8 @@ body: | %12:gprb(s32) = G_CONSTANT i32 3 %13:gprb(s32) = G_CONSTANT i32 4 %14:gprb(s32) = G_CONSTANT i32 1000 - %1:gprb(s32) = 
G_CONSTANT i32 1 - %2:gprb(s32) = G_SUB %0, %1 + %1:gprb(s32) = G_CONSTANT i32 -1 + %2:gprb(s32) = G_ADD %0, %1 %16:gprb(s32) = G_ICMP intpred(ugt), %2(s32), %4 G_BRCOND %16(s32), %bb.8 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir index 09a855105c262..12b1517e2cfb5 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir @@ -112,8 +112,8 @@ body: | %2:gprb(s64) = G_ASSERT_SEXT %1, 32 %7:gprb(s64) = G_CONSTANT i64 5 %3:gprb(s64) = G_SEXT_INREG %2, 32 - %4:gprb(s64) = G_CONSTANT i64 1 - %5:gprb(s64) = G_SUB %3, %4 + %4:gprb(s64) = G_CONSTANT i64 -1 + %5:gprb(s64) = G_ADD %3, %4 %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7 G_BRCOND %26(s64), %bb.8 From fa7ec2bea3bc5df8adc810a970b81cdcc42d1c85 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 30 Oct 2024 15:43:07 -0700 Subject: [PATCH 2/5] fixup! address review comments. 
--- llvm/include/llvm/Target/GlobalISel/Combine.td | 3 ++- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 13 +++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 9891db5ceb6fa..80a22c35ebcef 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -338,7 +338,8 @@ def mul_to_shl : GICombineRule< // (sub x, C) -> (add x, -C) def sub_to_add : GICombineRule< (defs root:$d, build_fn_matchinfo:$matchinfo), - (match (G_SUB $d, $op1, $op2):$mi, + (match (G_CONSTANT $c, $imm), + (G_SUB $d, $op1, $c):$mi, [{ return Helper.matchCombineSubToAdd(*${mi}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFnNoErase(*${mi}, ${matchinfo}); }])>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 91e5af9dfd8e2..da9860352b00d 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2043,17 +2043,14 @@ void CombinerHelper::applyCombineMulToShl(MachineInstr &MI, bool CombinerHelper::matchCombineSubToAdd(MachineInstr &MI, BuildFnTy &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SUB && "Expected a G_SUB"); - auto MaybeImmVal = - getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); - if (!MaybeImmVal) - return false; + GSub &Sub = cast<GSub>(MI); - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + LLT Ty = MRI.getType(Sub.getReg(0)); + + APInt Imm = getIConstantFromReg(Sub.getRHSReg(), MRI); - APInt NegImm = -MaybeImmVal->Value; MatchInfo = [=, &MI](MachineIRBuilder &B) { - auto NegCst = B.buildConstant(Ty, NegImm); + auto NegCst = B.buildConstant(Ty, -Imm); Observer.changingInstr(MI); MI.setDesc(B.getTII().get(TargetOpcode::G_ADD)); MI.getOperand(2).setReg(NegCst.getReg(0)); From 78d79f0edff688b99a0773165a04d1acd9a419af Mon Sep 17 00:00:00 2001 From: Craig Topper 
Date: Wed, 30 Oct 2024 16:20:21 -0700 Subject: [PATCH 3/5] fixup! Add nuw/nsw flag tests. --- .../prelegalizercombiner-trivial-arith.mir | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir index bc3be691bd25a..4c3faa9403909 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir @@ -488,3 +488,66 @@ body: | RET_ReallyLR implicit $w0 ... +--- +name: sub_to_add +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0 + ; CHECK-LABEL: name: sub_to_add + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]] + ; CHECK-NEXT: $w0 = COPY %op(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %op:_(s32) = G_SUB %x(s32), %cst + $w0 = COPY %op(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: sub_to_add_nuw +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0 + ; CHECK-LABEL: name: sub_to_add_nuw + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]] + ; CHECK-NEXT: $w0 = COPY %op(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %op:_(s32) = nuw G_SUB %x(s32), %cst + $w0 = COPY %op(s32) + RET_ReallyLR implicit $w0 + +... 
+--- +name: sub_to_add_nsw +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0 + ; CHECK-LABEL: name: sub_to_add_nsw + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: %op:_(s32) = nsw G_ADD %x, [[C]] + ; CHECK-NEXT: $w0 = COPY %op(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %op:_(s32) = nsw G_SUB %x(s32), %cst + $w0 = COPY %op(s32) + RET_ReallyLR implicit $w0 + +... From 1e3a249129e7c63696030abaa663baf345019d14 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 30 Oct 2024 22:55:04 -0700 Subject: [PATCH 4/5] fixup! Add isLegalOrBeforeLegalizer check --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index da9860352b00d..494938c32739a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2047,6 +2047,9 @@ bool CombinerHelper::matchCombineSubToAdd(MachineInstr &MI, LLT Ty = MRI.getType(Sub.getReg(0)); + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {Ty}})) + return false; + APInt Imm = getIConstantFromReg(Sub.getRHSReg(), MRI); MatchInfo = [=, &MI](MachineIRBuilder &B) { From 27433abcb8583c2aeacb836a21e6e43607af2795 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 31 Oct 2024 10:25:29 -0700 Subject: [PATCH 5/5] fixup! Add isConstantLegalOrBeforeLegalizer. 
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 494938c32739a..55df8dcb095f5 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2050,6 +2050,9 @@ bool CombinerHelper::matchCombineSubToAdd(MachineInstr &MI, if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {Ty}})) return false; + if (!isConstantLegalOrBeforeLegalizer(Ty)) + return false; + APInt Imm = getIConstantFromReg(Sub.getRHSReg(), MRI); MatchInfo = [=, &MI](MachineIRBuilder &B) {