Skip to content

Commit 5df1ac7

Browse files
committed
[AMDGPU] Fix divergence-driven selection of shift operations
Differential Revision: https://reviews.llvm.org/D73483
Reviewers: rampitec
1 parent ac8da31 commit 5df1ac7

17 files changed

+222
-76
lines changed

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -541,22 +541,22 @@ let AddedComplexity = 1 in {
541541
let Defs = [SCC] in {
542542
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
543543
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
544-
[(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
544+
[(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
545545
>;
546546
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
547-
[(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
547+
[(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
548548
>;
549549
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
550-
[(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
550+
[(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
551551
>;
552552
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
553-
[(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
553+
[(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
554554
>;
555555
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
556-
[(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
556+
[(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
557557
>;
558558
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
559-
[(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
559+
[(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
560560
>;
561561
} // End Defs = [SCC]
562562

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -541,14 +541,17 @@ defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmi
541541
defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
542542
} // End SubtargetPredicate = isGFX6GFX7
543543

544-
let SubtargetPredicate = isGFX6GFX7GFX10 in {
545544
let isCommutable = 1 in {
545+
let SubtargetPredicate = isGFX6GFX7GFX10 in {
546546
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
547-
defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>;
548-
defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>;
549-
defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>;
550-
} // End isCommutable = 1
551547
} // End SubtargetPredicate = isGFX6GFX7GFX10
548+
let SubtargetPredicate = isGFX6GFX7 in {
549+
defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
550+
defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
551+
defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
552+
} // End SubtargetPredicate = isGFX6GFX7
553+
} // End isCommutable = 1
554+
552555

553556
class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
554557
GCNPat<

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,10 +385,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
385385
}
386386

387387
let SchedRW = [Write64Bit] in {
388-
let SubtargetPredicate = isGFX6GFX7GFX10 in {
388+
let SubtargetPredicate = isGFX6GFX7 in {
389389
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
390390
def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
391391
def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
392+
} // End SubtargetPredicate = isGFX6GFX7
393+
let SubtargetPredicate = isGFX6GFX7GFX10 in {
392394
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
393395
} // End SubtargetPredicate = isGFX6GFX7GFX10
394396

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.mir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,8 @@ body: |
237237
; GFX10: $vcc_hi = IMPLICIT_DEF
238238
; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
239239
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
240-
; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
241-
; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
240+
; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
241+
; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
242242
%0:sgpr(s64) = COPY $sgpr0_sgpr1
243243
%1:vgpr(s32) = COPY $vgpr0
244244
%2:vgpr(s64) = G_ASHR %0, %1
@@ -277,8 +277,8 @@ body: |
277277
; GFX10: $vcc_hi = IMPLICIT_DEF
278278
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
279279
; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
280-
; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
281-
; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
280+
; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
281+
; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
282282
%0:vgpr(s64) = COPY $vgpr0_vgpr1
283283
%1:sgpr(s32) = COPY $sgpr0
284284
%2:vgpr(s64) = G_ASHR %0, %1
@@ -317,8 +317,8 @@ body: |
317317
; GFX10: $vcc_hi = IMPLICIT_DEF
318318
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
319319
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
320-
; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
321-
; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
320+
; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
321+
; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
322322
%0:vgpr(s64) = COPY $vgpr0_vgpr1
323323
%1:vgpr(s32) = COPY $vgpr2
324324
%2:vgpr(s64) = G_ASHR %0, %1

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.mir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,8 @@ body: |
237237
; GFX10: $vcc_hi = IMPLICIT_DEF
238238
; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
239239
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
240-
; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
241-
; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
240+
; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
241+
; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
242242
%0:sgpr(s64) = COPY $sgpr0_sgpr1
243243
%1:vgpr(s32) = COPY $vgpr0
244244
%2:vgpr(s64) = G_LSHR %0, %1
@@ -277,8 +277,8 @@ body: |
277277
; GFX10: $vcc_hi = IMPLICIT_DEF
278278
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
279279
; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
280-
; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
281-
; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
280+
; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
281+
; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
282282
%0:vgpr(s64) = COPY $vgpr0_vgpr1
283283
%1:sgpr(s32) = COPY $sgpr0
284284
%2:vgpr(s64) = G_LSHR %0, %1
@@ -317,8 +317,8 @@ body: |
317317
; GFX10: $vcc_hi = IMPLICIT_DEF
318318
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
319319
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
320-
; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
321-
; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
320+
; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
321+
; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
322322
%0:vgpr(s64) = COPY $vgpr0_vgpr1
323323
%1:vgpr(s32) = COPY $vgpr2
324324
%2:vgpr(s64) = G_LSHR %0, %1

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.mir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,8 @@ body: |
237237
; GFX10: $vcc_hi = IMPLICIT_DEF
238238
; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
239239
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
240-
; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
241-
; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
240+
; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
241+
; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
242242
%0:sgpr(s64) = COPY $sgpr0_sgpr1
243243
%1:vgpr(s32) = COPY $vgpr0
244244
%2:vgpr(s64) = G_SHL %0, %1
@@ -277,8 +277,8 @@ body: |
277277
; GFX10: $vcc_hi = IMPLICIT_DEF
278278
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
279279
; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
280-
; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
281-
; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
280+
; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
281+
; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
282282
%0:vgpr(s64) = COPY $vgpr0_vgpr1
283283
%1:sgpr(s32) = COPY $sgpr0
284284
%2:vgpr(s64) = G_SHL %0, %1
@@ -317,8 +317,8 @@ body: |
317317
; GFX10: $vcc_hi = IMPLICIT_DEF
318318
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
319319
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
320-
; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
321-
; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
320+
; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
321+
; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
322322
%0:vgpr(s64) = COPY $vgpr0_vgpr1
323323
%1:vgpr(s32) = COPY $vgpr2
324324
%2:vgpr(s64) = G_SHL %0, %1

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,6 @@
22
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
33
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
44

5-
; XFAIL: *
6-
; FIXME: Merge with DAG test
7-
85
define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
96
; GFX8-LABEL: dpp_test:
107
; GFX8: ; %bb.0:
@@ -19,6 +16,7 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
1916
; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
2017
; GFX8-NEXT: flat_store_dword v[0:1], v2
2118
; GFX8-NEXT: s_endpgm
19+
;
2220
; GFX10-LABEL: dpp_test:
2321
; GFX10: ; %bb.0:
2422
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -43,9 +41,10 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
4341
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
4442
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
4543
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
46-
; GFX8-NEXT: v_mov_b32_e32 v2, s1
47-
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
48-
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
44+
; GFX8-NEXT: v_mov_b32_e32 v3, s1
45+
; GFX8-NEXT: v_mov_b32_e32 v2, s0
46+
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
47+
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
4948
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
5049
; GFX8-NEXT: v_mov_b32_e32 v5, s3
5150
; GFX8-NEXT: v_mov_b32_e32 v4, s2
@@ -55,21 +54,20 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
5554
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
5655
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
5756
; GFX8-NEXT: s_endpgm
57+
;
5858
; GFX10-LABEL: update_dpp64_test:
5959
; GFX10: ; %bb.0:
60-
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
6160
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
62-
; GFX10-NEXT: v_mul_lo_u32 v2, v0, 0
63-
; GFX10-NEXT: v_mul_hi_u32 v3, v0, 8
64-
; GFX10-NEXT: v_mul_lo_u32 v0, v0, 8
65-
; GFX10-NEXT: v_mul_lo_u32 v1, v1, 8
61+
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
6662
; GFX10-NEXT: ; implicit-def: $vcc_hi
63+
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
6764
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
68-
; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, s0, v0
69-
; GFX10-NEXT: v_add3_u32 v1, v1, v2, v3
65+
; GFX10-NEXT: v_mov_b32_e32 v3, s1
66+
; GFX10-NEXT: v_mov_b32_e32 v2, s0
7067
; GFX10-NEXT: v_mov_b32_e32 v5, s3
7168
; GFX10-NEXT: v_mov_b32_e32 v4, s2
72-
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo
69+
; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v2, v0
70+
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v1, vcc_lo
7371
; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off
7472
; GFX10-NEXT: s_waitcnt vmcnt(0)
7573
; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1

llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x
4242
; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
4343
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
4444
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
45-
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
46-
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
45+
; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
46+
; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
4747
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
4848
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
4949
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

llvm/test/CodeGen/AMDGPU/bfe-patterns.ll

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,11 @@ define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
2424
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
2525
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
2626

27-
; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
28-
; GCN-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
27+
; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
28+
; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
29+
30+
; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
31+
; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
2932

3033
; GCN: [[BFE]]
3134
; GCN: [[SHL]]
@@ -97,8 +100,11 @@ define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
97100
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
98101
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
99102

100-
; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
101-
; GCN-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
103+
; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
104+
; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
105+
106+
; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
107+
; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
102108

103109
; GCN: [[BFE]]
104110
; GCN: [[SHL]]

llvm/test/CodeGen/AMDGPU/commute-shifts.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
1717
; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
1818
; SI-NEXT: v_and_b32_e32 v0, 7, v0
1919
; SI-NEXT: s_waitcnt vmcnt(0)
20-
; SI-NEXT: v_lshrrev_b32_e32 v0, v0, v2
20+
; SI-NEXT: v_lshr_b32_e32 v0, v2, v0
2121
; SI-NEXT: v_and_b32_e32 v0, 1, v0
2222
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
2323
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc

0 commit comments

Comments
 (0)