Skip to content

Commit 037d397

Browse files
committed
[AMDGPU][MC] Allow opsel for v_max_i16 etc in GFX10
In GFX10, a number of VOP3 instructions should allow opsel, including V_MAX_I16, V_MAX_U16, V_MIN_I16, V_MIN_U16, V_MUL_LO_U16, V_LSHLREV_B16, V_LSHRREV_B16, and V_ASHRREV_I16.
1 parent 7114cfb commit 037d397

24 files changed

+471
-283
lines changed

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -211,6 +211,10 @@ multiclass VOP2Inst_e64_t16<string opName,
211211
string revOp = opName> {
212212
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
213213
defm NAME : VOP2Inst<opName, P, node, revOp>;
214+
let SubtargetPredicate = isGFX10Only in {
215+
def _vop3_e64 : VOP3InstBase <opName#"_vop3", VOP3_Profile<P, VOP3_OPSEL>, node, 1>,
216+
Commutable_REV<revOp#"_vop3_e64", !eq(revOp, opName)>;
217+
}
214218
}
215219
let SubtargetPredicate = UseRealTrue16Insts in {
216220
defm _t16 : VOP2Inst_e64<opName#"_t16", VOPProfile_True16<P>, node, revOp#"_t16">;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1932,16 +1932,14 @@ defm V_DIV_FIXUP_F16 :
19321932
defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>;
19331933
defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>;
19341934

1935-
// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
1936-
// (they do not support SDWA or DPP).
1937-
defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">;
1938-
defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">;
1939-
defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">;
1940-
defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16", "v_max_u16">;
1941-
defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16", "v_max_i16">;
1942-
defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16", "v_min_u16">;
1943-
defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16", "v_min_i16">;
1944-
defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16", "v_lshlrev_b16">;
1935+
defm V_MUL_LO_U16 : VOP3OpSel_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_vop3", "v_mul_lo_u16">;
1936+
defm V_LSHRREV_B16 : VOP3OpSel_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_vop3", "v_lshrrev_b16">;
1937+
defm V_ASHRREV_I16 : VOP3OpSel_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_vop3", "v_ashrrev_i16">;
1938+
defm V_MAX_U16 : VOP3OpSel_Real_gfx10_with_name<0x309, "V_MAX_U16_vop3", "v_max_u16">;
1939+
defm V_MAX_I16 : VOP3OpSel_Real_gfx10_with_name<0x30a, "V_MAX_I16_vop3", "v_max_i16">;
1940+
defm V_MIN_U16 : VOP3OpSel_Real_gfx10_with_name<0x30b, "V_MIN_U16_vop3", "v_min_u16">;
1941+
defm V_MIN_I16 : VOP3OpSel_Real_gfx10_with_name<0x30c, "V_MIN_I16_vop3", "v_min_i16">;
1942+
defm V_LSHLREV_B16 : VOP3OpSel_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_vop3", "v_lshlrev_b16">;
19451943
defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
19461944
defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
19471945

llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -864,25 +864,25 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
864864
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
865865
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
866866
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2
867-
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0
867+
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v0
868868
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
869869
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
870870
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
871-
; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4
871+
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4
872872
; GFX10-NEXT: v_and_b32_e32 v4, 7, v4
873873
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
874874
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
875875
; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3
876-
; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
877-
; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
878-
; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5
876+
; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
877+
; GFX10-NEXT: v_lshlrev_b16 v4, v4, v6
879878
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
880-
; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3
881-
; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1
882-
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
883-
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
879+
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
880+
; GFX10-NEXT: v_and_b32_e32 v5, 7, v7
881+
; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
882+
; GFX10-NEXT: v_lshrrev_b16 v1, v5, v1
883+
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
884884
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
885-
; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
885+
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2
886886
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
887887
; GFX10-NEXT: s_setpc_b64 s[30:31]
888888
;

llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -864,25 +864,25 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
864864
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
865865
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
866866
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
867-
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
867+
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
868868
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
869869
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
870-
; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3
870+
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3
871871
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
872872
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
873-
; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5
873+
; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6
874874
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
875-
; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
875+
; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
876876
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
877-
; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
878-
; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5
879-
; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4
877+
; GFX10-NEXT: v_lshrrev_b16 v3, v3, v6
878+
; GFX10-NEXT: v_lshlrev_b16 v4, v5, v4
879+
; GFX10-NEXT: v_and_b32_e32 v5, 7, v7
880880
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
881-
; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0
882-
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
883-
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
881+
; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
882+
; GFX10-NEXT: v_lshlrev_b16 v0, v5, v0
883+
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
884884
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
885-
; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
885+
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2
886886
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
887887
; GFX10-NEXT: s_setpc_b64 s[30:31]
888888
;

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll

Lines changed: 24 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -71,17 +71,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
7171
; GFX10: ; %bb.0:
7272
; GFX10-NEXT: v_mov_b32_e32 v0, 0
7373
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
74-
; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
7574
; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
7675
; GFX10-NEXT: s_waitcnt vmcnt(0)
7776
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
7877
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
7978
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0
80-
; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
81-
; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0
79+
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
80+
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s0
81+
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
8282
; GFX10-NEXT: v_mov_b32_e32 v0, 0
8383
; GFX10-NEXT: v_mov_b32_e32 v1, 0
84-
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
84+
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
8585
; GFX10-NEXT: global_store_short v[0:1], v2, off
8686
; GFX10-NEXT: s_endpgm
8787
;
@@ -175,16 +175,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
175175
; GFX10: ; %bb.0:
176176
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
177177
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
178-
; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
179178
; GFX10-NEXT: s_waitcnt vmcnt(0)
180179
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
181180
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
182181
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0
183-
; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
184-
; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0
182+
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
183+
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s0
184+
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
185185
; GFX10-NEXT: v_mov_b32_e32 v0, 0
186186
; GFX10-NEXT: v_mov_b32_e32 v1, 0
187-
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
187+
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
188188
; GFX10-NEXT: global_store_short v[0:1], v2, off
189189
; GFX10-NEXT: s_endpgm
190190
;
@@ -277,17 +277,17 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
277277
; GFX10: ; %bb.0:
278278
; GFX10-NEXT: v_mov_b32_e32 v1, 0
279279
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
280-
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
281280
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
282281
; GFX10-NEXT: s_waitcnt vmcnt(0)
283282
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
284283
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
285284
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
286-
; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
287-
; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
285+
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
286+
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
288287
; GFX10-NEXT: v_mov_b32_e32 v0, 0
289288
; GFX10-NEXT: v_mov_b32_e32 v1, 0
290-
; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
289+
; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
290+
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
291291
; GFX10-NEXT: global_store_short v[0:1], v2, off
292292
; GFX10-NEXT: s_endpgm
293293
;
@@ -383,17 +383,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
383383
; GFX10: ; %bb.0:
384384
; GFX10-NEXT: v_mov_b32_e32 v1, 0
385385
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
386-
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
387386
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
388387
; GFX10-NEXT: s_waitcnt vmcnt(0)
389388
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
390389
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
391390
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
392-
; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
393-
; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo
391+
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
392+
; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo
394393
; GFX10-NEXT: v_mov_b32_e32 v0, 0
395394
; GFX10-NEXT: v_mov_b32_e32 v1, 0
396-
; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
395+
; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
396+
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
397397
; GFX10-NEXT: global_store_short v[0:1], v2, off
398398
; GFX10-NEXT: s_endpgm
399399
;
@@ -487,16 +487,16 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
487487
; GFX10: ; %bb.0:
488488
; GFX10-NEXT: v_mov_b32_e32 v2, 0
489489
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
490-
; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
491490
; GFX10-NEXT: global_load_ushort v2, v2, s[2:3]
492491
; GFX10-NEXT: s_waitcnt vmcnt(0)
493492
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
494493
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
495494
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
496-
; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
495+
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
497496
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
498497
; GFX10-NEXT: v_mov_b32_e32 v0, 0
499498
; GFX10-NEXT: v_mov_b32_e32 v1, 0
499+
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
500500
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
501501
; GFX10-NEXT: global_store_short v[0:1], v2, off
502502
; GFX10-NEXT: s_endpgm
@@ -590,13 +590,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
590590
; GFX10: ; %bb.0:
591591
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
592592
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
593-
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
594593
; GFX10-NEXT: s_waitcnt vmcnt(0)
595594
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
596595
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
597596
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
598-
; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
597+
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
599598
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo
599+
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
600600
; GFX10-NEXT: v_mov_b32_e32 v0, 0
601601
; GFX10-NEXT: v_mov_b32_e32 v1, 0
602602
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -689,13 +689,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
689689
; GFX10: ; %bb.0:
690690
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
691691
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
692-
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
693692
; GFX10-NEXT: s_waitcnt vmcnt(0)
694693
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
695694
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
696695
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
697-
; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
696+
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
698697
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
698+
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
699699
; GFX10-NEXT: v_mov_b32_e32 v0, 0
700700
; GFX10-NEXT: v_mov_b32_e32 v1, 0
701701
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -788,13 +788,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
788788
; GFX10: ; %bb.0:
789789
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
790790
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
791-
; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
792791
; GFX10-NEXT: s_waitcnt vmcnt(0)
793792
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
794793
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
795794
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
796-
; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
795+
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
797796
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
797+
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
798798
; GFX10-NEXT: v_mov_b32_e32 v0, 0
799799
; GFX10-NEXT: v_mov_b32_e32 v1, 0
800800
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ body: |
100100
; GFX10-NEXT: {{ $}}
101101
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
102102
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
103-
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
103+
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
104104
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
105105
; GFX11-LABEL: name: ashr_s16_s16_vs
106106
; GFX11: liveins: $sgpr0, $vgpr0
@@ -193,7 +193,7 @@ body: |
193193
; GFX10-NEXT: {{ $}}
194194
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
195195
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
196-
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
196+
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
197197
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
198198
; GFX11-LABEL: name: ashr_s16_s16_vv
199199
; GFX11: liveins: $vgpr0, $vgpr1
@@ -238,7 +238,7 @@ body: |
238238
; GFX10-NEXT: {{ $}}
239239
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
240240
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
241-
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
241+
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
242242
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
243243
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ASHRREV_I16_e64_]], implicit $exec
244244
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
@@ -292,7 +292,7 @@ body: |
292292
; GFX10-NEXT: {{ $}}
293293
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
294294
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
295-
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
295+
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
296296
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
297297
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
298298
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec
@@ -442,7 +442,7 @@ body: |
442442
; GFX10-NEXT: {{ $}}
443443
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
444444
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
445-
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
445+
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
446446
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
447447
; GFX11-LABEL: name: ashr_s16_s16_sv
448448
; GFX11: liveins: $sgpr0, $vgpr0

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ body: |
9898
; GFX10-NEXT: {{ $}}
9999
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
100100
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
101-
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
101+
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
102102
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
103103
; GFX11-LABEL: name: lshr_s16_s16_vs
104104
; GFX11: liveins: $sgpr0, $vgpr0
@@ -191,7 +191,7 @@ body: |
191191
; GFX10-NEXT: {{ $}}
192192
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
193193
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
194-
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
194+
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
195195
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
196196
; GFX11-LABEL: name: lshr_s16_s16_vv
197197
; GFX11: liveins: $vgpr0, $vgpr1
@@ -236,7 +236,7 @@ body: |
236236
; GFX10-NEXT: {{ $}}
237237
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
238238
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
239-
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
239+
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
240240
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
241241
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHRREV_B16_e64_]], implicit $exec
242242
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
@@ -290,7 +290,7 @@ body: |
290290
; GFX10-NEXT: {{ $}}
291291
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
292292
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
293-
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
293+
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
294294
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
295295
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
296296
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec
@@ -440,7 +440,7 @@ body: |
440440
; GFX10-NEXT: {{ $}}
441441
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
442442
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
443-
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
443+
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
444444
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
445445
; GFX11-LABEL: name: lshr_s16_s16_sv
446446
; GFX11: liveins: $sgpr0, $vgpr0

0 commit comments

Comments
 (0)