22; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
33; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
44; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
5+ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250
56
67declare i16 @llvm.abs.i16 (i16 , i1 )
78declare i32 @llvm.abs.i32 (i32 , i1 )
@@ -13,11 +14,30 @@ declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
1314declare <4 x i32 > @llvm.abs.v4i32 (<4 x i32 >, i1 )
1415
; abs_sgpr_i16 feeds an i16 `inreg` argument to @llvm.abs.i16 and returns the
; result. The second operand is the constant i1 false (the intrinsic's
; is_int_min_poison operand, per the LLVM LangRef).
;
; In this diff view the five checks under the shared GFX prefix (lines marked
; `-`) are replaced by one block per subtarget prefix (lines marked `+`).
; The GFX6, GFX8 and GFX10 blocks repeat the old sequence unchanged
; (s_sext_i32_i16 followed by s_abs_i32). The GFX1250 block additionally
; expects an s_delay_alu between those two instructions, so one shared
; sequence would no longer match every llc invocation.
1516define amdgpu_cs i16 @abs_sgpr_i16 (i16 inreg %arg ) {
16- ; GFX-LABEL: abs_sgpr_i16:
17- ; GFX: ; %bb.0:
18- ; GFX-NEXT: s_sext_i32_i16 s0, s0
19- ; GFX-NEXT: s_abs_i32 s0, s0
20- ; GFX-NEXT: ; return to shader part epilog
17+ ; GFX6-LABEL: abs_sgpr_i16:
18+ ; GFX6: ; %bb.0:
19+ ; GFX6-NEXT: s_sext_i32_i16 s0, s0
20+ ; GFX6-NEXT: s_abs_i32 s0, s0
21+ ; GFX6-NEXT: ; return to shader part epilog
22+ ;
23+ ; GFX8-LABEL: abs_sgpr_i16:
24+ ; GFX8: ; %bb.0:
25+ ; GFX8-NEXT: s_sext_i32_i16 s0, s0
26+ ; GFX8-NEXT: s_abs_i32 s0, s0
27+ ; GFX8-NEXT: ; return to shader part epilog
28+ ;
29+ ; GFX10-LABEL: abs_sgpr_i16:
30+ ; GFX10: ; %bb.0:
31+ ; GFX10-NEXT: s_sext_i32_i16 s0, s0
32+ ; GFX10-NEXT: s_abs_i32 s0, s0
33+ ; GFX10-NEXT: ; return to shader part epilog
34+ ;
35+ ; GFX1250-LABEL: abs_sgpr_i16:
36+ ; GFX1250: ; %bb.0:
37+ ; GFX1250-NEXT: s_sext_i32_i16 s0, s0
38+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
39+ ; GFX1250-NEXT: s_abs_i32 s0, s0
40+ ; GFX1250-NEXT: ; return to shader part epilog
; IR under test (context lines, unchanged by the diff).
2141 %res = call i16 @llvm.abs.i16 (i16 %arg , i1 false )
2242 ret i16 %res
2343}
@@ -32,14 +52,42 @@ define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
3252}
3353
; abs_sgpr_i64 feeds an i64 `inreg` argument to @llvm.abs.i64 (second operand
; i1 false) and returns the result.
;
; The removed shared-prefix checks (`-` lines) and the added GFX6, GFX8 and
; GFX10 blocks all expect the same sequence: s2 = s1 arithmetically shifted
; right by 31, s3 = s2, a 32-bit s_add_u32 / s_addc_u32 pair that adds s2 to
; s0 and s1, then s_xor_b64 of s[0:1] with s[2:3].
; The added GFX1250 block expects a single s_add_nc_u64 of s[0:1] and s[2:3]
; in place of the add/addc pair, plus two s_delay_alu instructions, which is
; the difference that separates it from the other three blocks.
3454define amdgpu_cs i64 @abs_sgpr_i64 (i64 inreg %arg ) {
35- ; GFX-LABEL: abs_sgpr_i64:
36- ; GFX: ; %bb.0:
37- ; GFX-NEXT: s_ashr_i32 s2, s1, 31
38- ; GFX-NEXT: s_add_u32 s0, s0, s2
39- ; GFX-NEXT: s_mov_b32 s3, s2
40- ; GFX-NEXT: s_addc_u32 s1, s1, s2
41- ; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
42- ; GFX-NEXT: ; return to shader part epilog
55+ ; GFX6-LABEL: abs_sgpr_i64:
56+ ; GFX6: ; %bb.0:
57+ ; GFX6-NEXT: s_ashr_i32 s2, s1, 31
58+ ; GFX6-NEXT: s_add_u32 s0, s0, s2
59+ ; GFX6-NEXT: s_mov_b32 s3, s2
60+ ; GFX6-NEXT: s_addc_u32 s1, s1, s2
61+ ; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
62+ ; GFX6-NEXT: ; return to shader part epilog
63+ ;
64+ ; GFX8-LABEL: abs_sgpr_i64:
65+ ; GFX8: ; %bb.0:
66+ ; GFX8-NEXT: s_ashr_i32 s2, s1, 31
67+ ; GFX8-NEXT: s_add_u32 s0, s0, s2
68+ ; GFX8-NEXT: s_mov_b32 s3, s2
69+ ; GFX8-NEXT: s_addc_u32 s1, s1, s2
70+ ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
71+ ; GFX8-NEXT: ; return to shader part epilog
72+ ;
73+ ; GFX10-LABEL: abs_sgpr_i64:
74+ ; GFX10: ; %bb.0:
75+ ; GFX10-NEXT: s_ashr_i32 s2, s1, 31
76+ ; GFX10-NEXT: s_add_u32 s0, s0, s2
77+ ; GFX10-NEXT: s_mov_b32 s3, s2
78+ ; GFX10-NEXT: s_addc_u32 s1, s1, s2
79+ ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
80+ ; GFX10-NEXT: ; return to shader part epilog
81+ ;
82+ ; GFX1250-LABEL: abs_sgpr_i64:
83+ ; GFX1250: ; %bb.0:
84+ ; GFX1250-NEXT: s_ashr_i32 s2, s1, 31
85+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
86+ ; GFX1250-NEXT: s_mov_b32 s3, s2
87+ ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
88+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
89+ ; GFX1250-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
90+ ; GFX1250-NEXT: ; return to shader part epilog
; IR under test (context lines, unchanged by the diff).
4391 %res = call i64 @llvm.abs.i64 (i64 %arg , i1 false )
4492 ret i64 %res
4593}
@@ -78,6 +126,14 @@ define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
78126; GFX10-NEXT: v_max_i16 v0, v0, v1
79127; GFX10-NEXT: v_readfirstlane_b32 s0, v0
80128; GFX10-NEXT: ; return to shader part epilog
129+ ;
130+ ; GFX1250-LABEL: abs_vgpr_i16:
131+ ; GFX1250: ; %bb.0:
132+ ; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
133+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
134+ ; GFX1250-NEXT: v_max_i16 v0, v0, v1
135+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
136+ ; GFX1250-NEXT: ; return to shader part epilog
81137 %res = call i16 @llvm.abs.i16 (i16 %arg , i1 false )
82138 ret i16 %res
83139}
@@ -103,6 +159,14 @@ define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
103159; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
104160; GFX10-NEXT: v_readfirstlane_b32 s0, v0
105161; GFX10-NEXT: ; return to shader part epilog
162+ ;
163+ ; GFX1250-LABEL: abs_vgpr_i32:
164+ ; GFX1250: ; %bb.0:
165+ ; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
166+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
167+ ; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
168+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
169+ ; GFX1250-NEXT: ; return to shader part epilog
106170 %res = call i32 @llvm.abs.i32 (i32 %arg , i1 false )
107171 ret i32 %res
108172}
@@ -140,6 +204,20 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
140204; GFX10-NEXT: v_readfirstlane_b32 s0, v0
141205; GFX10-NEXT: v_readfirstlane_b32 s1, v1
142206; GFX10-NEXT: ; return to shader part epilog
207+ ;
208+ ; GFX1250-LABEL: abs_vgpr_i64:
209+ ; GFX1250: ; %bb.0:
210+ ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
211+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
212+ ; GFX1250-NEXT: v_mov_b32_e32 v3, v2
213+ ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
214+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
215+ ; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
216+ ; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
217+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
218+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
219+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
220+ ; GFX1250-NEXT: ; return to shader part epilog
143221 %res = call i64 @llvm.abs.i64 (i64 %arg , i1 false )
144222 ret i64 %res
145223}
@@ -192,6 +270,24 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
192270; GFX10-NEXT: v_readfirstlane_b32 s2, v2
193271; GFX10-NEXT: v_readfirstlane_b32 s3, v3
194272; GFX10-NEXT: ; return to shader part epilog
273+ ;
274+ ; GFX1250-LABEL: abs_vgpr_v4i32:
275+ ; GFX1250: ; %bb.0:
276+ ; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
277+ ; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
278+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
279+ ; GFX1250-NEXT: v_max_i32_e32 v0, v0, v4
280+ ; GFX1250-NEXT: v_max_i32_e32 v1, v1, v5
281+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
282+ ; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
283+ ; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
284+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
285+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
286+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
287+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
288+ ; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
289+ ; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
290+ ; GFX1250-NEXT: ; return to shader part epilog
195291 %res = call <4 x i32 > @llvm.abs.v4i32 (<4 x i32 > %arg , i1 false )
196292 ret <4 x i32 > %res
197293}
@@ -243,6 +339,21 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
243339; GFX10-NEXT: v_readfirstlane_b32 s0, v0
244340; GFX10-NEXT: v_readfirstlane_b32 s1, v1
245341; GFX10-NEXT: ; return to shader part epilog
342+ ;
343+ ; GFX1250-LABEL: abs_vgpr_v2i8:
344+ ; GFX1250: ; %bb.0:
345+ ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
346+ ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
347+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
348+ ; GFX1250-NEXT: v_sub_nc_u16 v2, 0, v0
349+ ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
350+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
351+ ; GFX1250-NEXT: v_max_i16 v0, v0, v2
352+ ; GFX1250-NEXT: v_max_i16 v1, v1, v3
353+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
354+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
355+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
356+ ; GFX1250-NEXT: ; return to shader part epilog
246357 %res = call <2 x i8 > @llvm.abs.v2i8 (<2 x i8 > %arg , i1 false )
247358 ret <2 x i8 > %res
248359}
@@ -307,6 +418,27 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
307418; GFX10-NEXT: v_readfirstlane_b32 s1, v1
308419; GFX10-NEXT: v_readfirstlane_b32 s2, v2
309420; GFX10-NEXT: ; return to shader part epilog
421+ ;
422+ ; GFX1250-LABEL: abs_vgpr_v3i8:
423+ ; GFX1250: ; %bb.0:
424+ ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
425+ ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
426+ ; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
427+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
428+ ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v0
429+ ; GFX1250-NEXT: v_sub_nc_u16 v4, 0, v1
430+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
431+ ; GFX1250-NEXT: v_sub_nc_u16 v5, 0, v2
432+ ; GFX1250-NEXT: v_max_i16 v0, v0, v3
433+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
434+ ; GFX1250-NEXT: v_max_i16 v1, v1, v4
435+ ; GFX1250-NEXT: v_max_i16 v2, v2, v5
436+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
437+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
438+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
439+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
440+ ; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
441+ ; GFX1250-NEXT: ; return to shader part epilog
310442 %res = call <3 x i8 > @llvm.abs.v3i8 (<3 x i8 > %arg , i1 false )
311443 ret <3 x i8 > %res
312444}
@@ -341,6 +473,16 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
341473; GFX10-NEXT: s_abs_i32 s0, s0
342474; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0
343475; GFX10-NEXT: ; return to shader part epilog
476+ ;
477+ ; GFX1250-LABEL: abs_sgpr_v2i16:
478+ ; GFX1250: ; %bb.0:
479+ ; GFX1250-NEXT: s_sext_i32_i16 s1, s0
480+ ; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
481+ ; GFX1250-NEXT: s_abs_i32 s1, s1
482+ ; GFX1250-NEXT: s_abs_i32 s0, s0
483+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
484+ ; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s1, s0
485+ ; GFX1250-NEXT: ; return to shader part epilog
344486 %res = call <2 x i16 > @llvm.abs.v2i16 (<2 x i16 > %arg , i1 false )
345487 ret <2 x i16 > %res
346488}
@@ -375,6 +517,14 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
375517; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
376518; GFX10-NEXT: v_readfirstlane_b32 s0, v0
377519; GFX10-NEXT: ; return to shader part epilog
520+ ;
521+ ; GFX1250-LABEL: abs_vgpr_v2i16:
522+ ; GFX1250: ; %bb.0:
523+ ; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
524+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
525+ ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
526+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
527+ ; GFX1250-NEXT: ; return to shader part epilog
378528 %res = call <2 x i16 > @llvm.abs.v2i16 (<2 x i16 > %arg , i1 false )
379529 ret <2 x i16 > %res
380530}
@@ -416,6 +566,17 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
416566; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
417567; GFX10-NEXT: s_abs_i32 s1, s1
418568; GFX10-NEXT: ; return to shader part epilog
569+ ;
570+ ; GFX1250-LABEL: abs_sgpr_v3i16:
571+ ; GFX1250: ; %bb.0:
572+ ; GFX1250-NEXT: s_sext_i32_i16 s2, s0
573+ ; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
574+ ; GFX1250-NEXT: s_abs_i32 s2, s2
575+ ; GFX1250-NEXT: s_abs_i32 s0, s0
576+ ; GFX1250-NEXT: s_sext_i32_i16 s1, s1
577+ ; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s2, s0
578+ ; GFX1250-NEXT: s_abs_i32 s1, s1
579+ ; GFX1250-NEXT: ; return to shader part epilog
419580 %res = call <3 x i16 > @llvm.abs.v3i16 (<3 x i16 > %arg , i1 false )
420581 ret <3 x i16 > %res
421582}
@@ -460,6 +621,18 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
460621; GFX10-NEXT: v_readfirstlane_b32 s0, v0
461622; GFX10-NEXT: v_readfirstlane_b32 s1, v1
462623; GFX10-NEXT: ; return to shader part epilog
624+ ;
625+ ; GFX1250-LABEL: abs_vgpr_v3i16:
626+ ; GFX1250: ; %bb.0:
627+ ; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
628+ ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
629+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
630+ ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
631+ ; GFX1250-NEXT: v_max_i16 v1, v1, v3
632+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
633+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
634+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
635+ ; GFX1250-NEXT: ; return to shader part epilog
463636 %res = call <3 x i16 > @llvm.abs.v3i16 (<3 x i16 > %arg , i1 false )
464637 ret <3 x i16 > %res
465638}
0 commit comments