2
2
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
3
3
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
4
4
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
5
+ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250
5
6
6
7
declare i16 @llvm.abs.i16 (i16 , i1 )
7
8
declare i32 @llvm.abs.i32 (i32 , i1 )
@@ -13,11 +14,30 @@ declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
13
14
declare <4 x i32 > @llvm.abs.v4i32 (<4 x i32 >, i1 )
14
15
15
16
; abs_sgpr_i16: calls llvm.abs.i16 on an inreg i16 argument, passing false as
; the intrinsic's second (i1) operand, and returns the result.
; The tahiti (GFX6), fiji (GFX8) and gfx1010 (GFX10) expectations are the same
; two instructions: sign-extend the low 16 bits of s0 with s_sext_i32_i16, then
; take s_abs_i32 of s0. The gfx1250 expectations contain the same two
; instructions with an s_delay_alu between them, so this function is checked
; under the per-target prefixes rather than the shared GFX prefix.
; NOTE(review): the check lines look like update_llc_test_checks.py output -
; confirm, and regenerate rather than hand-edit if so.
define amdgpu_cs i16 @abs_sgpr_i16 (i16 inreg %arg ) {
16
- ; GFX-LABEL: abs_sgpr_i16:
17
- ; GFX: ; %bb.0:
18
- ; GFX-NEXT: s_sext_i32_i16 s0, s0
19
- ; GFX-NEXT: s_abs_i32 s0, s0
20
- ; GFX-NEXT: ; return to shader part epilog
17
+ ; GFX6-LABEL: abs_sgpr_i16:
18
+ ; GFX6: ; %bb.0:
19
+ ; GFX6-NEXT: s_sext_i32_i16 s0, s0
20
+ ; GFX6-NEXT: s_abs_i32 s0, s0
21
+ ; GFX6-NEXT: ; return to shader part epilog
22
+ ;
23
+ ; GFX8-LABEL: abs_sgpr_i16:
24
+ ; GFX8: ; %bb.0:
25
+ ; GFX8-NEXT: s_sext_i32_i16 s0, s0
26
+ ; GFX8-NEXT: s_abs_i32 s0, s0
27
+ ; GFX8-NEXT: ; return to shader part epilog
28
+ ;
29
+ ; GFX10-LABEL: abs_sgpr_i16:
30
+ ; GFX10: ; %bb.0:
31
+ ; GFX10-NEXT: s_sext_i32_i16 s0, s0
32
+ ; GFX10-NEXT: s_abs_i32 s0, s0
33
+ ; GFX10-NEXT: ; return to shader part epilog
34
+ ;
35
+ ; GFX1250-LABEL: abs_sgpr_i16:
36
+ ; GFX1250: ; %bb.0:
37
+ ; GFX1250-NEXT: s_sext_i32_i16 s0, s0
38
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
39
+ ; GFX1250-NEXT: s_abs_i32 s0, s0
40
+ ; GFX1250-NEXT: ; return to shader part epilog
21
41
%res = call i16 @llvm.abs.i16 (i16 %arg , i1 false )
22
42
ret i16 %res
23
43
}
@@ -32,14 +52,42 @@ define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
32
52
}
33
53
34
54
; abs_sgpr_i64: calls llvm.abs.i64 on an inreg i64 argument, passing false as
; the intrinsic's second (i1) operand, and returns the result.
; Every expected sequence operates on s[0:1] and builds a mask in s[2:3]:
; s_ashr_i32 of s1 by 31 into s2, copied to s3 with s_mov_b32. The mask is
; added to s[0:1] and the sum is xor'ed with the mask (s_xor_b64).
; tahiti (GFX6), fiji (GFX8) and gfx1010 (GFX10) expect the add as an
; s_add_u32 / s_addc_u32 pair. gfx1250 expects a single s_add_nc_u64 plus
; s_delay_alu instructions, so this function is checked under the per-target
; prefixes rather than the shared GFX prefix.
; NOTE(review): the check lines look like update_llc_test_checks.py output -
; confirm, and regenerate rather than hand-edit if so.
define amdgpu_cs i64 @abs_sgpr_i64 (i64 inreg %arg ) {
35
- ; GFX-LABEL: abs_sgpr_i64:
36
- ; GFX: ; %bb.0:
37
- ; GFX-NEXT: s_ashr_i32 s2, s1, 31
38
- ; GFX-NEXT: s_add_u32 s0, s0, s2
39
- ; GFX-NEXT: s_mov_b32 s3, s2
40
- ; GFX-NEXT: s_addc_u32 s1, s1, s2
41
- ; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
42
- ; GFX-NEXT: ; return to shader part epilog
55
+ ; GFX6-LABEL: abs_sgpr_i64:
56
+ ; GFX6: ; %bb.0:
57
+ ; GFX6-NEXT: s_ashr_i32 s2, s1, 31
58
+ ; GFX6-NEXT: s_add_u32 s0, s0, s2
59
+ ; GFX6-NEXT: s_mov_b32 s3, s2
60
+ ; GFX6-NEXT: s_addc_u32 s1, s1, s2
61
+ ; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
62
+ ; GFX6-NEXT: ; return to shader part epilog
63
+ ;
64
+ ; GFX8-LABEL: abs_sgpr_i64:
65
+ ; GFX8: ; %bb.0:
66
+ ; GFX8-NEXT: s_ashr_i32 s2, s1, 31
67
+ ; GFX8-NEXT: s_add_u32 s0, s0, s2
68
+ ; GFX8-NEXT: s_mov_b32 s3, s2
69
+ ; GFX8-NEXT: s_addc_u32 s1, s1, s2
70
+ ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
71
+ ; GFX8-NEXT: ; return to shader part epilog
72
+ ;
73
+ ; GFX10-LABEL: abs_sgpr_i64:
74
+ ; GFX10: ; %bb.0:
75
+ ; GFX10-NEXT: s_ashr_i32 s2, s1, 31
76
+ ; GFX10-NEXT: s_add_u32 s0, s0, s2
77
+ ; GFX10-NEXT: s_mov_b32 s3, s2
78
+ ; GFX10-NEXT: s_addc_u32 s1, s1, s2
79
+ ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
80
+ ; GFX10-NEXT: ; return to shader part epilog
81
+ ;
82
+ ; GFX1250-LABEL: abs_sgpr_i64:
83
+ ; GFX1250: ; %bb.0:
84
+ ; GFX1250-NEXT: s_ashr_i32 s2, s1, 31
85
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
86
+ ; GFX1250-NEXT: s_mov_b32 s3, s2
87
+ ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
88
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
89
+ ; GFX1250-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
90
+ ; GFX1250-NEXT: ; return to shader part epilog
43
91
%res = call i64 @llvm.abs.i64 (i64 %arg , i1 false )
44
92
ret i64 %res
45
93
}
@@ -78,6 +126,14 @@ define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
78
126
; GFX10-NEXT: v_max_i16 v0, v0, v1
79
127
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
80
128
; GFX10-NEXT: ; return to shader part epilog
129
+ ;
130
+ ; GFX1250-LABEL: abs_vgpr_i16:
131
+ ; GFX1250: ; %bb.0:
132
+ ; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
133
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
134
+ ; GFX1250-NEXT: v_max_i16 v0, v0, v1
135
+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
136
+ ; GFX1250-NEXT: ; return to shader part epilog
81
137
%res = call i16 @llvm.abs.i16 (i16 %arg , i1 false )
82
138
ret i16 %res
83
139
}
@@ -103,6 +159,14 @@ define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
103
159
; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
104
160
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
105
161
; GFX10-NEXT: ; return to shader part epilog
162
+ ;
163
+ ; GFX1250-LABEL: abs_vgpr_i32:
164
+ ; GFX1250: ; %bb.0:
165
+ ; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
166
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
167
+ ; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
168
+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
169
+ ; GFX1250-NEXT: ; return to shader part epilog
106
170
%res = call i32 @llvm.abs.i32 (i32 %arg , i1 false )
107
171
ret i32 %res
108
172
}
@@ -140,6 +204,20 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
140
204
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
141
205
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
142
206
; GFX10-NEXT: ; return to shader part epilog
207
+ ;
208
+ ; GFX1250-LABEL: abs_vgpr_i64:
209
+ ; GFX1250: ; %bb.0:
210
+ ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
211
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
212
+ ; GFX1250-NEXT: v_mov_b32_e32 v3, v2
213
+ ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
214
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
215
+ ; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
216
+ ; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
217
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
218
+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
219
+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
220
+ ; GFX1250-NEXT: ; return to shader part epilog
143
221
%res = call i64 @llvm.abs.i64 (i64 %arg , i1 false )
144
222
ret i64 %res
145
223
}
@@ -192,6 +270,24 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
192
270
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
193
271
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
194
272
; GFX10-NEXT: ; return to shader part epilog
273
+ ;
274
+ ; GFX1250-LABEL: abs_vgpr_v4i32:
275
+ ; GFX1250: ; %bb.0:
276
+ ; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
277
+ ; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
278
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
279
+ ; GFX1250-NEXT: v_max_i32_e32 v0, v0, v4
280
+ ; GFX1250-NEXT: v_max_i32_e32 v1, v1, v5
281
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
282
+ ; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
283
+ ; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
284
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
285
+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
286
+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
287
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
288
+ ; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
289
+ ; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
290
+ ; GFX1250-NEXT: ; return to shader part epilog
195
291
%res = call <4 x i32 > @llvm.abs.v4i32 (<4 x i32 > %arg , i1 false )
196
292
ret <4 x i32 > %res
197
293
}
@@ -243,6 +339,21 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
243
339
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
244
340
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
245
341
; GFX10-NEXT: ; return to shader part epilog
342
+ ;
343
+ ; GFX1250-LABEL: abs_vgpr_v2i8:
344
+ ; GFX1250: ; %bb.0:
345
+ ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
346
+ ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
347
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
348
+ ; GFX1250-NEXT: v_sub_nc_u16 v2, 0, v0
349
+ ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
350
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
351
+ ; GFX1250-NEXT: v_max_i16 v0, v0, v2
352
+ ; GFX1250-NEXT: v_max_i16 v1, v1, v3
353
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
354
+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
355
+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
356
+ ; GFX1250-NEXT: ; return to shader part epilog
246
357
%res = call <2 x i8 > @llvm.abs.v2i8 (<2 x i8 > %arg , i1 false )
247
358
ret <2 x i8 > %res
248
359
}
@@ -307,6 +418,27 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
307
418
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
308
419
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
309
420
; GFX10-NEXT: ; return to shader part epilog
421
+ ;
422
+ ; GFX1250-LABEL: abs_vgpr_v3i8:
423
+ ; GFX1250: ; %bb.0:
424
+ ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
425
+ ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
426
+ ; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
427
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
428
+ ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v0
429
+ ; GFX1250-NEXT: v_sub_nc_u16 v4, 0, v1
430
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
431
+ ; GFX1250-NEXT: v_sub_nc_u16 v5, 0, v2
432
+ ; GFX1250-NEXT: v_max_i16 v0, v0, v3
433
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
434
+ ; GFX1250-NEXT: v_max_i16 v1, v1, v4
435
+ ; GFX1250-NEXT: v_max_i16 v2, v2, v5
436
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
437
+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
438
+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
439
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
440
+ ; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
441
+ ; GFX1250-NEXT: ; return to shader part epilog
310
442
%res = call <3 x i8 > @llvm.abs.v3i8 (<3 x i8 > %arg , i1 false )
311
443
ret <3 x i8 > %res
312
444
}
@@ -341,6 +473,16 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
341
473
; GFX10-NEXT: s_abs_i32 s0, s0
342
474
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0
343
475
; GFX10-NEXT: ; return to shader part epilog
476
+ ;
477
+ ; GFX1250-LABEL: abs_sgpr_v2i16:
478
+ ; GFX1250: ; %bb.0:
479
+ ; GFX1250-NEXT: s_sext_i32_i16 s1, s0
480
+ ; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
481
+ ; GFX1250-NEXT: s_abs_i32 s1, s1
482
+ ; GFX1250-NEXT: s_abs_i32 s0, s0
483
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
484
+ ; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s1, s0
485
+ ; GFX1250-NEXT: ; return to shader part epilog
344
486
%res = call <2 x i16 > @llvm.abs.v2i16 (<2 x i16 > %arg , i1 false )
345
487
ret <2 x i16 > %res
346
488
}
@@ -375,6 +517,14 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
375
517
; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
376
518
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
377
519
; GFX10-NEXT: ; return to shader part epilog
520
+ ;
521
+ ; GFX1250-LABEL: abs_vgpr_v2i16:
522
+ ; GFX1250: ; %bb.0:
523
+ ; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
524
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
525
+ ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
526
+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
527
+ ; GFX1250-NEXT: ; return to shader part epilog
378
528
%res = call <2 x i16 > @llvm.abs.v2i16 (<2 x i16 > %arg , i1 false )
379
529
ret <2 x i16 > %res
380
530
}
@@ -416,6 +566,17 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
416
566
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
417
567
; GFX10-NEXT: s_abs_i32 s1, s1
418
568
; GFX10-NEXT: ; return to shader part epilog
569
+ ;
570
+ ; GFX1250-LABEL: abs_sgpr_v3i16:
571
+ ; GFX1250: ; %bb.0:
572
+ ; GFX1250-NEXT: s_sext_i32_i16 s2, s0
573
+ ; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
574
+ ; GFX1250-NEXT: s_abs_i32 s2, s2
575
+ ; GFX1250-NEXT: s_abs_i32 s0, s0
576
+ ; GFX1250-NEXT: s_sext_i32_i16 s1, s1
577
+ ; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s2, s0
578
+ ; GFX1250-NEXT: s_abs_i32 s1, s1
579
+ ; GFX1250-NEXT: ; return to shader part epilog
419
580
%res = call <3 x i16 > @llvm.abs.v3i16 (<3 x i16 > %arg , i1 false )
420
581
ret <3 x i16 > %res
421
582
}
@@ -460,6 +621,18 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
460
621
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
461
622
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
462
623
; GFX10-NEXT: ; return to shader part epilog
624
+ ;
625
+ ; GFX1250-LABEL: abs_vgpr_v3i16:
626
+ ; GFX1250: ; %bb.0:
627
+ ; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
628
+ ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
629
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
630
+ ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
631
+ ; GFX1250-NEXT: v_max_i16 v1, v1, v3
632
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
633
+ ; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
634
+ ; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
635
+ ; GFX1250-NEXT: ; return to shader part epilog
463
636
%res = call <3 x i16 > @llvm.abs.v3i16 (<3 x i16 > %arg , i1 false )
464
637
ret <3 x i16 > %res
465
638
}
0 commit comments