Skip to content

Commit ef923f1

Browse files
authored
[AMDGPU] Change patterns for v_[pk_]add_{min|max} (#164881)
The intermediate result of these instructions is in fact a saturating add, regardless of the clamp bit, so the selection patterns now match saddsat/uaddsat instead of a plain add.
1 parent 6836261 commit ef923f1

File tree

4 files changed

+176
-209
lines changed

4 files changed

+176
-209
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -976,10 +976,10 @@ def : GCNPat <
976976
} // End SubtargetPredicate = HasLshlAddU64Inst
977977

978978
let SubtargetPredicate = HasAddMinMaxInsts in {
979-
def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
980-
def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
981-
def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
982-
def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
979+
def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>;
980+
def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>;
981+
def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>;
982+
def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>;
983983
}
984984

985985
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -464,10 +464,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
464464
>;
465465

466466
let SubtargetPredicate = HasPkAddMinMaxInsts in {
467-
def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
468-
def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
469-
def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
470-
def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
467+
def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>;
468+
def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>;
469+
def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>;
470+
def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>;
471471
}
472472

473473
let SubtargetPredicate = HasPkMinMax3Insts in {

llvm/test/CodeGen/AMDGPU/add-max.ll

Lines changed: 55 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
77
; GCN: ; %bb.0:
88
; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
99
; GCN-NEXT: ; return to shader part epilog
10-
%add = add i32 %a, %b
10+
%add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
1111
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
1212
%ret = bitcast i32 %max to float
1313
ret float %ret
@@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
1818
; GCN: ; %bb.0:
1919
; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
2020
; GCN-NEXT: ; return to shader part epilog
21-
%add = add i32 %a, %b
21+
%add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
2222
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
2323
%ret = bitcast i32 %max to float
2424
ret float %ret
2525
}
2626

2727
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
28-
; SDAG-LABEL: add_max_u32_ssv:
29-
; SDAG: ; %bb.0:
30-
; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0
31-
; SDAG-NEXT: ; return to shader part epilog
32-
;
33-
; GISEL-LABEL: add_max_u32_ssv:
34-
; GISEL: ; %bb.0:
35-
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
36-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
37-
; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
38-
; GISEL-NEXT: ; return to shader part epilog
39-
%add = add i32 %a, %b
28+
; GCN-LABEL: add_max_u32_ssv:
29+
; GCN: ; %bb.0:
30+
; GCN-NEXT: v_add_max_u32 v0, s0, s1, v0
31+
; GCN-NEXT: ; return to shader part epilog
32+
%add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
4033
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
4134
%ret = bitcast i32 %max to float
4235
ret float %ret
4336
}
4437

4538
define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
46-
; GCN-LABEL: add_max_u32_sss:
47-
; GCN: ; %bb.0:
48-
; GCN-NEXT: s_add_co_i32 s0, s0, s1
49-
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
50-
; GCN-NEXT: s_max_u32 s0, s0, s2
51-
; GCN-NEXT: v_mov_b32_e32 v0, s0
52-
; GCN-NEXT: ; return to shader part epilog
53-
%add = add i32 %a, %b
39+
; SDAG-LABEL: add_max_u32_sss:
40+
; SDAG: ; %bb.0:
41+
; SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
42+
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
43+
; SDAG-NEXT: v_max_u32_e32 v0, s2, v0
44+
; SDAG-NEXT: ; return to shader part epilog
45+
;
46+
; GISEL-LABEL: add_max_u32_sss:
47+
; GISEL: ; %bb.0:
48+
; GISEL-NEXT: v_mov_b32_e32 v0, s2
49+
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
50+
; GISEL-NEXT: v_add_max_u32 v0, s0, s1, v0
51+
; GISEL-NEXT: ; return to shader part epilog
52+
%add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
5453
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
5554
%ret = bitcast i32 %max to float
5655
ret float %ret
@@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
6160
; GCN: ; %bb.0:
6261
; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4
6362
; GCN-NEXT: ; return to shader part epilog
64-
%add = add i32 %a, %b
63+
%add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
6564
%max = call i32 @llvm.umax.i32(i32 %add, i32 4)
6665
%ret = bitcast i32 %max to float
6766
ret float %ret
@@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
7271
; GCN: ; %bb.0:
7372
; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64
7473
; GCN-NEXT: ; return to shader part epilog
75-
%add = add i32 %a, %b
74+
%add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
7675
%max = call i32 @llvm.umax.i32(i32 %add, i32 100)
7776
%ret = bitcast i32 %max to float
7877
ret float %ret
7978
}
8079

81-
define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
82-
; SDAG-LABEL: add_max_u32_slv:
83-
; SDAG: ; %bb.0:
84-
; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0
85-
; SDAG-NEXT: ; return to shader part epilog
86-
;
87-
; GISEL-LABEL: add_max_u32_slv:
88-
; GISEL: ; %bb.0:
89-
; GISEL-NEXT: s_addk_co_i32 s0, 0x64
90-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
91-
; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
92-
; GISEL-NEXT: ; return to shader part epilog
93-
%add = add i32 %a, 100
94-
%max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
80+
define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
81+
; GCN-LABEL: add_max_u32_slv:
82+
; GCN: ; %bb.0:
83+
; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
84+
; GCN-NEXT: ; return to shader part epilog
85+
%add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
86+
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
9587
%ret = bitcast i32 %max to float
9688
ret float %ret
9789
}
@@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
10193
; GCN: ; %bb.0:
10294
; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
10395
; GCN-NEXT: ; return to shader part epilog
104-
%add = add i32 %a, %b
96+
%add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
10597
%max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
10698
%ret = bitcast i32 %max to float
10799
ret float %ret
@@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
112104
; GCN: ; %bb.0:
113105
; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
114106
; GCN-NEXT: ; return to shader part epilog
115-
%add = add i32 %a, %b
107+
%add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
116108
%max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
117109
%ret = bitcast i32 %max to float
118110
ret float %ret
@@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
123115
; GCN: ; %bb.0:
124116
; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
125117
; GCN-NEXT: ; return to shader part epilog
126-
%add = add i32 %a, %b
118+
%add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
127119
%max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
128120
%ret = bitcast i32 %max to float
129121
ret float %ret
@@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
134126
; GCN: ; %bb.0:
135127
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
136128
; GCN-NEXT: ; return to shader part epilog
137-
%add = add <2 x i16> %a, %b
129+
%add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
138130
%max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
139131
%ret = bitcast <2 x i16> %max to float
140132
ret float %ret
@@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x
145137
; GCN: ; %bb.0:
146138
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
147139
; GCN-NEXT: ; return to shader part epilog
148-
%add = add <2 x i16> %a, %b
140+
%add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
149141
%max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
150142
%ret = bitcast <2 x i16> %max to float
151143
ret float %ret
152144
}
153145

154146
define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
155-
; SDAG-LABEL: add_max_v2u16_ssv:
156-
; SDAG: ; %bb.0:
157-
; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
158-
; SDAG-NEXT: ; return to shader part epilog
159-
;
160-
; GISEL-LABEL: add_max_v2u16_ssv:
161-
; GISEL: ; %bb.0:
162-
; GISEL-NEXT: s_lshr_b32 s2, s0, 16
163-
; GISEL-NEXT: s_lshr_b32 s3, s1, 16
164-
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
165-
; GISEL-NEXT: s_add_co_i32 s2, s2, s3
166-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
167-
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
168-
; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
169-
; GISEL-NEXT: ; return to shader part epilog
170-
%add = add <2 x i16> %a, %b
147+
; GCN-LABEL: add_max_v2u16_ssv:
148+
; GCN: ; %bb.0:
149+
; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
150+
; GCN-NEXT: ; return to shader part epilog
151+
%add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
171152
%max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
172153
%ret = bitcast <2 x i16> %max to float
173154
ret float %ret
@@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b
176157
define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
177158
; SDAG-LABEL: add_max_v2u16_sss:
178159
; SDAG: ; %bb.0:
179-
; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
160+
; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 clamp
180161
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
181162
; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
182163
; SDAG-NEXT: ; return to shader part epilog
183164
;
184165
; GISEL-LABEL: add_max_v2u16_sss:
185166
; GISEL: ; %bb.0:
186-
; GISEL-NEXT: s_lshr_b32 s3, s0, 16
187-
; GISEL-NEXT: s_lshr_b32 s4, s1, 16
188-
; GISEL-NEXT: s_add_co_i32 s0, s0, s1
189-
; GISEL-NEXT: s_add_co_i32 s3, s3, s4
190-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
191-
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
192-
; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
193-
; GISEL-NEXT: s_lshr_b32 s1, s0, 16
194-
; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
195-
; GISEL-NEXT: s_lshr_b32 s2, s2, 16
196-
; GISEL-NEXT: s_max_u32 s0, s0, s3
197-
; GISEL-NEXT: s_max_u32 s1, s1, s2
198-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
199-
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
200-
; GISEL-NEXT: v_mov_b32_e32 v0, s0
167+
; GISEL-NEXT: v_mov_b32_e32 v0, s2
168+
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
169+
; GISEL-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
201170
; GISEL-NEXT: ; return to shader part epilog
202-
%add = add <2 x i16> %a, %b
171+
%add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
203172
%max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
204173
%ret = bitcast <2 x i16> %max to float
205174
ret float %ret
@@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
210179
; GCN: ; %bb.0:
211180
; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
212181
; GCN-NEXT: ; return to shader part epilog
213-
%add = add <2 x i16> %a, %b
182+
%add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
214183
%max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
215184
%ret = bitcast <2 x i16> %max to float
216185
ret float %ret
@@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
221190
; GCN: ; %bb.0:
222191
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
223192
; GCN-NEXT: ; return to shader part epilog
224-
%add = add <2 x i16> %a, %b
193+
%add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
225194
%max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
226195
%ret = bitcast <2 x i16> %max to float
227196
ret float %ret
228197
}
229198

230199
define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
231-
; SDAG-LABEL: add_max_v2u16_slv:
232-
; SDAG: ; %bb.0:
233-
; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
234-
; SDAG-NEXT: ; return to shader part epilog
235-
;
236-
; GISEL-LABEL: add_max_v2u16_slv:
237-
; GISEL: ; %bb.0:
238-
; GISEL-NEXT: s_lshr_b32 s1, s0, 16
239-
; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
240-
; GISEL-NEXT: s_addk_co_i32 s1, 0x64
241-
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
242-
; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
243-
; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
244-
; GISEL-NEXT: ; return to shader part epilog
245-
%add = add <2 x i16> %a, <i16 100, i16 100>
200+
; GCN-LABEL: add_max_v2u16_slv:
201+
; GCN: ; %bb.0:
202+
; GCN-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
203+
; GCN-NEXT: ; return to shader part epilog
204+
%add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> <i16 100, i16 100>)
246205
%max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %b)
247206
%ret = bitcast <2 x i16> %max to float
248207
ret float %ret
@@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
253212
; GCN: ; %bb.0:
254213
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
255214
; GCN-NEXT: ; return to shader part epilog
256-
%add = add <2 x i16> %a, %b
215+
%add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
257216
%max = call <2 x i16> @llvm.smax.v216(<2 x i16> %add, <2 x i16> %c)
258217
%ret = bitcast <2 x i16> %max to float
259218
ret float %ret
@@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
264223
; GCN: ; %bb.0:
265224
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
266225
; GCN-NEXT: ; return to shader part epilog
267-
%add = add <2 x i16> %a, %b
226+
%add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
268227
%max = call <2 x i16> @llvm.umin.v216(<2 x i16> %add, <2 x i16> %c)
269228
%ret = bitcast <2 x i16> %max to float
270229
ret float %ret
@@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
275234
; GCN: ; %bb.0:
276235
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
277236
; GCN-NEXT: ; return to shader part epilog
278-
%add = add <2 x i16> %a, %b
237+
%add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
279238
%max = call <2 x i16> @llvm.smin.v216(<2 x i16> %add, <2 x i16> %c)
280239
%ret = bitcast <2 x i16> %max to float
281240
ret float %ret

0 commit comments

Comments
 (0)