Skip to content

Commit 24be0ba

Browse files
authored
DAG: Fix assert on nofpclass call with aggregate return (#167725)
1 parent 58ac95d commit 24be0ba

File tree

2 files changed

+163
-2
lines changed

2 files changed

+163
-2
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10734,8 +10734,22 @@ SDValue SelectionDAGBuilder::lowerNoFPClassToAssertNoFPClass(
1073410734
if (Classes == fcNone)
1073510735
return Op;
1073610736

10737-
return DAG.getNode(ISD::AssertNoFPClass, SDLoc(Op), Op.getValueType(), Op,
10738-
DAG.getTargetConstant(Classes, SDLoc(), MVT::i32));
10737+
SDLoc SL = getCurSDLoc();
10738+
SDValue TestConst = DAG.getTargetConstant(Classes, SDLoc(), MVT::i32);
10739+
10740+
if (Op.getOpcode() != ISD::MERGE_VALUES) {
10741+
return DAG.getNode(ISD::AssertNoFPClass, SL, Op.getValueType(), Op,
10742+
TestConst);
10743+
}
10744+
10745+
SmallVector<SDValue, 8> Ops(Op.getNumOperands());
10746+
for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
10747+
SDValue MergeOp = Op.getOperand(I);
10748+
Ops[I] = DAG.getNode(ISD::AssertNoFPClass, SL, MergeOp.getValueType(),
10749+
MergeOp, TestConst);
10750+
}
10751+
10752+
return DAG.getMergeValues(Ops, SL);
1073910753
}
1074010754

1074110755
/// Populate a CallLoweringInfo (into \p CLI) based on the properties of

llvm/test/CodeGen/AMDGPU/nofpclass-call.ll

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,150 @@ define <2 x half> @call_nofpclass_intrinsic_v2f16(float %x, float %y, float %z,
189189
%min = select nsz <2 x i1> %lt, <2 x half> %call0, <2 x half> %call1
190190
ret <2 x half> %min
191191
}
192+
193+
; The function's own { double, double } return is marked nofpclass(nan inf) and
; the body is a single self-call whose aggregate result is returned unchanged.
; The call site itself carries no nofpclass attribute; the attribute is only
; on the callee's definition. The CHECK lines below are the expected
; AMDGPU assembly and must not be hand-edited.
define nofpclass(nan inf) { double, double } @aggregate() {
194+
; CHECK-LABEL: aggregate:
195+
; CHECK: ; %bb.0: ; %entry
196+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197+
; CHECK-NEXT: s_mov_b32 s16, s33
198+
; CHECK-NEXT: s_mov_b32 s33, s32
199+
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
200+
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
201+
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
202+
; CHECK-NEXT: s_addk_i32 s32, 0x400
203+
; CHECK-NEXT: v_writelane_b32 v40, s16, 2
204+
; CHECK-NEXT: s_getpc_b64 s[16:17]
205+
; CHECK-NEXT: s_add_u32 s16, s16, aggregate@gotpcrel32@lo+4
206+
; CHECK-NEXT: s_addc_u32 s17, s17, aggregate@gotpcrel32@hi+12
207+
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
208+
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
209+
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
210+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
211+
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
212+
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
213+
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
214+
; CHECK-NEXT: s_mov_b32 s32, s33
215+
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
216+
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
217+
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
218+
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
219+
; CHECK-NEXT: s_mov_b32 s33, s4
220+
; CHECK-NEXT: s_waitcnt vmcnt(0)
221+
; CHECK-NEXT: s_setpc_b64 s[30:31]
222+
entry:
223+
%call.i.i = call { double, double } @aggregate()
224+
ret { double, double } %call.i.i
225+
}
226+
227+
; External callee used by @aggregate_use: returns a { float, float } aggregate
; whose return is marked nofpclass(nan inf).
declare hidden nofpclass(nan inf) { float, float } @aggregate_f32()
228+
229+
; Calls @aggregate_f32 with a call-site nofpclass(nan inf) on the aggregate
; result, extracts both float members, takes llvm.minnum of each against %z,
; and returns the two minima repacked into a { float, float }.
; NOTE(review): in the expected output only one v_max_f32 appears (on v40)
; ahead of the two v_min_f32; presumably v40 holds %z and the call results need
; no such step because of nofpclass(nan) -- confirm against the AMDGPU lowering.
define { float, float } @aggregate_use(float %z) {
230+
; CHECK-LABEL: aggregate_use:
231+
; CHECK: ; %bb.0:
232+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
233+
; CHECK-NEXT: s_mov_b32 s16, s33
234+
; CHECK-NEXT: s_mov_b32 s33, s32
235+
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
236+
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
237+
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
238+
; CHECK-NEXT: v_writelane_b32 v41, s16, 2
239+
; CHECK-NEXT: s_addk_i32 s32, 0x400
240+
; CHECK-NEXT: v_writelane_b32 v41, s30, 0
241+
; CHECK-NEXT: s_getpc_b64 s[16:17]
242+
; CHECK-NEXT: s_add_u32 s16, s16, aggregate_f32@rel32@lo+4
243+
; CHECK-NEXT: s_addc_u32 s17, s17, aggregate_f32@rel32@hi+12
244+
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
245+
; CHECK-NEXT: v_writelane_b32 v41, s31, 1
246+
; CHECK-NEXT: v_mov_b32_e32 v40, v0
247+
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
248+
; CHECK-NEXT: v_max_f32_e32 v2, v40, v40
249+
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
250+
; CHECK-NEXT: v_min_f32_e32 v0, v0, v2
251+
; CHECK-NEXT: v_min_f32_e32 v1, v1, v2
252+
; CHECK-NEXT: v_readlane_b32 s31, v41, 1
253+
; CHECK-NEXT: v_readlane_b32 s30, v41, 0
254+
; CHECK-NEXT: s_mov_b32 s32, s33
255+
; CHECK-NEXT: v_readlane_b32 s4, v41, 2
256+
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
257+
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
258+
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
259+
; CHECK-NEXT: s_mov_b32 s33, s4
260+
; CHECK-NEXT: s_waitcnt vmcnt(0)
261+
; CHECK-NEXT: s_setpc_b64 s[30:31]
262+
%call = call nofpclass(nan inf) { float, float } @aggregate_f32()
263+
%i = extractvalue { float, float } %call, 0
264+
%i1 = extractvalue { float, float } %call, 1
265+
%min0 = call float @llvm.minnum.f32(float %i, float %z)
266+
%min1 = call float @llvm.minnum.f32(float %i1, float %z)
267+
%insert.0 = insertvalue { float, float } poison, float %min0, 0
268+
%insert.1 = insertvalue { float, float } %insert.0, float %min1, 1
269+
ret { float, float } %insert.1
270+
}
271+
272+
; Internal helper callee: performs a volatile load of a <5 x double> from the
; addrspace(1) pointer %ptr and returns it. It is the call target of
; @call_nofpclass_funcs_v5f64_non_mvt_vector.
define internal <5 x double> @func_v5f64(ptr addrspace(1) %ptr) {
273+
; CHECK-LABEL: func_v5f64:
274+
; CHECK: ; %bb.0:
275+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276+
; CHECK-NEXT: v_mov_b32_e32 v11, v1
277+
; CHECK-NEXT: v_mov_b32_e32 v10, v0
278+
; CHECK-NEXT: global_load_dwordx4 v[0:3], v[10:11], off glc
279+
; CHECK-NEXT: s_waitcnt vmcnt(0)
280+
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[10:11], off offset:16 glc
281+
; CHECK-NEXT: s_waitcnt vmcnt(0)
282+
; CHECK-NEXT: global_load_dwordx2 v[8:9], v[10:11], off offset:32 glc
283+
; CHECK-NEXT: s_waitcnt vmcnt(0)
284+
; CHECK-NEXT: s_setpc_b64 s[30:31]
285+
%ld = load volatile <5 x double>, ptr addrspace(1) %ptr
286+
ret <5 x double> %ld
287+
}
288+
289+
; Calls @func_v5f64 twice with the same %ptr, each call site marked
; nofpclass(nan), then returns llvm.minnum of the two <5 x double> results.
; NOTE(review): the name suggests <5 x double> was chosen because it is not a
; simple MVT vector type, so the call result is split into several values --
; confirm against the type legalization rules. The expected output shows five
; scalar v_min_f64 instructions, one per element.
define <5 x double> @call_nofpclass_funcs_v5f64_non_mvt_vector(ptr addrspace(1) %ptr) {
290+
; CHECK-LABEL: call_nofpclass_funcs_v5f64_non_mvt_vector:
291+
; CHECK: ; %bb.0:
292+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293+
; CHECK-NEXT: s_mov_b32 s18, s33
294+
; CHECK-NEXT: s_mov_b32 s33, s32
295+
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
296+
; CHECK-NEXT: buffer_store_dword v24, off, s[0:3], s33 ; 4-byte Folded Spill
297+
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
298+
; CHECK-NEXT: s_addk_i32 s32, 0x400
299+
; CHECK-NEXT: v_writelane_b32 v24, s30, 0
300+
; CHECK-NEXT: s_getpc_b64 s[16:17]
301+
; CHECK-NEXT: s_add_u32 s16, s16, func_v5f64@rel32@lo+4
302+
; CHECK-NEXT: s_addc_u32 s17, s17, func_v5f64@rel32@hi+12
303+
; CHECK-NEXT: v_writelane_b32 v24, s31, 1
304+
; CHECK-NEXT: v_mov_b32_e32 v22, v1
305+
; CHECK-NEXT: v_mov_b32_e32 v23, v0
306+
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
307+
; CHECK-NEXT: v_mov_b32_e32 v12, v0
308+
; CHECK-NEXT: v_mov_b32_e32 v13, v1
309+
; CHECK-NEXT: v_mov_b32_e32 v0, v23
310+
; CHECK-NEXT: v_mov_b32_e32 v1, v22
311+
; CHECK-NEXT: v_mov_b32_e32 v14, v2
312+
; CHECK-NEXT: v_mov_b32_e32 v15, v3
313+
; CHECK-NEXT: v_mov_b32_e32 v16, v4
314+
; CHECK-NEXT: v_mov_b32_e32 v17, v5
315+
; CHECK-NEXT: v_mov_b32_e32 v18, v6
316+
; CHECK-NEXT: v_mov_b32_e32 v19, v7
317+
; CHECK-NEXT: v_mov_b32_e32 v20, v8
318+
; CHECK-NEXT: v_mov_b32_e32 v21, v9
319+
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
320+
; CHECK-NEXT: v_min_f64 v[0:1], v[12:13], v[0:1]
321+
; CHECK-NEXT: v_min_f64 v[2:3], v[14:15], v[2:3]
322+
; CHECK-NEXT: v_min_f64 v[4:5], v[16:17], v[4:5]
323+
; CHECK-NEXT: v_min_f64 v[6:7], v[18:19], v[6:7]
324+
; CHECK-NEXT: v_min_f64 v[8:9], v[20:21], v[8:9]
325+
; CHECK-NEXT: v_readlane_b32 s31, v24, 1
326+
; CHECK-NEXT: v_readlane_b32 s30, v24, 0
327+
; CHECK-NEXT: s_mov_b32 s32, s33
328+
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
329+
; CHECK-NEXT: buffer_load_dword v24, off, s[0:3], s33 ; 4-byte Folded Reload
330+
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
331+
; CHECK-NEXT: s_mov_b32 s33, s18
332+
; CHECK-NEXT: s_waitcnt vmcnt(0)
333+
; CHECK-NEXT: s_setpc_b64 s[30:31]
334+
%call0 = call nofpclass(nan) <5 x double> @func_v5f64(ptr addrspace(1) %ptr)
335+
%call1 = call nofpclass(nan) <5 x double> @func_v5f64(ptr addrspace(1) %ptr)
336+
%min = call <5 x double> @llvm.minnum.v5f64(<5 x double> %call0, <5 x double> %call1)
337+
ret <5 x double> %min
338+
}

0 commit comments

Comments
 (0)