Merged
33 changes: 22 additions & 11 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18862,27 +18862,38 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {

static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                               const TargetLowering &TLI) {
-  // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
-  // replacing casts with a libcall. We also must be allowed to ignore -0.0
-  // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
-  // conversions would return +0.0.
+  // We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
+  // If NoSignedZerosFPMath is enabled, this is a direct replacement.
+  // Otherwise, for strict math, we must handle edge cases:
+  // 1. For unsigned conversions, use FABS to handle negative inputs. Take
+  //    -0.0 as an example: it first becomes integer 0 and is converted back
+  //    to +0.0, whereas FTRUNC on its own would produce -0.0.

  // FIXME: We should be able to use node-level FMF here.
-  // TODO: If strict math, should we use FABS (+ range check for signed cast)?
  EVT VT = N->getValueType(0);
-  if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
-      !DAG.getTarget().Options.NoSignedZerosFPMath)
+  if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
    return SDValue();

  // fptosi/fptoui round towards zero, so converting from FP to integer and
  // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
  SDValue N0 = N->getOperand(0);
  if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
-      N0.getOperand(0).getValueType() == VT)
-    return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+      N0.getOperand(0).getValueType() == VT) {
+    if (DAG.getTarget().Options.NoSignedZerosFPMath)
+      return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+  }

  if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
-      N0.getOperand(0).getValueType() == VT)
-    return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+      N0.getOperand(0).getValueType() == VT) {
+    if (DAG.getTarget().Options.NoSignedZerosFPMath)
+      return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
Contributor
NoSignedZerosFPMath might not be enough to cover the range check part. BTW, the constrained floating-point intrinsics are converted into SDNodes with the STRICT prefix, so the "strict math" wording here might cause confusion.
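A worked illustration of the signed-case concern (a sketch, not from the patch; the function name is hypothetical): FABS alone is not a valid rewrite for the signed round-trip, since it flips the sign of genuinely negative inputs, so something like the range check from the removed TODO would still be needed there.

; For %x = -1.5: fptosi yields -1 and sitofp yields -1.0, matching
; ftrunc(-1.5); an FABS-based rewrite would give ftrunc(fabs(-1.5)) = +1.0.
; For %x = -0.5: the round-trip yields +0.0 while ftrunc(-0.5) = -0.0, the
; signed-zero difference that NoSignedZerosFPMath permits ignoring.
define float @signed_roundtrip(float %x) {
  %i = fptosi float %x to i32
  %f = sitofp i32 %i to float
  ret float %f
}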


+    // Strict math: use FABS to handle negative inputs correctly.
+    if (TLI.isFAbsFree(VT)) {
+      SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
+      return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
+    }
+  }

  return SDValue();
}
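To make the combine concrete, here is a minimal sketch of the pattern the unsigned path now handles without NoSignedZerosFPMath (the function name is illustrative, not from the patch; the AMDGPU tests below follow this shape):

; fptoui rounds toward zero, so the round-trip is a truncation. Negative
; inputs in (-1.0, 0.0] become integer 0 and come back as +0.0 (inputs at
; or below -1.0 make the fptoui poison), which ftrunc(fabs(%x)) reproduces,
; while a bare ftrunc(%x) would return -0.0 for them.
define float @unsigned_roundtrip(float %x) {
  %i = fptoui float %x to i32
  %f = uitofp i32 %i to float   ; -> ftrunc(fabs(%x)) when FTRUNC is legal and FABS is free
  ret float %f
}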
12 changes: 12 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -70,6 +70,12 @@ define half @t3(half %x) {
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t3:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzs h0, h0
; USE-NEON-NO-GPRS-NEXT: scvtf h0, h0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t3:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
@@ -147,6 +153,12 @@ define half @t6(half %x) {
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t6:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzu h0, h0
; USE-NEON-NO-GPRS-NEXT: ucvtf h0, h0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t6:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
209 changes: 209 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll
@@ -0,0 +1,209 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=amdgcn | FileCheck %s
Contributor
Suggested change
-; RUN: llc < %s -mtriple=amdgcn | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s

Should test with and without legal 16-bit operations; the 16-bit checks are missing the fabs.

Contributor Author
I've updated the test case, and the results look correct. I found that gfx900 also has different codegen for f64. Are f64 operations also illegal on gfx600?

(I don't have permission to merge the PR, so if everything looks good to you, could you please help merge it?)


define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) {
; CHECK-LABEL: fptoui_f32_to_i16_to_f32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_trunc_f32_e64 v0, |s6|
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui float %x to i16
%fp = uitofp i16 %ui to float
store float %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) {
; CHECK-LABEL: fptoui_f32_to_i32_to_f32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_trunc_f32_e64 v0, |s6|
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui float %x to i32
%fp = uitofp i32 %ui to float
store float %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) {
; CHECK-LABEL: fptoui_f32_to_i64_to_f32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_trunc_f32_e64 v0, |s6|
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui float %x to i64
%fp = uitofp i64 %ui to float
store float %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) {
; CHECK-LABEL: fptoui_f16_to_i16_to_f16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_f16_e32 v0, s6
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui half %x to i16
%fp = uitofp i16 %ui to half
store half %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) {
; CHECK-LABEL: fptoui_f16_to_i32_to_f16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_f16_e64 v0, |s6|
; CHECK-NEXT: v_trunc_f32_e32 v0, v0
; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui half %x to i32
%fp = uitofp i32 %ui to half
store half %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) {
; CHECK-LABEL: fptoui_f16_to_i64_to_f16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_f16_e64 v0, |s6|
; CHECK-NEXT: v_trunc_f32_e32 v0, v0
; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui half %x to i64
%fp = uitofp i64 %ui to half
store half %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, double %x) {
; CHECK-LABEL: fptoui_f64_to_i16_to_f64:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s7, 0xf000
; CHECK-NEXT: s_mov_b32 s6, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, s0
; CHECK-NEXT: s_mov_b32 s5, s1
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, s[2:3]
; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui double %x to i16
%fp = uitofp i16 %ui to double
store double %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) {
; CHECK-LABEL: fptoui_f64_to_i32_to_f64:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s7, 0xf000
; CHECK-NEXT: s_mov_b32 s6, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, s0
; CHECK-NEXT: s_mov_b32 s5, s1
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, s[2:3]
; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui double %x to i32
%fp = uitofp i32 %ui to double
store double %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) {
; CHECK-LABEL: fptoui_f64_to_i64_to_f64:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_mov_b32 s9, 0xfffff
; CHECK-NEXT: v_not_b32_e32 v2, 31
; CHECK-NEXT: v_mov_b32_e32 v0, -1
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fefffff
; CHECK-NEXT: s_mov_b32 s10, 0
; CHECK-NEXT: s_mov_b32 s11, 0xc1f00000
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s0, s4
; CHECK-NEXT: s_mov_b32 s1, s5
; CHECK-NEXT: s_bfe_u32 s4, s7, 0xb0014
; CHECK-NEXT: s_and_b32 s12, s7, 0x80000000
; CHECK-NEXT: s_add_i32 s13, s4, 0xfffffc01
; CHECK-NEXT: s_lshr_b64 s[4:5], s[8:9], s13
; CHECK-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; CHECK-NEXT: s_cmp_lt_i32 s13, 0
; CHECK-NEXT: s_cselect_b32 s4, 0, s4
; CHECK-NEXT: s_cselect_b32 s5, s12, s5
; CHECK-NEXT: s_cmp_gt_i32 s13, 51
; CHECK-NEXT: s_cselect_b32 s5, s7, s5
; CHECK-NEXT: s_cselect_b32 s4, s6, s4
; CHECK-NEXT: v_ldexp_f64 v[2:3], s[4:5], v2
; CHECK-NEXT: v_mov_b32_e32 v4, s4
; CHECK-NEXT: v_mov_b32_e32 v5, s5
; CHECK-NEXT: v_fract_f64_e32 v[6:7], v[2:3]
; CHECK-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1]
; CHECK-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; CHECK-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1]
; CHECK-NEXT: v_fma_f64 v[2:3], v[0:1], s[10:11], v[4:5]
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT: v_cvt_u32_f64_e32 v2, v[2:3]
; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; CHECK-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
; CHECK-NEXT: v_cvt_f64_u32_e32 v[2:3], v2
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CHECK-NEXT: s_endpgm
entry:
%ui = fptoui double %x to i64
%fp = uitofp i64 %ui to double
store double %fp, ptr addrspace(1) %out
ret void
}