Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -3464,6 +3464,12 @@ class TargetLoweringBase {
return false;
}

// Is it preferable to legalize FP types to SINT instead of UINT if both SINT
// and UINT are custom.
virtual bool preferPromoteFPToCustomSINTOverCustomUINT() const {
return true;
}

/// Create the IR node for the given complex deinterleaving operation.
/// If one cannot be created using all the given inputs, nullptr should be
/// returned.
Expand Down
25 changes: 20 additions & 5 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -853,21 +853,36 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {

// If we're promoting a UINT to a larger size and the larger FP_TO_UINT is
// not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT
// and SINT conversions are Custom, there is no way to tell which is
// preferable. We choose SINT because that's the right thing on PPC.)
// and SINT conversions are Custom, we use a TLI call to check which is
// preferable.)
if (N->getOpcode() == ISD::FP_TO_UINT &&
!TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
(TLI.isOperationLegal(ISD::FP_TO_SINT, NVT) ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This condition is now way more complicated and difficult to follow. Should have something directly return the opcode to prefer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed the hook to return an opcode.

(!TLI.isOperationCustom(ISD::FP_TO_UINT, NVT) &&
TLI.isOperationCustom(ISD::FP_TO_SINT, NVT)) ||
(TLI.isOperationCustom(ISD::FP_TO_SINT, NVT) &&
TLI.isOperationCustom(ISD::FP_TO_UINT, NVT) &&
TLI.preferPromoteFPToCustomSINTOverCustomUINT())))
NewOpc = ISD::FP_TO_SINT;

if (N->getOpcode() == ISD::STRICT_FP_TO_UINT &&
!TLI.isOperationLegal(ISD::STRICT_FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT))
(TLI.isOperationLegal(ISD::STRICT_FP_TO_SINT, NVT) ||
(!TLI.isOperationCustom(ISD::STRICT_FP_TO_UINT, NVT) &&
TLI.isOperationCustom(ISD::STRICT_FP_TO_SINT, NVT)) ||
(TLI.isOperationCustom(ISD::STRICT_FP_TO_SINT, NVT) &&
TLI.isOperationCustom(ISD::STRICT_FP_TO_UINT, NVT) &&
TLI.preferPromoteFPToCustomSINTOverCustomUINT())))
NewOpc = ISD::STRICT_FP_TO_SINT;

if (N->getOpcode() == ISD::VP_FP_TO_UINT &&
!TLI.isOperationLegal(ISD::VP_FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::VP_FP_TO_SINT, NVT))
(TLI.isOperationLegal(ISD::VP_FP_TO_SINT, NVT) ||
(!TLI.isOperationCustom(ISD::VP_FP_TO_UINT, NVT) &&
TLI.isOperationCustom(ISD::VP_FP_TO_SINT, NVT)) ||
(TLI.isOperationCustom(ISD::VP_FP_TO_SINT, NVT) &&
TLI.isOperationCustom(ISD::VP_FP_TO_UINT, NVT) &&
TLI.preferPromoteFPToCustomSINTOverCustomUINT())))
NewOpc = ISD::VP_FP_TO_SINT;

SDValue Res;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ class NVPTXTargetLowering : public TargetLowering {
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;

bool preferPromoteFPToCustomSINTOverCustomUINT() const override {
return false;
}

private:
const NVPTXSubtarget &STI; // cache the subtarget here
mutable unsigned GlobalUniqueCallSite;
Expand Down
134 changes: 134 additions & 0 deletions llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this change buy us?
It appears that on this test LLVM currently generates the same output (AFAICT): https://godbolt.org/z/svM1Kns79

It's possible that I've missed something. Can you point me at the suboptimal code LLVM generates now?
Perhaps we just need a few more test cases to cover the changes introduced by the patch..

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first two test cases should have different PTX with the change.
If we look at line 21 in the PTX of that godbolt link:
cvt.rzi.s16.f32 %rs1, %f1;
Compare that to the expected output in the test, line 10:
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
The difference is the s16 vs u16. Current LLVM is generating a signed cvt instruction for what should be an unsigned value.

You can see the SASS differences here: https://godbolt.org/z/GjqYzejPz

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it matter in practice?

The change affects only conversions to u8 and given that we always actually convert to the 16-bit integer, for the valid range of inputs [0f .. 255f], the low 8 bits of the result will be correct regardless of whether we used signed or unsigned conversion. For the inputs outside of the valid input range LLVM IR considers results to be poison.

If the goal is to have upper bits of the result always be 0, even for out of range inputs, then this patch is insufficient. It will still fill in upper bits for some out-of range input values, because we're converting to a 16-bit int. E.g converting 65535.0 will give us 0xffff. If all-zero MSBs are indeed the goal here, we'd need to explicitly mask the low 8 bits after conversion, instead. Considering that this code has been in place in the current shape for a very long time, I do not think this is necessary in practice.

Perhaps I'm still missing something. Can you elaborate on what motivates this change and what exactly is the issue it is intended to solve. Simply using PTX instruction with a better matching name but no effect on the valid inputs is not worth plumbing special case into generic LLVM code. IMO.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I generally agree with your reasoning as to why this seems irrelevant on the PTX level. However...

Does it matter in practice?

Can you provide your opinion on the SASS diff I linked?
Without change:

cvt_u8_f32:
 F2I.S16.TRUNC.NTZ R4, R4 
 LOP3.LUT R4, R4, 0xffff, RZ, 0xc0, !PT 
 RET.ABS.NODEC R20 0x0

With change:

cvt_u8_f32:
 F2I.U16.TRUNC.NTZ R4, R4 
 RET.ABS.NODEC R20 0x0

It's definitely something of a corner case, but it does seem better. The biggest thing for me is that this implies that the signed/unsigned distinction is relevant for ptxas and is it not just a better name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you provide your opinion on the SASS diff I linked?

There's obviously one less instruction. However, considering that float to u8 conversion happens approximately never, and that even when it does, it would be dwarfed by whatever operations we perform in FP, I'd be really surprised if you were to see any measurable differences on anything other than an artificial benchmark targeting just this instruction.

On the other hand, it adds non-zero complexity to LLVM which will need to be maintained ~forever. Considering near-zero benefit vs non-zero maintenance cost, I'd say it's not worth it. The impact is minor either way, so if the change could be plausibly used for other targets, I'd be fine with it.

@arsenm -- would AMDGPU benefit from the distinction in signedness in FP-to-int conversions?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the cases that require larger expansions, unsigned is always preferable

; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s

define i8 @cvt_u8_f32(float %x) {
; CHECK-LABEL: cvt_u8_f32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f32 %f1, [cvt_u8_f32_param_0];
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%a = fptoui float %x to i8
ret i8 %a
}

define i8 @cvt_u8_f64(double %x) {
; CHECK-LABEL: cvt_u8_f64(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_u8_f64_param_0];
; CHECK-NEXT: cvt.rzi.u16.f64 %rs1, %fd1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%a = fptoui double %x to i8
ret i8 %a
}

define float @cvt_f32_i8(i8 %x) {
; CHECK-LABEL: cvt_f32_i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f32_i8_param_0];
; CHECK-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
; CHECK-NEXT: ret;
%a = uitofp i8 %x to float
ret float %a
}

define double @cvt_f64_i8(i8 %x) {
; CHECK-LABEL: cvt_f64_i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f64_i8_param_0];
; CHECK-NEXT: cvt.rn.f64.u16 %fd1, %rs1;
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
; CHECK-NEXT: ret;
%a = uitofp i8 %x to double
ret double %a
}

define float @cvt_f32_s8(i8 %x) {
; CHECK-LABEL: cvt_f32_s8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f32_s8_param_0];
; CHECK-NEXT: cvt.rn.f32.s16 %f1, %rs1;
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
; CHECK-NEXT: ret;
%a = sitofp i8 %x to float
ret float %a
}

define double @cvt_f64_s8(i8 %x) {
; CHECK-LABEL: cvt_f64_s8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f64_s8_param_0];
; CHECK-NEXT: cvt.rn.f64.s16 %fd1, %rs1;
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
; CHECK-NEXT: ret;
%a = sitofp i8 %x to double
ret double %a
}

define i8 @cvt_s8_f32(float %x) {
; CHECK-LABEL: cvt_s8_f32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f32 %f1, [cvt_s8_f32_param_0];
; CHECK-NEXT: cvt.rzi.s16.f32 %rs1, %f1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: and.b32 %r2, %r1, 255;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%a = fptosi float %x to i8
ret i8 %a
}

define i8 @cvt_s8_f64(double %x) {
; CHECK-LABEL: cvt_s8_f64(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_s8_f64_param_0];
; CHECK-NEXT: cvt.rzi.s16.f64 %rs1, %fd1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: and.b32 %r2, %r1, 255;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%a = fptosi double %x to i8
ret i8 %a
}