Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 0 additions & 19 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
case ISD::ADDRSPACECAST:
SelectAddrSpaceCast(N);
return;
case ISD::ConstantFP:
if (tryConstantFP(N))
return;
break;
case ISD::CopyToReg: {
if (N->getOperand(1).getValueType() == MVT::i128) {
SelectV2I64toI128(N);
Expand Down Expand Up @@ -212,21 +208,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
}
}

// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
Comment on lines -215 to -216
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're deleting this code because the comment is false, right? Or am I misunderstanding something?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the source code comment is false, then the update LGTM.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment is sort of true, but it doesn't justify doing things the way they were done. It is simpler and cleaner to replace these with normally named Mov instructions and to use tablegen to generate the ISel logic. This is the same way we handle any other instruction that does not support an immediate operand.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure it's been fixed in ptxas: https://godbolt.org/z/d8EcMevc8

If anything, recent versions seem to be more restrictive than the older ones.

Copy link
Member

@Artem-B Artem-B Dec 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with removing this code as long as we can still handle constants correctly. This should already be covered by the original tests in llvm/test/CodeGen/NVPTX/f16-instructions.ll

If LLVM tests are happy after the run with ptxas enabled, we should be fine.

// Hand-written selection for 16-bit floating-point constants: when N produces
// an f16 or bf16 value, it is replaced by a LOAD_CONST_F16 / LOAD_CONST_BF16
// machine node whose single operand is N's value re-emitted as a target
// constant. Returns false, leaving N untouched, for every other value type.
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  const auto VT = N->getValueType(0);
  const bool IsF16 = VT == MVT::f16;
  if (!IsF16 && VT != MVT::bf16)
    return false;

  const SDLoc DL(N);
  // Both opcodes have the same shape; only the immediate operand type differs.
  const unsigned Opcode =
      IsF16 ? NVPTX::LOAD_CONST_F16 : NVPTX::LOAD_CONST_BF16;
  const auto &Imm = cast<ConstantFPSDNode>(N)->getValueAPF();
  SDValue ImmOperand = CurDAG->getTargetConstantFP(Imm, DL, VT);
  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, VT, ImmOperand));
  return true;
}

// Map ISD:CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode()
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
Expand Down
123 changes: 30 additions & 93 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1177,17 +1177,6 @@ def NegDoubleConst : SDNodeXForm<fpimm, [{
SDLoc(N), MVT::f64);
}]>;

// Loads FP16 constant into a register.
//
// ptxas does not have hex representation for fp16, so we can't use
// fp16 immediate values in .f16 instructions. Instead we have to load
// the constant into a register using mov.b16.
// f16 form: takes an f16imm operand and prints it with mov.b16 into an
// Int16Regs destination. The pattern list is empty, so TableGen generates no
// ISel pattern for it; NVPTXDAGToDAGISel::tryConstantFP builds it by hand.
def LOAD_CONST_F16 :
NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
"mov.b16 \t$dst, $a;", []>;
// bf16 form: same mov.b16 assembly string and Int16Regs destination as
// LOAD_CONST_F16, but the operand is a bf16imm. Also has no ISel pattern.
def LOAD_CONST_BF16 :
NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
"mov.b16 \t$dst, $a;", []>;
defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;
Expand Down Expand Up @@ -1963,7 +1952,7 @@ let hasSideEffects = false in {


// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
let IsSimpleMove=1, hasSideEffects=0 in {
let IsSimpleMove=1, hasSideEffects=0, isAsCheapAsAMove=1 in {
def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
"mov.pred \t$dst, $sss;", []>;
def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
Expand All @@ -1975,44 +1964,40 @@ let IsSimpleMove=1, hasSideEffects=0 in {
def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
"mov.b128 \t$dst, $sss;", []>;

def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
"mov.b16 \t$dst, $sss;", []>;
def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
"mov.b32 \t$dst, $sss;", []>;
def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
"mov.b64 \t$dst, $sss;", []>;

def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
// We have to use .b16 here as there's no mov.f16.
"mov.b16 \t$dst, $src;", []>;
def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"mov.f32 \t$dst, $src;", []>;
def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
"mov.f64 \t$dst, $src;", []>;
}

def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
"mov.pred \t$dst, $src;",
[(set i1:$dst, imm:$src)]>;
def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
"mov.b16 \t$dst, $src;",
[(set i16:$dst, imm:$src)]>;
def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
"mov.b32 \t$dst, $src;",
[(set i32:$dst, imm:$src)]>;
def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
"mov.b64 \t$dst, $src;",
[(set i64:$dst, imm:$src)]>;

def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
"mov.f32 \t$dst, $src;",
[(set f32:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
"mov.f64 \t$dst, $src;",
[(set f64:$dst, fpimm:$src)]>;

def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOVB32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOVB64ri texternalsym:$dst)>;
def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
"mov.pred \t$dst, $src;",
[(set i1:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
"mov.b16 \t$dst, $src;",
[(set i16:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
"mov.b32 \t$dst, $src;",
[(set i32:$dst, imm:$src)]>;
def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
"mov.b64 \t$dst, $src;",
[(set i64:$dst, imm:$src)]>;

def FMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$src),
"mov.b16 \t$dst, $src;",
[(set f16:$dst, fpimm:$src)]>;
def BFMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$src),
"mov.b16 \t$dst, $src;",
[(set bf16:$dst, fpimm:$src)]>;
def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
"mov.f32 \t$dst, $src;",
[(set f32:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
"mov.f64 \t$dst, $src;",
[(set f64:$dst, fpimm:$src)]>;
}

def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;

//---- Copy Frame Index ----
def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
Expand Down Expand Up @@ -2208,18 +2193,6 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
def : Pat<(i1 (OpNode f16:$a, f16:$b)),
(SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
Requires<[useFP16Math]>;
def : Pat<(i1 (OpNode f16:$a, fpimm:$b)),
(SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
Requires<[useFP16Math,doF32FTZ]>;
def : Pat<(i1 (OpNode f16:$a, fpimm:$b)),
(SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
Requires<[useFP16Math]>;
def : Pat<(i1 (OpNode fpimm:$a, f16:$b)),
(SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
Requires<[useFP16Math,doF32FTZ]>;
def : Pat<(i1 (OpNode fpimm:$a, f16:$b)),
(SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
Requires<[useFP16Math]>;

// bf16 -> pred
def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
Expand All @@ -2228,18 +2201,6 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
(SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
Requires<[hasBF16Math]>;
def : Pat<(i1 (OpNode bf16:$a, fpimm:$b)),
(SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
Requires<[hasBF16Math,doF32FTZ]>;
def : Pat<(i1 (OpNode bf16:$a, fpimm:$b)),
(SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
Requires<[hasBF16Math]>;
def : Pat<(i1 (OpNode fpimm:$a, bf16:$b)),
(SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
Requires<[hasBF16Math,doF32FTZ]>;
def : Pat<(i1 (OpNode fpimm:$a, bf16:$b)),
(SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
Requires<[hasBF16Math]>;

// f32 -> pred
def : Pat<(i1 (OpNode f32:$a, f32:$b)),
Expand Down Expand Up @@ -2273,18 +2234,6 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
def : Pat<(i32 (OpNode f16:$a, f16:$b)),
(SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
Requires<[useFP16Math]>;
def : Pat<(i32 (OpNode f16:$a, fpimm:$b)),
(SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
Requires<[useFP16Math, doF32FTZ]>;
def : Pat<(i32 (OpNode f16:$a, fpimm:$b)),
(SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
Requires<[useFP16Math]>;
def : Pat<(i32 (OpNode fpimm:$a, f16:$b)),
(SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
Requires<[useFP16Math, doF32FTZ]>;
def : Pat<(i32 (OpNode fpimm:$a, f16:$b)),
(SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
Requires<[useFP16Math]>;

// bf16 -> i32
def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
Expand All @@ -2293,18 +2242,6 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
(SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
Requires<[hasBF16Math]>;
def : Pat<(i32 (OpNode bf16:$a, fpimm:$b)),
(SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
Requires<[hasBF16Math, doF32FTZ]>;
def : Pat<(i32 (OpNode bf16:$a, fpimm:$b)),
(SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
Requires<[hasBF16Math]>;
def : Pat<(i32 (OpNode fpimm:$a, bf16:$b)),
(SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
Requires<[hasBF16Math, doF32FTZ]>;
def : Pat<(i32 (OpNode fpimm:$a, bf16:$b)),
(SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
Requires<[hasBF16Math]>;

// f32 -> i32
def : Pat<(i32 (OpNode f32:$a, f32:$b)),
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/NVPTX/atomics-sm70.ll
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4;
; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3;
; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24;
; CHECKPTX62-NEXT: shl.b32 %r40, %r26, %r11;
; CHECKPTX62-NEXT: mov.b32 %r39, 65535;
; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11;
; CHECKPTX62-NEXT: not.b32 %r12, %r40;
; CHECKPTX62-NEXT: ld.global.u32 %r56, [%r10];
; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9
Expand All @@ -114,7 +115,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4;
; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3;
; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24;
; CHECKPTX62-NEXT: shl.b32 %r48, %r26, %r17;
; CHECKPTX62-NEXT: mov.b32 %r47, 65535;
; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17;
; CHECKPTX62-NEXT: not.b32 %r18, %r48;
; CHECKPTX62-NEXT: ld.shared.u32 %r57, [%r16];
; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start
Expand Down
9 changes: 4 additions & 5 deletions llvm/test/CodeGen/NVPTX/misched_func_call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-NEXT: ld.param.u32 %r3, [wombat_param_1];
; CHECK-NEXT: ld.param.u32 %r2, [wombat_param_0];
; CHECK-NEXT: mov.b32 %r10, 0;
; CHECK-NEXT: mov.b64 %rd1, 0;
; CHECK-NEXT: mov.b32 %r6, 1;
; CHECK-NEXT: $L__BB0_1: // %bb3
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: { // callseq 0, 0
Expand All @@ -29,16 +27,17 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK-NEXT: ld.param.f64 %fd1, [retval0];
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3;
; CHECK-NEXT: or.b32 %r8, %r4, %r7;
; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8;
; CHECK-NEXT: cvt.rn.f64.s32 %fd3, %r9;
; CHECK-NEXT: ld.param.f64 %fd1, [retval0];
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: cvt.rn.f64.u32 %fd4, %r10;
; CHECK-NEXT: add.rn.f64 %fd5, %fd4, %fd3;
; CHECK-NEXT: mov.b64 %rd1, 0;
; CHECK-NEXT: st.global.f64 [%rd1], %fd5;
; CHECK-NEXT: mov.u32 %r10, %r6;
; CHECK-NEXT: mov.b32 %r10, 1;
; CHECK-NEXT: bra.uni $L__BB0_1;
bb:
br label %bb3
Expand Down
Loading