[NVPTX] Legalize ctpop and ctlz in operation legalization

AlexMaclean · AlexMaclean · commit e2a60bba532d · 2025-03-03T17:51:00.000Z
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -766,16 +766,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // Custom handling for i8 intrinsics
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
 
-  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
-    setOperationAction(ISD::ABS,  Ty, Legal);
-    setOperationAction(ISD::SMIN, Ty, Legal);
-    setOperationAction(ISD::SMAX, Ty, Legal);
-    setOperationAction(ISD::UMIN, Ty, Legal);
-    setOperationAction(ISD::UMAX, Ty, Legal);
+  setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
+                     {MVT::i16, MVT::i32, MVT::i64}, Legal);
 
-    setOperationAction(ISD::CTPOP, Ty, Legal);
-    setOperationAction(ISD::CTLZ, Ty, Legal);
-  }
+  setOperationAction({ISD::CTPOP, ISD::CTLZ}, MVT::i32, Legal);
+  setOperationAction({ISD::CTPOP, ISD::CTLZ}, {MVT::i16, MVT::i64}, Custom);
 
   setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
   setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
@@ -2750,6 +2745,42 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
   return Op;
 }
 
+static SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) {
+  SDValue V = Op->getOperand(0);
+  SDLoc DL(Op);
+
+  if (V.getValueType() == MVT::i16) {
+    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
+    SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Zext);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, CT, SDNodeFlags::NoWrap);
+  }
+  if (V.getValueType() == MVT::i64) {
+    SDValue CT = DAG.getNode(ISD::CTPOP, DL, MVT::i32, V);
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
+  }
+  llvm_unreachable("Unexpected CTPOP type to legalize");
+}
+
+static SDValue lowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+  SDValue V = Op->getOperand(0);
+  SDLoc DL(Op);
+
+  if (V.getValueType() == MVT::i16) {
+    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V);
+    SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, Zext);
+    SDValue Sub =
+        DAG.getNode(ISD::ADD, DL, MVT::i32, CT,
+                    DAG.getConstant(APInt(32, -16, true), DL, MVT::i32),
+                    SDNodeFlags::NoSignedWrap);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Sub, SDNodeFlags::NoWrap);
+  }
+  if (V.getValueType() == MVT::i64) {
+    SDValue CT = DAG.getNode(ISD::CTLZ, DL, MVT::i32, V);
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT);
+  }
+  llvm_unreachable("Unexpected CTLZ type to legalize");
+}
+
 SDValue
 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -2835,6 +2866,10 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMUL:
     // Used only for bf16 on SM80, where we select fma for non-ftz operation
     return PromoteBinOpIfF32FTZ(Op, DAG);
+  case ISD::CTPOP:
+    return lowerCTPOP(Op, DAG);
+  case ISD::CTLZ:
+    return lowerCTLZ(Op, DAG);
 
   default:
     llvm_unreachable("Custom lowering not defined for operation");
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3267,69 +3267,20 @@ def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
 def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
           (SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
 
-// Count leading zeros
 let hasSideEffects = false in {
-  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
-                         "clz.b32 \t$d, $a;", []>;
-  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-                         "clz.b64 \t$d, $a;", []>;
+  foreach RT = [I32RT, I64RT] in {
+    // Count leading zeros
+    def CLZr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+                                   "clz.b" # RT.Size # " \t$d, $a;",
+                                   [(set i32:$d, (ctlz RT.Ty:$a))]>;
+
+    // Population count
+    def POPCr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+                                    "popc.b" # RT.Size # " \t$d, $a;",
+                                    [(set i32:$d, (ctpop RT.Ty:$a))]>;
+  }
 }
 
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
-
-// The return type of the ctlz ISD node is the same as its input, but the PTX
-// ctz instruction always returns a 32-bit value.  For ctlz.i64, convert the
-// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
-// truncating back down to 32 bits.
-def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
-
-// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
-// result back to 16-bits if necessary.  We also need to subtract 16 because
-// the high-order 16 zeros were counted.
-//
-// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
-// use to save one SASS instruction (on sm_35 anyway):
-//
-//   mov.b32 $tmp, {0xffff, $a}
-//   ctlz.b32 $result, $tmp
-//
-// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
-// and then ctlz that value.  This way we don't have to subtract 16 from the
-// result.  Unfortunately today we don't have a way to generate
-// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(i16 (ctlz i16:$a)),
-          (SUBi16ri (CVT_u16_u32
-           (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
-          (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
-
-// Population count
-let hasSideEffects = false in {
-  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
-                          "popc.b32 \t$d, $a;", []>;
-  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-                          "popc.b64 \t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
-
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
-// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
-// pattern that avoids the type conversion if we're truncating the result to
-// i32 anyway.
-def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
-def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
-
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
-// If we know that we're storing into an i32, we can avoid the final trunc.
-def : Pat<(ctpop i16:$a),
-          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
-          (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
-
 // fpround f32 -> f16
 def : Pat<(f16 (fpround f32:$a)),
           (CVT_f16_f32 $a, CvtRN)>;
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -112,14 +112,12 @@ define i32 @myctlz64_as_32_2(i64 %a) {
 define i16 @myctlz_ret16(i16 %a) {
 ; CHECK-LABEL: myctlz_ret16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [myctlz_ret16_param_0];
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT:    ld.param.u16 %r1, [myctlz_ret16_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
-; CHECK-NEXT:    sub.s32 %r3, %r2, 16;
+; CHECK-NEXT:    add.s32 %r3, %r2, -16;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
@@ -128,14 +126,12 @@ define i16 @myctlz_ret16(i16 %a) {
 define i16 @myctlz_ret16_2(i16 %a) {
 ; CHECK-LABEL: myctlz_ret16_2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [myctlz_ret16_2_param_0];
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT:    ld.param.u16 %r1, [myctlz_ret16_2_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
-; CHECK-NEXT:    sub.s32 %r3, %r2, 16;
+; CHECK-NEXT:    add.s32 %r3, %r2, -16;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
@@ -147,18 +143,15 @@ define i16 @myctlz_ret16_2(i16 %a) {
 define void @myctlz_store16(i16 %a, ptr %b) {
 ; CHECK-LABEL: myctlz_store16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [myctlz_store16_param_0];
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT:    ld.param.u16 %r1, [myctlz_store16_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
-; CHECK-NEXT:    cvt.u16.u32 %rs2, %r2;
-; CHECK-NEXT:    sub.s16 %rs3, %rs2, 16;
+; CHECK-NEXT:    add.s32 %r3, %r2, -16;
 ; CHECK-NEXT:    ld.param.u64 %rd1, [myctlz_store16_param_1];
-; CHECK-NEXT:    st.u16 [%rd1], %rs3;
+; CHECK-NEXT:    st.u16 [%rd1], %r3;
 ; CHECK-NEXT:    ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
   store i16 %val, ptr %b
@@ -167,18 +160,15 @@ define void @myctlz_store16(i16 %a, ptr %b) {
 define void @myctlz_store16_2(i16 %a, ptr %b) {
 ; CHECK-LABEL: myctlz_store16_2(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [myctlz_store16_2_param_0];
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT:    ld.param.u16 %r1, [myctlz_store16_2_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
-; CHECK-NEXT:    cvt.u16.u32 %rs2, %r2;
-; CHECK-NEXT:    sub.s16 %rs3, %rs2, 16;
+; CHECK-NEXT:    add.s32 %r3, %r2, -16;
 ; CHECK-NEXT:    ld.param.u64 %rd1, [myctlz_store16_2_param_1];
-; CHECK-NEXT:    st.u16 [%rd1], %rs3;
+; CHECK-NEXT:    st.u16 [%rd1], %r3;
 ; CHECK-NEXT:    ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
   store i16 %val, ptr %b
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -150,12 +150,10 @@ define void @test_popc16(i16 %a, ptr %b) {
 define i32 @test_popc16_to_32(i16 %a) {
 ; CHECK-LABEL: test_popc16_to_32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [test_popc16_to_32_param_0];
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT:    ld.param.u16 %r1, [test_popc16_to_32_param_0];
 ; CHECK-NEXT:    popc.b32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;