Commit be55fd7

varun-sundar-rabindranath and Varun Sundar Rabindranath authored
Combine: Support Float16 datatype for OutTokens (#3)
Support Float16 datatype for OutTokens:
- Replace the hard-coded `nv_bfloat16` with a template type for outTokens in the combine kernel
- Update the Python and C++ `all_to_all` tests
- Update the Python and C++ benchmarks

Signed-off-by: Varun Sundar Rabindranath <[email protected]>
Co-authored-by: Varun Sundar Rabindranath <[email protected]>
1 parent f525ab5 commit be55fd7

File tree: 7 files changed (+134, -68 lines)

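The change described above boils down to making the output-token element type a second template parameter `U` instead of a hard-coded `nv_bfloat16`. The following standalone sketch is illustrative toy code only, not the repository's actual API (the `combineTokens` name and its arguments are hypothetical); it assumes a recent CUDA toolkit for host-side `half`/`nv_bfloat16` conversions.

// Illustrative sketch only (hypothetical toy code, not the repository's API):
// the combine output dtype becomes a template parameter U instead of a
// hard-coded nv_bfloat16. Assumes a recent CUDA toolkit for host-side
// half/nv_bfloat16 conversions. Build: nvcc -std=c++17 sketch.cu -o sketch
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cstddef>
#include <cstdio>
#include <vector>

// T = expert/input token dtype, U = output token dtype (nv_bfloat16 or half).
template <typename T, typename U>
void combineTokens(U *outTokens, const T *expertX, const float *weights, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    // Accumulate in float, then narrow to the requested output dtype U.
    float sum = static_cast<float>(expertX[i]) * weights[i];
    outTokens[i] = static_cast<U>(sum);
  }
}

int main() {
  std::vector<nv_bfloat16> expertX(4, nv_bfloat16(1.5f));
  std::vector<float> weights(4, 0.5f);
  std::vector<half> outFp16(4);         // Float16 output path added by this commit
  std::vector<nv_bfloat16> outBf16(4);  // BFloat16 output path, as before
  combineTokens(outFp16.data(), expertX.data(), weights.data(), 4);
  combineTokens(outBf16.data(), expertX.data(), weights.data(), 4);
  std::printf("fp16 out[0] = %f\n", static_cast<float>(outFp16[0]));
  return 0;
}

The real kernel accumulates in float and writes back vectorized; the INSTANTIATE_COMBINE table in internode_combine.cu below covers the (T, U) pairs over {float, half, nv_bfloat16} x {nv_bfloat16, half}.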

csrc/all_to_all/bench_all_to_all.cpp

Lines changed: 26 additions & 12 deletions
@@ -40,7 +40,7 @@ Time average(const std::vector<float> &timesUs) {
   return std::make_pair(mean, stddev);
 }
 
-template <typename T>
+template <typename T, typename U>
 std::pair<Time, Time> benchmark(
     unsigned repeat,
     const BenchConfig &config,
@@ -65,7 +65,7 @@ std::pair<Time, Time> benchmark(
   DeviceBuffer<float> outExpertScaleDevice(
       expertsPerRank * config.numTokens * numPEs * data.hiddenDimScale
   );
-  DeviceBuffer<nv_bfloat16> outTokensDevice(config.numTokens * data.hiddenDim);
+  DeviceBuffer<U> outTokensDevice(config.numTokens * data.hiddenDim);
   DeviceBuffer<T> xDevice(data.x);
   DeviceBuffer<float> xScaleDevice(data.xScale);
   DeviceBuffer<uint32_t> indicesDevice(data.indices);
@@ -128,8 +128,8 @@ std::pair<Time, Time> benchmark(
 
     CUDACHECK(cudaEventRecord(std::get<1>(events[i]), stream));
 
-    allToAll.combine<T>(
-        Strided1D<nv_bfloat16>(outTokensDevice, config.hiddenDim),
+    allToAll.combine<T, U>(
+        Strided1D<U>(outTokensDevice, config.hiddenDim),
         Strided2D<uint32_t>(indicesDevice, 1, config.expertsPerToken),
         Strided2D<float>(weightsDevice, 1, config.expertsPerToken),
         Strided2D<T>(
@@ -239,19 +239,33 @@ int main(int argc, char **argv) {
       {128, 256, 8, 7168, 128},
   };
 
-  for (const auto &config : configs) {
-    auto [dispatch, combine] = benchmark<nv_bfloat16>(10, config, currentPE, numPEs, stream);
-    if (currentPE == 0) {
-      auto [dispatchMean, dispatchStddev] = dispatch;
-      auto [combineMean, combineStddev] = combine;
-      std::cout << std::setw(3) << config.numTokens << " " << std::setw(3) << config.numExperts
-                << " " << std::setw(3) << config.expertsPerToken << " " << std::setw(4)
-                << config.hiddenDim << " " << std::fixed << std::setprecision(3)
+  auto maybe_print_bench_results = [](int const myPE,
+                                      BenchConfig const &config,
+                                      Time const &dispatch_time,
+                                      Time const &combine_time,
+                                      std::string const description = "") {
+    if (myPE == 0) {
+      auto [dispatchMean, dispatchStddev] = dispatch_time;
+      auto [combineMean, combineStddev] = combine_time;
+      std::cout << description << std::setw(6) << config.numTokens << " " << std::setw(3)
+                << config.numExperts << " " << std::setw(3) << config.expertsPerToken << " "
+                << std::setw(4) << config.hiddenDim << " " << std::fixed << std::setprecision(3)
                 << "Dispatch: " << std::setw(10) << dispatchMean << "us ± " << dispatchStddev
                 << "us "
                 << "Combine: " << std::setw(10) << combineMean << "us ± " << combineStddev << "us"
                 << std::endl;
     }
+  };
+
+  for (const auto &config : configs) {
+    auto [dispatch, combine] =
+        benchmark<nv_bfloat16, nv_bfloat16>(10, config, currentPE, numPEs, stream);
+    maybe_print_bench_results(currentPE, config, dispatch, combine, "nv_bfloat16->nv_bfloat16:");
+  }
+
+  for (const auto &config : configs) {
+    auto [dispatch, combine] = benchmark<half, half>(10, config, currentPE, numPEs, stream);
+    maybe_print_bench_results(currentPE, config, dispatch, combine, "half->half:");
   }
 
   // Cleanup.

csrc/all_to_all/internode.h

Lines changed: 2 additions & 2 deletions
@@ -96,9 +96,9 @@ class AllToAllInterNode final : public AllToAll {
   /// Shape: [1].
   ///
   /// @param stream The CUDA stream to launch the kernel on.
-  template <typename T>
+  template <typename T, typename U>
   void combine(
-      const Strided1D<nv_bfloat16> &outTokens,
+      const Strided1D<U> &outTokens,
       const Strided2D<uint32_t> &indices,
       const Strided2D<float> &weights,
       const Strided2D<T> &expertX,

csrc/all_to_all/internode_combine.cu

Lines changed: 18 additions & 15 deletions
@@ -8,9 +8,9 @@
 
 using namespace pplx;
 
-template <typename T, size_t NUM_WARPS, bool DO_SEND, bool DO_RECV>
+template <typename T, typename U, size_t NUM_WARPS, bool DO_SEND, bool DO_RECV>
 __global__ __launch_bounds__(NUM_WARPS * 32, 1) void combineKernel(
-    nv_bfloat16 *outTokens,
+    U *outTokens,
     size_t outTokensStrideElem,
     uint32_t *indices,
     size_t indicesStrideElem,
@@ -104,7 +104,7 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void combineKernel(
     __syncthreads();
     combineSignalBuffer[i] = 0;
 
-    nv_bfloat16 *dstPtr = outTokens + i * outTokensStrideElem;
+    U *dstPtr = outTokens + i * outTokensStrideElem;
     constexpr unsigned VEC_SIZE = 8;
     for (unsigned j = threadIdx.x * VEC_SIZE; j < hiddenDim; j += blockDim.x * VEC_SIZE) {
       float sum[VEC_SIZE];
@@ -140,9 +140,9 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void combineKernel(
   }
 }
 
-template <typename T>
+template <typename T, typename U>
 void AllToAllInterNode::combine(
-    const Strided1D<nv_bfloat16> &outTokens,
+    const Strided1D<U> &outTokens,
     const Strided2D<uint32_t> &indices,
     const Strided2D<float> &weights,
     const Strided2D<T> &expertX,
@@ -165,7 +165,7 @@ void AllToAllInterNode::combine(
   dim3 dimBlock(NUM_WARPS * 32, 1, 1);
 
   void *args[] = {
-      const_cast<nv_bfloat16 **>(&outTokens.data),
+      const_cast<U **>(&outTokens.data),
       const_cast<size_t *>(&outTokens.strideElem),
       const_cast<uint32_t **>(&indices.data),
      const_cast<size_t *>(&indices.strideElem),
@@ -198,17 +198,17 @@ void AllToAllInterNode::combine(
   switch (splitMode) {
   case SplitMode::SEND:
     CUDACHECK(cudaLaunchCooperativeKernel(
-        (void *)&combineKernel<T, NUM_WARPS, true, false>, dimGrid, dimBlock, args, 0, stream
+        (void *)&combineKernel<T, U, NUM_WARPS, true, false>, dimGrid, dimBlock, args, 0, stream
     ));
     break;
   case SplitMode::RECV:
     CUDACHECK(cudaLaunchCooperativeKernel(
-        (void *)&combineKernel<T, NUM_WARPS, false, true>, dimGrid, dimBlock, args, 0, stream
+        (void *)&combineKernel<T, U, NUM_WARPS, false, true>, dimGrid, dimBlock, args, 0, stream
     ));
     break;
   case SplitMode::NONE:
     CUDACHECK(cudaLaunchCooperativeKernel(
-        (void *)&combineKernel<T, NUM_WARPS, true, true>, dimGrid, dimBlock, args, 0, stream
+        (void *)&combineKernel<T, U, NUM_WARPS, true, true>, dimGrid, dimBlock, args, 0, stream
     ));
     break;
   default:
@@ -217,9 +217,9 @@ void AllToAllInterNode::combine(
   nvtxRangePop();
 }
 
-#define INSTANTIATE_COMBINE(T) \
-  template void AllToAllInterNode::combine<T>( \
-      const Strided1D<nv_bfloat16> &outTokens, \
+#define INSTANTIATE_COMBINE(T, U) \
+  template void AllToAllInterNode::combine<T, U>( \
+      const Strided1D<U> &outTokens, \
       const Strided2D<uint32_t> &indices, \
       const Strided2D<float> &weights, \
       const Strided2D<T> &expertX, \
@@ -229,6 +229,9 @@ void AllToAllInterNode::combine(
       cudaStream_t stream \
   );
 
-INSTANTIATE_COMBINE(float)
-INSTANTIATE_COMBINE(half)
-INSTANTIATE_COMBINE(nv_bfloat16)
+INSTANTIATE_COMBINE(float, nv_bfloat16)
+INSTANTIATE_COMBINE(half, nv_bfloat16)
+INSTANTIATE_COMBINE(nv_bfloat16, nv_bfloat16)
+INSTANTIATE_COMBINE(float, half)
+INSTANTIATE_COMBINE(half, half)
+INSTANTIATE_COMBINE(nv_bfloat16, half)

csrc/all_to_all/test_all_to_all.cpp

Lines changed: 18 additions & 6 deletions
@@ -19,7 +19,7 @@
 
 using namespace pplx;
 
-template <typename T, typename Kernel>
+template <typename T, typename U, typename Kernel>
 bool testDispatchCombine(
     cudaStream_t stream,
     unsigned dpRank,
@@ -75,7 +75,7 @@ bool testDispatchCombine(
   DeviceBuffer<float> outExpertScaleDevice(
       expertsPerRank * maxNumTokens * numDPGroups * rank.hiddenDimScale
   );
-  DeviceBuffer<nv_bfloat16> outTokensDevice(maxNumTokens * hiddenDim);
+  DeviceBuffer<U> outTokensDevice(maxNumTokens * hiddenDim);
 
   const size_t hiddenDimBytes = rank.hiddenDim * sizeof(T);
   const size_t hiddenDimScaleBytes = rank.hiddenDimScale * sizeof(float);
@@ -113,7 +113,7 @@
   CUDACHECK(cudaStreamSynchronize(stream));
 
   allToAll.combine(
-      Strided1D<nv_bfloat16>(outTokensDevice, hiddenDim),
+      Strided1D<U>(outTokensDevice, hiddenDim),
       Strided2D<uint32_t>(indicesDevice, 1, expertsPerToken),
       Strided2D<float>(weightsDevice, 1, expertsPerToken),
       Strided2D<T>(outExpertDevice, hiddenDim, hiddenDim * maxNumTokens * numDPGroups),
@@ -127,7 +127,7 @@
   HostBuffer<int32_t> outNumTokensPerExpertHost(outTokensPerExpertDevice);
   HostBuffer<T> outExpertHost(outExpertDevice);
   HostBuffer<float> outExpertScaleHost(outExpertScaleDevice);
-  HostBuffer<nv_bfloat16> outTokensHost(outTokensDevice);
+  HostBuffer<U> outTokensHost(outTokensDevice);
 
   // Print the results.
   for (unsigned i = 0; i < epSize; ++i) {
@@ -322,10 +322,22 @@ int main(int argc, char **argv) {
 
   // Run the tests.
   int exit_code = EXIT_SUCCESS;
-  if (!testDispatchCombine<float, AllToAllInterNode>(stream, rank / 2, 2, rank, world_size)) {
+  if (!testDispatchCombine<float, nv_bfloat16, AllToAllInterNode>(
+          stream, rank / 2, 2, rank, world_size
+      )) {
    exit_code = EXIT_FAILURE;
   }
-  if (!testDispatchCombine<nv_bfloat16, AllToAllInterNode>(stream, rank / 2, 2, rank, world_size)) {
+  if (!testDispatchCombine<nv_bfloat16, nv_bfloat16, AllToAllInterNode>(
+          stream, rank / 2, 2, rank, world_size
+      )) {
+    exit_code = EXIT_FAILURE;
+  }
+  if (!testDispatchCombine<float, half, AllToAllInterNode>(stream, rank / 2, 2, rank, world_size)) {
+    exit_code = EXIT_FAILURE;
+  }
+  if (!testDispatchCombine<nv_bfloat16, half, AllToAllInterNode>(
+          stream, rank / 2, 2, rank, world_size
+      )) {
     exit_code = EXIT_FAILURE;
   }
 
csrc/bindings/all_to_all_ops.cpp

Lines changed: 23 additions & 7 deletions
@@ -136,7 +136,10 @@ void combine(
     bool doRecv
 ) {
   _CHECK_TENSOR(2, outTokens);
-  TORCH_CHECK(outTokens.scalar_type() == at::kBFloat16, "outTokens must be of type BFloat16");
+  TORCH_CHECK(
+      outTokens.scalar_type() == at::kBFloat16 || outTokens.scalar_type() == at::kHalf,
+      "outTokens must be of type BFloat16 or Float16"
+  );
   _CHECK_TENSOR(2, indices);
   TORCH_CHECK(indices.scalar_type() == at::kUInt32, "indices must be of type UInt32");
   _CHECK_TENSOR(2, weights);
@@ -149,9 +152,9 @@
   }
 
   auto *all_to_all = (AllToAllInterNode *)ptr;
-  auto run = [&]<typename T>() {
-    all_to_all->combine<T>(
-        Strided1D<nv_bfloat16>((nv_bfloat16 *)outTokens.data_ptr(), (size_t)outTokens.stride(0)),
+  auto run = [&]<typename T, typename U>() {
+    all_to_all->combine<T, U>(
+        Strided1D<U>((U *)outTokens.data_ptr(), (size_t)outTokens.stride(0)),
         Strided2D<uint32_t>(
            indices.data_ptr<uint32_t>(), (size_t)indices.stride(1), (size_t)indices.stride(0)
         ),
@@ -166,15 +169,28 @@
     );
   };
 
+  auto out_type_switch = [&]<typename T>(at::ScalarType const &out_dtype) {
+    switch (out_dtype) {
+    case at::kBFloat16:
+      run.operator()<T, nv_bfloat16>();
+      break;
+    case at::kHalf:
+      run.operator()<T, half>();
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported dtype for outTokens");
+    }
+  };
+
   switch (expertY.scalar_type()) {
   case at::kFloat:
-    run.operator()<float>();
+    out_type_switch.operator()<float>(outTokens.scalar_type());
     break;
   case at::kBFloat16:
-    run.operator()<nv_bfloat16>();
+    out_type_switch.operator()<nv_bfloat16>(outTokens.scalar_type());
     break;
   case at::kHalf:
-    run.operator()<half>();
+    out_type_switch.operator()<half>(outTokens.scalar_type());
     break;
   default:
     TORCH_CHECK(false, "Unsupported dtype for expertY");
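With this change, the binding picks the kernel instantiation from two runtime dtypes: the expert tensor's dtype selects `T` and the outTokens tensor's dtype selects `U`. The following is a hypothetical standalone sketch of that nested templated-lambda dispatch, not the repository's code; the `DType` enum and the `bf16`/`fp16` structs are stand-ins for `at::ScalarType`, `nv_bfloat16`, and `half`.

// Hypothetical standalone sketch (not the repository's code): nested
// runtime-dtype dispatch via C++20 templated lambdas, with a toy DType enum
// standing in for at::ScalarType.
// Build: g++ -std=c++20 dispatch_sketch.cpp -o dispatch_sketch
#include <cstdio>
#include <stdexcept>
#include <typeinfo>

enum class DType { Float32, BFloat16, Float16 };
struct bf16 {};  // stand-in for nv_bfloat16
struct fp16 {};  // stand-in for half

int main() {
  DType expertYType = DType::Float32;    // runtime dtype of the expert outputs -> picks T
  DType outTokensType = DType::Float16;  // runtime dtype of the combined tokens -> picks U

  // Innermost step: both element types are compile-time template arguments here.
  auto run = []<typename T, typename U>() {
    std::printf("combine<T=%s, U=%s>\n", typeid(T).name(), typeid(U).name());
  };

  // Inner switch: for a fixed input dtype T, pick the output dtype U.
  auto out_type_switch = [&]<typename T>(DType out_dtype) {
    switch (out_dtype) {
    case DType::BFloat16:
      run.operator()<T, bf16>();
      break;
    case DType::Float16:
      run.operator()<T, fp16>();
      break;
    default:
      throw std::runtime_error("Unsupported dtype for outTokens");
    }
  };

  // Outer switch: pick the input dtype T, then delegate the choice of U.
  switch (expertYType) {
  case DType::Float32:
    out_type_switch.operator()<float>(outTokensType);
    break;
  case DType::BFloat16:
    out_type_switch.operator()<bf16>(outTokensType);
    break;
  case DType::Float16:
    out_type_switch.operator()<fp16>(outTokensType);
    break;
  }
  return 0;
}

In the actual binding, the inner default branch raises via TORCH_CHECK, mirroring the pre-existing expertY dispatch.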

tests/bench_all_to_all.py

Lines changed: 27 additions & 17 deletions
@@ -234,30 +234,32 @@ def _worker_bench_all_to_all(
     pgi: ProcessGroupInfo,
     dp_size: int,
     in_dtype_str: str,
+    out_dtype_str: str,
 ) -> None:
     uid = nvshmem_get_unique_id() if pgi.rank == 0 else nvshmem_alloc_empty_unique_id()
     torch.distributed.broadcast(uid, src=0)
     nvshmem_init(uid, pgi.rank, pgi.world_size)
 
     in_dtype = getattr(torch, in_dtype_str)
+    out_dtype = getattr(torch, out_dtype_str)
     assert isinstance(in_dtype, torch.dtype)
     configs = [
         # V2-Lite: 64 Experts, 6 Experts per Token, 2048 Hidden Dim
-        MoEConfig(64, 6, 2048, 1, in_dtype),
-        MoEConfig(64, 6, 2048, 4, in_dtype),
-        MoEConfig(64, 6, 2048, 8, in_dtype),
-        MoEConfig(64, 6, 2048, 16, in_dtype),
-        MoEConfig(64, 6, 2048, 32, in_dtype),
-        MoEConfig(64, 6, 2048, 64, in_dtype),
-        MoEConfig(64, 6, 2048, 128, in_dtype),
+        MoEConfig(64, 6, 2048, 1, in_dtype, out_dtype),
+        MoEConfig(64, 6, 2048, 4, in_dtype, out_dtype),
+        MoEConfig(64, 6, 2048, 8, in_dtype, out_dtype),
+        MoEConfig(64, 6, 2048, 16, in_dtype, out_dtype),
+        MoEConfig(64, 6, 2048, 32, in_dtype, out_dtype),
+        MoEConfig(64, 6, 2048, 64, in_dtype, out_dtype),
+        MoEConfig(64, 6, 2048, 128, in_dtype, out_dtype),
         # R1 : 256 Experts, 8 Experts per Token, 7168 Hidden Dim
-        MoEConfig(256, 8, 7168, 1, in_dtype),
-        MoEConfig(256, 8, 7168, 4, in_dtype),
-        MoEConfig(256, 8, 7168, 8, in_dtype),
-        MoEConfig(256, 8, 7168, 16, in_dtype),
-        MoEConfig(256, 8, 7168, 32, in_dtype),
-        MoEConfig(256, 8, 7168, 64, in_dtype),
-        MoEConfig(256, 8, 7168, 128, in_dtype),
+        MoEConfig(256, 8, 7168, 1, in_dtype, out_dtype),
+        MoEConfig(256, 8, 7168, 4, in_dtype, out_dtype),
+        MoEConfig(256, 8, 7168, 8, in_dtype, out_dtype),
+        MoEConfig(256, 8, 7168, 16, in_dtype, out_dtype),
+        MoEConfig(256, 8, 7168, 32, in_dtype, out_dtype),
+        MoEConfig(256, 8, 7168, 64, in_dtype, out_dtype),
+        MoEConfig(256, 8, 7168, 128, in_dtype, out_dtype),
     ]
 
     header = [
@@ -340,18 +342,26 @@ def main() -> None:
     parser.add_argument("--dp-size", type=int, default=1)
     parser.add_argument(
         "--in-dtype",
-        choices=["bfloat16", "float8_e4m3fn"],
+        choices=["bfloat16", "float16", "float8_e4m3fn"],
         default="float8_e4m3fn",
     )
+    parser.add_argument(
+        "--out-dtype",
+        choices=["bfloat16", "float16"],
+        default="bfloat16",
+    )
     args = parser.parse_args()
     dp_size = int(args.dp_size)
     in_dtype = str(args.in_dtype)
+    out_dtype = str(args.out_dtype)
 
     if "MASTER_ADDR" in os.environ:
-        parallel_launch_from_env(_worker_bench_all_to_all, dp_size, in_dtype)
+        parallel_launch_from_env(_worker_bench_all_to_all, dp_size, in_dtype, out_dtype)
     else:
         world_size = torch.cuda.device_count()
-        parallel_launch(world_size, _worker_bench_all_to_all, dp_size, in_dtype)
+        parallel_launch(
+            world_size, _worker_bench_all_to_all, dp_size, in_dtype, out_dtype
+        )
 
 
 if __name__ == "__main__":
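With the new flag, the Python benchmark can presumably be invoked as, for example, `python tests/bench_all_to_all.py --in-dtype bfloat16 --out-dtype float16` (the exact launch command depends on the single-node vs. `MASTER_ADDR`-based setup handled above); `--out-dtype` defaults to `bfloat16`, preserving the previous behaviour.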
