
Commit d813fba

add fp4 gemm + allreduce
Signed-off-by: benzh <[email protected]>
1 parent 137713a commit d813fba

File tree: 5 files changed, +490 -10 lines changed

cpp/tensorrt_llm/thop/CMakeLists.txt

Lines changed: 2 additions & 1 deletion

@@ -104,7 +104,8 @@ add_library(
   loraOp.cpp
   finegrained_mixed_dtype_gemm_thop.cpp
   tinygemm2.cpp
-  dsv3RopeOp.cpp)
+  dsv3RopeOp.cpp
+  fusedGemmAllreduceOp.cpp)
 set_property(TARGET th_common PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_link_libraries(
   th_common PRIVATE ${TORCH_LIBRARIES} th_utils ${Python3_LIBRARIES}
cpp/tensorrt_llm/thop/fusedGemmAllreduceOp.cpp (new file)

Lines changed: 150 additions & 0 deletions

@@ -0,0 +1,150 @@
/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "cutlass_extensions/gemm_configs.h"

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h"
#include "tensorrt_llm/runtime/ipcNvlsMemory.h"
#include "tensorrt_llm/thop/thUtils.h"

#include <cstddef>
#include <cuda_fp16.h>

#include <cstdint>
#include <functional>
#include <type_traits>
#include <vector>

using tensorrt_llm::kernels::opened_cutlass_kernels::GemmAllReduceImplRunner;
using tensorrt_llm::kernels::opened_cutlass_kernels::GemmAllReduceImplInterface;
using tensorrt_llm::kernels::opened_cutlass_kernels::GemmTypes;
using tensorrt_llm::kernels::opened_cutlass_kernels::PersistentWorkspaceInterface;

namespace torch_ext
{
// Cache the persistent workspace per thread and grow it only when a larger problem needs more space.
PersistentWorkspaceInterface* getWorkspace(
    GemmAllReduceImplInterface* runner, GemmAllReduceImplInterface::ProblemArgs const& problem)
{
    thread_local std::shared_ptr<PersistentWorkspaceInterface> curWorkspace;
    thread_local size_t curWorkspaceSize = 0;
    auto newWorkspace = runner->getPersistentWorkspace(problem);
    if (newWorkspace->size() > curWorkspaceSize)
    {
        TLLM_LOG_WARNING(
            "Fp4GemmAllreduceRunner workspace is not large enough; allocating a new workspace of %zu bytes",
            newWorkspace->size());
        newWorkspace->allocate();
        curWorkspaceSize = newWorkspace->size();
        curWorkspace = newWorkspace;
    }
    return curWorkspace.get();
}

class Fp4GemmAllreduceRunner : public torch::CustomClassHolder
{
public:
    explicit Fp4GemmAllreduceRunner(at::ScalarType outputDtype, int64_t rank, torch::List<int64_t> group)
        : mOutputDtype(outputDtype)
        , mRank(rank)
    {
        for (int64_t rank : group)
        {
            mGroup.insert(static_cast<int>(rank));
        }

        if (outputDtype == at::ScalarType::Half)
        {
            using Traits = GemmTypes<cutlass::float_e2m1_t, cutlass::float_e2m1_t, cutlass::half_t, cutlass::half_t,
                cutlass::float_ue4m3_t, cutlass::float_ue4m3_t, cutlass::layout::RowMajor, cutlass::layout::ColumnMajor,
                cutlass::layout::RowMajor, cutlass::layout::RowMajor>;
            mRunner = std::make_shared<GemmAllReduceImplRunner<Traits>>();
        }
        else if (outputDtype == at::ScalarType::BFloat16)
        {
            using Traits = GemmTypes<cutlass::float_e2m1_t, cutlass::float_e2m1_t, cutlass::bfloat16_t,
                cutlass::bfloat16_t, cutlass::float_ue4m3_t, cutlass::float_ue4m3_t, cutlass::layout::RowMajor,
                cutlass::layout::ColumnMajor, cutlass::layout::RowMajor, cutlass::layout::RowMajor>;
            mRunner = std::make_shared<GemmAllReduceImplRunner<Traits>>();
        }
        else
        {
            C10_THROW_ERROR(NotImplementedError, "Unsupported input or output dtype");
        }

        mConfigs = mRunner->getSupportedLaunchConfigs();
    }

    at::Tensor runGemm(at::Tensor const& mat1, at::Tensor const& mat2, at::Tensor const& mat1Scale,
        at::Tensor const& mat2Scale, at::Tensor const& alpha, int64_t configIdx) const
    {
        if (configIdx < 0)
            configIdx = 0;

        TORCH_CHECK(configIdx < int64_t(mConfigs.size()), "configIdx out of bounds");
        const int64_t M = mat1.size(0);
        const int64_t N = mat2.size(0);
        // FP4 operands pack two e2m1 values per byte, so the logical K is twice the stored width.
        const int64_t K = mat1.size(1) * 2;

        GemmAllReduceImplInterface::ProblemArgs problemArgs;
        problemArgs.argProblemShape(M, N, K, 1);
        problemArgs.argA(mat1.data_ptr());
        problemArgs.argB(mat2.data_ptr());
        problemArgs.argAScale(mat1Scale.data_ptr());
        problemArgs.argBScale(mat2Scale.data_ptr());
        problemArgs.argC(nullptr);
        problemArgs.argAlphaPtr(reinterpret_cast<float const*>(alpha.const_data_ptr()));
        problemArgs.argBeta(0.f);
        problemArgs.argRanks(mRank, mGroup);
        problemArgs.argLaunchConfig(mConfigs[configIdx]);

        // The output D lives in NVLS (multicast) memory so the fused all-reduce can write across ranks.
        size_t dSize = M * N * c10::elementSize(mOutputDtype);
        auto handle = tensorrt_llm::runtime::ipcNvlsAllocate(dSize, mGroup);
        problemArgs.argD((void*) handle->uc_ptr, (void*) handle->mc_ptr, (void**) handle->ipc_uc_ptrs.data());

        auto workspace = getWorkspace(mRunner.get(), problemArgs);
        problemArgs.argWorkspace(workspace);

        auto stream = at::cuda::getCurrentCUDAStream(mat1.get_device());
        mRunner->run(problemArgs, stream);

        auto options = mat1.options().dtype(mOutputDtype);
        // Release the NVLS allocation when the returned tensor is freed.
        auto deleter = [=](void* unused) { ipcNvlsFree(handle); };
        auto D = at::from_blob((void*) handle->uc_ptr, {M, N}, {N, 1}, deleter, options);
        return D;
    }

    int64_t getNumConfigs() const
    {
        return static_cast<int64_t>(mConfigs.size());
    }

private:
    at::ScalarType mOutputDtype;
    int mRank;
    std::set<int> mGroup;
    std::shared_ptr<GemmAllReduceImplInterface> mRunner{nullptr};
    std::vector<GemmAllReduceImplInterface::LaunchConfig> mConfigs;
};

} // namespace torch_ext

TORCH_LIBRARY_FRAGMENT(trtllm, m)
{
    m.class_<torch_ext::Fp4GemmAllreduceRunner>("Fp4GemmAllreduceRunner")
        .def(torch::init<at::ScalarType, int64_t, torch::List<int64_t>>())
        .def("run_gemm", &torch_ext::Fp4GemmAllreduceRunner::runGemm)
        .def("get_num_configs", &torch_ext::Fp4GemmAllreduceRunner::getNumConfigs);
}
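
A note on the shape conventions implied by runGemm above: the FP4 operands are stored packed, two e2m1 values per byte along K, so the activation is read as [M, K/2], the weight as [N, K/2], and the all-reduced output D is [M, N] in the requested output dtype. A minimal Python sketch of that arithmetic (the helper name is illustrative, not part of this commit):

def nvfp4_gemm_allreduce_shapes(mat1_shape, mat2_shape):
    """Mirror the shape logic in Fp4GemmAllreduceRunner::runGemm (illustrative helper)."""
    m, k_packed = mat1_shape      # activation: [M, K/2], two e2m1 values per byte
    n, k_packed_b = mat2_shape    # weight:     [N, K/2], same packing
    assert k_packed == k_packed_b, "A and B must agree on the packed K dimension"
    k = k_packed * 2              # logical K, as computed by `mat1.size(1) * 2`
    return (m, n, k), (m, n)      # GEMM problem shape and output shape

problem, out_shape = nvfp4_gemm_allreduce_shapes((128, 256), (4096, 256))
print(problem)    # (128, 4096, 512)
print(out_shape)  # (128, 4096)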

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 93 additions & 0 deletions
@@ -1621,3 +1621,96 @@ def record_stream(tensor: torch.Tensor, stream_id: int) -> None:
     stream = get_stream(stream_id)
     assert stream is not None
     tensor.record_stream(stream)


class Fp4GemmAllreduceRunner(TunableRunner):
    runner_dict = dict()
    tuning_config = TuningConfig(
        dynamic_tensor_specs=(DynamicTensorSpec(
            0, 0, get_last_power_of_2_num_tokens_buckets,
            last_positive_power_of_2), ),
        constraint_specs=(ConstraintSpec(2, 0, fp4_scale_infer_shape), ))

    def __init__(
        self,
        output_dtype: torch.dtype,
        tp_rank: int,
        tp_group: List[int],
    ):
        self.output_dtype = output_dtype
        self.tp_rank = tp_rank
        self.tp_group_str = '-'.join(str(g) for g in tp_group)
        instance_key = (output_dtype, self.tp_group_str)
        if instance_key not in Fp4GemmAllreduceRunner.runner_dict:
            Fp4GemmAllreduceRunner.runner_dict[
                instance_key] = torch.classes.trtllm.Fp4GemmAllreduceRunner(
                    output_dtype, tp_rank, tp_group)
        self.fp4_gemm_all_reduce_runner = Fp4GemmAllreduceRunner.runner_dict[
            instance_key]

    def unique_id(self):
        return (self.output_dtype, self.tp_group_str)

    def get_valid_tactics(self, inputs: List[torch.Tensor],
                          profile: OptimizationProfile, **kwargs) -> List[int]:
        return list(range(self.fp4_gemm_all_reduce_runner.get_num_configs()))

    def forward(
        self,
        inputs: List[torch.Tensor],
        tactic: int = 0,
    ) -> torch.Tensor:
        mat1, mat2, mat1_scale, mat2_scale, global_scale = inputs
        return self.fp4_gemm_all_reduce_runner.run_gemm(
            mat1,
            mat2,
            mat1_scale,
            mat2_scale,
            global_scale,
            tactic,
        )


@torch.library.custom_op("trtllm::nvfp4_gemm_allreduce", mutates_args=())
def nvfp4_gemm_allreduce(
    act_fp4: torch.Tensor,
    weight: torch.Tensor,
    act_sf: torch.Tensor,
    weight_scale: torch.Tensor,
    alpha: torch.Tensor,
    output_dtype: torch.dtype,
    tp_rank: int,
    tp_group: List[int],
) -> torch.Tensor:
    tuner = AutoTuner.get()

    # Use Cutlass runner with predefined configs
    nvfp4_gemm_allreduce_runner = Fp4GemmAllreduceRunner(
        output_dtype, tp_rank, tp_group)

    runner_type = type(nvfp4_gemm_allreduce_runner).__name__
    _, best_tactic = tuner.choose_one(
        f"trtllm::nvfp4_gemm_allreduce::{runner_type}",
        [nvfp4_gemm_allreduce_runner],
        nvfp4_gemm_allreduce_runner.tuning_config,
        [act_fp4, weight, act_sf, weight_scale, alpha],
    )

    return nvfp4_gemm_allreduce_runner(
        inputs=[act_fp4, weight, act_sf, weight_scale, alpha],
        tactic=best_tactic)


@nvfp4_gemm_allreduce.register_fake
def _(
    act_fp4: torch.Tensor,
    weight: torch.Tensor,
    act_sf: torch.Tensor,
    weight_scale: torch.Tensor,
    alpha: torch.Tensor,
    output_dtype: torch.dtype,
    tp_rank: int,
    tp_group: List[int],
) -> torch.Tensor:
    return act_fp4.new_empty((act_fp4.size(0), weight.size(0)),
                             dtype=output_dtype)
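
For completeness, a minimal call sketch of the new op from a tensor-parallel worker. The wrapper name, the bfloat16 output choice, and the assumption that the packed NVFP4 operands and their scale factors already exist are illustrative and not part of this diff; every rank in tp_group must invoke the op collectively, and the returned tensor already holds the all-reduced result.

import torch
from typing import List

def fused_linear_allreduce(act_fp4: torch.Tensor, weight: torch.Tensor,
                           act_sf: torch.Tensor, weight_scale: torch.Tensor,
                           alpha: torch.Tensor, tp_rank: int,
                           tp_group: List[int]) -> torch.Tensor:
    # Single fused GEMM + all-reduce call; no separate allreduce is needed afterwards.
    return torch.ops.trtllm.nvfp4_gemm_allreduce(act_fp4, weight, act_sf,
                                                 weight_scale, alpha,
                                                 torch.bfloat16, tp_rank,
                                                 tp_group)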
