
Commit 0adcd45

Merge branch 'main' into feature/adopt-ds-v3.2-encode
2 parents: 3e5ef4b + 80e5be6

File tree: 16 files changed (+347, -31 lines)

rtp_llm/cpp/config/ConfigModules.cc

Lines changed: 2 additions & 1 deletion

@@ -152,7 +152,8 @@ std::string HWKernelConfig::to_string() const {
         << "num_native_cuda_graph: " << num_native_cuda_graph << "\n"
         << "prefill_capture_seq_lens size: " << prefill_capture_seq_lens.size() << "\n"
         << "decode_capture_batch_sizes size: " << decode_capture_batch_sizes.size() << "\n"
-        << "disable_dpc_random: " << disable_dpc_random;
+        << "disable_dpc_random: " << disable_dpc_random << "\n"
+        << "rocm_disable_custom_ag: " << rocm_disable_custom_ag;
     return oss.str();
 }

rtp_llm/cpp/config/ConfigModules.h

Lines changed: 1 addition & 0 deletions

@@ -153,6 +153,7 @@ struct HWKernelConfig {
     // Comma-separated list of batch sizes, e.g., "1,2,4,8,16,32"
     std::vector<int> decode_capture_batch_sizes;
     bool disable_dpc_random = false;
+    bool rocm_disable_custom_ag = true;
     std::string to_string() const;
 };
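The flag defaults to true, so the custom all-gather path stays off unless a deployment opts in. The log message in custom_ar_comm.cc below refers to a ROCM_DISABLE_CUSTOM_AG environment variable, which suggests the config field is populated from the environment elsewhere in the stack; a minimal sketch of that kind of parsing, assuming a hypothetical boolFromEnv helper that is not part of this commit:

#include <cstdlib>
#include <cstring>

// Hypothetical helper: read a "0"/"1" flag from the environment,
// keeping the compiled-in default when the variable is unset or empty.
static bool boolFromEnv(const char* name, bool default_value) {
    const char* value = std::getenv(name);
    if (value == nullptr || std::strlen(value) == 0) {
        return default_value;
    }
    return std::strcmp(value, "1") == 0;
}

// Sketch of populating the new field (an assumption, not shown in this diff):
// hw_kernel_config.rocm_disable_custom_ag = boolFromEnv("ROCM_DISABLE_CUSTOM_AG", true);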

rtp_llm/cpp/devices/rocm_impl/ROCmDevice.cc

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ ROCmDevice::ROCmDevice(const DeviceInitParams& params): DeviceBase(params) {
     auto&               nccl_param = tp_nccl_param_;
     std::vector<size_t> tp_ranks   = fcNcclGatherRanks(nccl_param, stream_);
     // Initialization may fail, and the variable will still be nullptr. When allreduce is called, it will fall back to the normal allreduce.
-    custom_allreduce_comm_ = initCustomAllReduceComm(nccl_param, tp_ranks, stream_);
+    custom_allreduce_comm_ = initCustomAllReduceComm(nccl_param, tp_ranks, stream_, params.hw_kernel_config);
     quick_allreduce_comm_ = initQuickAllReduceComm(nccl_param, tp_ranks, stream_);
 }
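The comment kept in this hunk states the contract: initCustomAllReduceComm may legitimately return nullptr, and the all-reduce call site then falls back to the plain NCCL path. A self-contained toy of that optional-fast-path pattern (FastComm and tryInitFastComm are illustrative names, not the real types):

#include <iostream>
#include <memory>

// Toy stand-in for CustomAllReduceComm: construction can "fail" and the
// caller must treat nullptr as "use the generic path".
struct FastComm {
    void allReduce() { std::cout << "custom all-reduce\n"; }
};

std::unique_ptr<FastComm> tryInitFastComm(bool init_ok) {
    return init_ok ? std::make_unique<FastComm>() : nullptr;
}

int main() {
    auto comm = tryInitFastComm(false);  // initialization failed or was disabled
    if (comm) {
        comm->allReduce();
    } else {
        std::cout << "fallback: plain NCCL all-reduce\n";  // the documented fallback
    }
}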

rtp_llm/cpp/devices/rocm_impl/ROCmDistributedOp.cc

Lines changed: 25 additions & 0 deletions

@@ -162,6 +162,31 @@ void ROCmDevice::allGather(const AllGatherParams& params) {
                      "Buffer size %ld must be divisible by world size %d",
                      recv_buffer->size(),
                      nccl_param.world_size_);
+
+        // invoke aiter custom all-gather
+        // custom all-gather is integrated into custom all-reduce
+        bool use_custom_ag =
+            params.mode == ParallelMode::TP
+            and custom_allreduce_comm_
+            and custom_allreduce_comm_->checkAllGatherAvailable();
+
+        if (use_custom_ag) {
+            torch::Tensor input_tensor;
+
+            if (params.inplace) {
+                auto option_ = torch::dtype(dataTypeToTorchType(recv_buffer->type())).device(memoryTypeToTorchDevice(recv_buffer->where())).requires_grad(false);
+                std::vector<int64_t> shape_{static_cast<int64_t>(data_num)};
+                input_tensor = torch::from_blob(recv_buffer->dataWithOffset(nccl_param.rank_ * data_num), shape_, option_);
+            } else {
+                input_tensor = Buffer2torchTensor(*(params.send_buffers[i]), false);
+            }
+            torch::Tensor output_tensor = Buffer2torchTensor(*recv_buffer, false);
+
+            custom_allreduce_comm_->allGather(input_tensor, output_tensor);
+
+            continue;
+        }
+
         if (params.inplace) {
             const auto data_size = data_num * recv_buffer->typeSize();
             NCCLCHECK(ncclAllGather((char*)(recv_buffer->data()) + nccl_param.rank_ * data_size,
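In the inplace branch above, the sender's slice of recv_buffer is wrapped as a tensor with torch::from_blob, which views existing memory without copying or taking ownership; dataWithOffset supplies the pointer to this rank's segment. A self-contained sketch of the same zero-copy aliasing, on plain CPU floats (the diff builds the options from the buffer's real dtype and device):

#include <torch/torch.h>
#include <iostream>
#include <vector>

int main() {
    // Flat buffer standing in for recv_buffer: world_size=2, data_num=4.
    std::vector<float> recv(8, 0.0f);
    const int64_t rank = 1, data_num = 4;

    // View this rank's slice without copying, as the inplace branch does.
    auto options = torch::dtype(torch::kFloat32).requires_grad(false);
    torch::Tensor slice =
        torch::from_blob(recv.data() + rank * data_num, {data_num}, options);

    slice.fill_(3.0f);             // writes land in the underlying vector
    std::cout << recv[4] << "\n";  // prints 3: the tensor aliases recv[4..7]
}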

rtp_llm/cpp/model_rpc/PrefillRpcServer.cc

Lines changed: 4 additions & 7 deletions

@@ -131,8 +131,7 @@ void PrefillRpcServer::getRpcConnection(PrefillGenerateContext& prefill_context)

     // If no host specified in request, check if there's a master role
     char* remote_rpc_server_ip_env = std::getenv("REMOTE_RPC_SERVER_IP");
-    bool  has_master_role =
-        (remote_rpc_server_ip_env != nullptr && strlen(remote_rpc_server_ip_env) > 0);
+    bool  has_master_role = (remote_rpc_server_ip_env != nullptr && strlen(remote_rpc_server_ip_env) > 0);

     // If no host specified in request and no master role, this is a direct prefill request
     // In this case, we still need to select decode machines as specified in the requirements

@@ -293,7 +292,6 @@ void PrefillRpcServer::remoteGenerate(PrefillGenerateContext& prefill_context) {
     generate_request.mutable_propose_token_ids()->CopyFrom(
         {stream->getProposeToken().begin(), stream->getProposeToken().end()});

-    // TODO(yinzhi): trans propose probs and hidden states
     auto sp_output_buffer = stream->getSPOutputBuffer();

     if (sp_output_buffer) {

@@ -407,11 +405,10 @@ grpc::Status PrefillRpcServer::GenerateStreamCall(grpc::ServerContext*
                                            meta_);
     prefill_context.onflight_requests      = onflight_requests_;
     prefill_context.loading_cache_requests = loading_cache_requests_;
-

-    auto max_retry_times       = maga_init_params_.pd_sep_config.prefill_retry_times;
-    auto max_retry_timeout_ms  = maga_init_params_.pd_sep_config.prefill_retry_timeout_ms;
-    int  retry_interval_ms     = 1;
+    auto max_retry_times      = maga_init_params_.pd_sep_config.prefill_retry_times;
+    auto max_retry_timeout_ms = maga_init_params_.pd_sep_config.prefill_retry_timeout_ms;
+    int  retry_interval_ms    = 1;

     try {
         EXECUTE_WITH_RETRY(
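The three variables feed EXECUTE_WITH_RETRY, whose definition is not part of this diff; presumably it re-invokes the prefill up to prefill_retry_times attempts within a prefill_retry_timeout_ms budget, sleeping retry_interval_ms between attempts. A generic sketch of such a bounded retry loop, under those assumptions only:

#include <chrono>
#include <functional>
#include <thread>

// Assumed semantics only: the real EXECUTE_WITH_RETRY macro is not shown here.
bool retryWithTimeout(const std::function<bool()>& attempt,
                      int max_retry_times,
                      int max_retry_timeout_ms,
                      int retry_interval_ms) {
    const auto deadline = std::chrono::steady_clock::now()
                          + std::chrono::milliseconds(max_retry_timeout_ms);
    for (int i = 0; i < max_retry_times; ++i) {
        if (attempt()) {
            return true;  // success, stop retrying
        }
        if (std::chrono::steady_clock::now() >= deadline) {
            break;        // time budget exhausted
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(retry_interval_ms));
    }
    return false;
}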

rtp_llm/cpp/pybind/ConfigInit.cc

Lines changed: 5 additions & 2 deletions

@@ -411,6 +411,7 @@ PYBIND11_MODULE(libth_transformer_config, m) {
         .def_readwrite("prefill_capture_seq_lens", &HWKernelConfig::prefill_capture_seq_lens)
         .def_readwrite("decode_capture_batch_sizes", &HWKernelConfig::decode_capture_batch_sizes)
         .def_readwrite("disable_dpc_random", &HWKernelConfig::disable_dpc_random)
+        .def_readwrite("rocm_disable_custom_ag", &HWKernelConfig::rocm_disable_custom_ag)
         .def("to_string", &HWKernelConfig::to_string)
         .def(py::pickle(
             [](const HWKernelConfig& self) {

@@ -427,10 +428,11 @@ PYBIND11_MODULE(libth_transformer_config, m) {
                     self.num_native_cuda_graph,
                     self.prefill_capture_seq_lens,
                     self.decode_capture_batch_sizes,
-                    self.disable_dpc_random);
+                    self.disable_dpc_random,
+                    self.rocm_disable_custom_ag);
             },
             [](py::tuple t) {
-                if (t.size() != 14)
+                if (t.size() != 15)
                     throw std::runtime_error("Invalid state!");
                 HWKernelConfig c;
                 try {

@@ -448,6 +450,7 @@ PYBIND11_MODULE(libth_transformer_config, m) {
                     c.prefill_capture_seq_lens = t[11].cast<std::vector<int>>();
                     c.decode_capture_batch_sizes = t[12].cast<std::vector<int>>();
                     c.disable_dpc_random = t[13].cast<bool>();
+                    c.rocm_disable_custom_ag = t[14].cast<bool>();
                 } catch (const std::exception& e) {
                     throw std::runtime_error(std::string("HWKernelConfig unpickle error: ") + e.what());
                 }
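Extending a pickled struct means touching three places in lockstep: the def_readwrite binding, the __getstate__ tuple, and the __setstate__ size check plus cast. A minimal self-contained py::pickle example of the same pattern (toy Config type, not the real HWKernelConfig binding):

#include <pybind11/pybind11.h>
#include <stdexcept>

namespace py = pybind11;

struct Config {
    bool flag_a = false;
    bool flag_b = true;  // newly added field -> one more tuple slot below
};

PYBIND11_MODULE(toy_config, m) {
    py::class_<Config>(m, "Config")
        .def(py::init<>())
        .def_readwrite("flag_a", &Config::flag_a)
        .def_readwrite("flag_b", &Config::flag_b)
        .def(py::pickle(
            // __getstate__: one slot per field, in a fixed order.
            [](const Config& self) { return py::make_tuple(self.flag_a, self.flag_b); },
            // __setstate__: the size check must track the tuple above.
            [](py::tuple t) {
                if (t.size() != 2)
                    throw std::runtime_error("Invalid state!");
                Config c;
                c.flag_a = t[0].cast<bool>();
                c.flag_b = t[1].cast<bool>();
                return c;
            }));
}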

rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.cc

Lines changed: 26 additions & 9 deletions

@@ -14,13 +14,15 @@ using namespace std;

 namespace rtp_llm {

-CustomAllReduceComm::CustomAllReduceComm(const std::vector<size_t>& tp_ranks, size_t rank, size_t rank_index):
+CustomAllReduceComm::CustomAllReduceComm(const std::vector<size_t>& tp_ranks, size_t rank, size_t rank_index, const HWKernelConfig& hw_kernel_config):
     rank_(rank),
     rank_index_(rank_index),
     world_size_(tp_ranks.size()),
     support_nv_link_(true),  // TODO(liyangcheng.lyc): add check function
     comm_buf_threshold_(getCommBufThreshold()),
-    tp_ranks_(std::move(tp_ranks)) {}
+    tp_ranks_(std::move(tp_ranks)),
+    ft_disable_custom_ar_(hw_kernel_config.ft_disable_custom_ar),
+    rocm_disable_custom_ag_(hw_kernel_config.rocm_disable_custom_ag) {}

 CustomAllReduceComm::~CustomAllReduceComm() {
     aiter::dispose(fa_);

@@ -41,6 +43,15 @@ bool CustomAllReduceComm::checkAllReduceAvailable(size_t elts_total_num, DataTyp
     return false;
 }

+bool CustomAllReduceComm::checkAllGatherAvailable() {
+    if (rocm_disable_custom_ag_) {
+        RTP_LLM_LOG_INFO("Disable custom ag since ROCM_DISABLE_CUSTOM_AG is set");
+        return false;
+    }
+
+    return true;
+}
+
 void CustomAllReduceComm::allReduce(torch::Tensor& input_tensor, torch::Tensor& output_tensor) {
     if (at::hip::currentStreamCaptureStatusMayInitCtx() != at::hip::CaptureStatus::None) {
         aiter::all_reduce(fa_, input_tensor, output_tensor, false, std::nullopt);

@@ -49,6 +60,14 @@ void CustomAllReduceComm::allReduce(torch::Tensor& input_tensor, torch::Tensor&
     }
 }

+void CustomAllReduceComm::allGather(torch::Tensor& input_tensor, torch::Tensor& output_tensor) {
+    if (at::hip::currentStreamCaptureStatusMayInitCtx() != at::hip::CaptureStatus::None) {
+        aiter::all_gather_reg(fa_, input_tensor, output_tensor);
+    } else {
+        aiter::all_gather_unreg(fa_, input_tensor, buffer_, output_tensor);
+    }
+}
+
 void CustomAllReduceComm::registerGraphBuffers() {
     auto handle_and_offset = aiter::get_graph_buffer_ipc_meta(fa_);  // tuple<tensor, vector<int64_t>> -> vector<tensor> size=2
     auto handle = std::get<0>(handle_and_offset);

@@ -144,7 +163,7 @@ CustomAllReduceComm::prepareP2PBuffer_(const NcclParam& nccl_para, torch::Tensor
     return handles;
 }

-bool CustomAllReduceComm::shouldCustomAR(const std::vector<size_t>& tp_ranks, size_t rank) {
+bool CustomAllReduceComm::shouldCustomAR(const std::vector<size_t>& tp_ranks, size_t rank, const HWKernelConfig& hw_kernel_config) {
     size_t world_size       = tp_ranks.size();
     size_t local_world_size = rocm::getDeviceCount();

@@ -158,9 +177,7 @@ bool CustomAllReduceComm::shouldCustomAR(const std::vector<size_t>& tp_ranks, si
     }

     // 2. check whether disabled flag is set
-    char* disable_custom_ar_str = std::getenv("FT_DISABLE_CUSTOM_AR");
-    bool  disable_custom_ar     = disable_custom_ar_str != nullptr && std::string(disable_custom_ar_str) == "1";
-    if (disable_custom_ar) {
+    if (hw_kernel_config.ft_disable_custom_ar) {
         RTP_LLM_LOG_INFO("Disable custom ar since FT_DISABLE_CUSTOM_AR is set");
         return false;
     }

@@ -186,7 +203,7 @@ size_t CustomAllReduceComm::getCommBufThreshold() {
 }

 std::unique_ptr<CustomAllReduceComm>
-initCustomAllReduceComm(const NcclParam& nccl_para, const std::vector<size_t>& tp_ranks, hipStream_t stream) {
+initCustomAllReduceComm(const NcclParam& nccl_para, const std::vector<size_t>& tp_ranks, hipStream_t stream, const HWKernelConfig& hw_kernel_config) {
     size_t rank_index = 0;
     for (size_t i = 0; i < tp_ranks.size(); i++) {
         if (tp_ranks[i] == nccl_para.rank_) {

@@ -195,11 +212,11 @@ initCustomAllReduceComm(const NcclParam& nccl_para, const std::vector<size_t>& t
         }
     }

-    if (!CustomAllReduceComm::shouldCustomAR(tp_ranks, nccl_para.rank_)) {
+    if (!CustomAllReduceComm::shouldCustomAR(tp_ranks, nccl_para.rank_, hw_kernel_config)) {
         return nullptr;
     }

-    auto comm = std::make_unique<CustomAllReduceComm>(tp_ranks, nccl_para.rank_, rank_index);
+    auto comm = std::make_unique<CustomAllReduceComm>(tp_ranks, nccl_para.rank_, rank_index, hw_kernel_config);
     comm->init(nccl_para, stream);
     RTP_LLM_LOG_INFO("Custom all reduce is enabled on rank %d of %d", nccl_para.rank_, tp_ranks.size());
     return comm;
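Besides adding the all-gather gate, this file moves the FT_DISABLE_CUSTOM_AR decision from a getenv lookup inside shouldCustomAR to a flag carried in HWKernelConfig and cached once in the constructor. A self-contained toy of that read-once, config-driven gating (sketch types, not the real classes):

#include <iostream>

// Toy config mirroring the two flags wired through in this commit.
struct HWKernelConfigSketch {
    bool ft_disable_custom_ar   = false;
    bool rocm_disable_custom_ag = true;  // default matches the header: AG off
};

// The communicator copies the flags at construction instead of consulting
// the environment on every availability check.
class CommSketch {
public:
    explicit CommSketch(const HWKernelConfigSketch& cfg)
        : disable_ag_(cfg.rocm_disable_custom_ag) {}
    bool checkAllGatherAvailable() const { return !disable_ag_; }

private:
    bool disable_ag_;
};

int main() {
    HWKernelConfigSketch cfg;
    cfg.rocm_disable_custom_ag = false;  // deployment opts in to custom AG
    CommSketch comm(cfg);
    std::cout << (comm.checkAllGatherAvailable() ? "custom AG" : "NCCL AG") << "\n";
}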

rtp_llm/cpp/rocm/custom_ar/custom_ar_comm.h

Lines changed: 11 additions & 3 deletions

@@ -10,6 +10,7 @@
 #include "rtp_llm/cpp/core/Types.h"
 #include "rtp_llm/cpp/cuda/nccl/nccl_utils.h"
 #include "rtp_llm/cpp/utils/Logger.h"
+#include "rtp_llm/cpp/config/ConfigModules.h"

 // aiter custom all reduce kernel
 #include "custom_all_reduce.h"

@@ -18,17 +19,22 @@
 namespace rtp_llm {
 class CustomAllReduceComm {
 public:
-    CustomAllReduceComm(const std::vector<size_t>& tp_ranks, size_t rank, size_t rank_index);
+    CustomAllReduceComm(const std::vector<size_t>& tp_ranks, size_t rank, size_t rank_index, const HWKernelConfig& hw_kernel_config);

     ~CustomAllReduceComm();

     void init(const NcclParam& nccl_para, hipStream_t stream);

     void allReduce(torch::Tensor& input_tensor, torch::Tensor& output_tensor);

+    // NOTE(liyangcheng.lyc): the implementation of custom all gather is placed together with custom all reduce
+    void allGather(torch::Tensor& input_tensor, torch::Tensor& output_tensor);
+
     bool checkAllReduceAvailable(size_t elts_total_num, DataType data_type, size_t world_size);

-    static bool shouldCustomAR(const std::vector<size_t>& tp_ranks, size_t rank);
+    bool checkAllGatherAvailable();
+
+    static bool shouldCustomAR(const std::vector<size_t>& tp_ranks, size_t rank, const HWKernelConfig& hw_kernel_config);

     void registerGraphBuffers();

@@ -55,9 +61,11 @@ class CustomAllReduceComm {
     torch::Tensor rank_data_;
     int64_t       fa_;
     NcclParam     nccl_para_;
+    bool          ft_disable_custom_ar_   = true;
+    bool          rocm_disable_custom_ag_ = true;
 };

 std::unique_ptr<CustomAllReduceComm>
-initCustomAllReduceComm(const NcclParam& nccl_para, const std::vector<size_t>& tp_ranks, hipStream_t stream);
+initCustomAllReduceComm(const NcclParam& nccl_para, const std::vector<size_t>& tp_ranks, hipStream_t stream, const HWKernelConfig& hw_kernel_config);

 } // namespace rtp_llm

rtp_llm/cpp/speculative_engine/SpeculativeEngine.cc

Lines changed: 50 additions & 6 deletions

@@ -563,7 +563,12 @@ absl::Status SpeculativeEngine::prefillMtpStep(std::list<GenerateStreamPtr>& str

     RTP_LLM_LOG_DEBUG("update stream");
     for (GenerateStreamPtr& stream : streams) {
-        SpeculativeExecutorStreamOutputPtr score_output = stream->getScoreStream()->getSPOutputBuffer();
+        GenerateStreamPtr score_stream = stream->getScoreStream();
+        if (checkStopAndSetError(score_stream, stream)) {
+            continue;
+        }
+
+        SpeculativeExecutorStreamOutputPtr score_output = score_stream->getSPOutputBuffer();
         StreamUpdateInfo update_info{score_output->tokens,
                                      (int)1,
                                      nullptr,

@@ -586,10 +591,17 @@ absl::Status SpeculativeEngine::prefillMtpStep(std::list<GenerateStreamPtr>& str

     propose_begin_time_us = autil::TimeUtility::currentTimeInMicroSeconds();
     RTP_LLM_LOG_DEBUG("propose model prefill");
-    THROW_IF_STATUS_ERROR(propose_executor_->propose(streams, true));
+    THROW_IF_STATUS_ERROR(propose_executor_->propose(streams));

     for (const GenerateStreamPtr& stream : streams) {
-        BufferPtr propose_tokens = stream->getProposeStream()->getSPOutputBuffer()->tokens;
+        GenerateStreamPtr propose_stream = stream->getProposeStream();
+
+        // check propose stream status
+        if (checkStopAndSetError(propose_stream, stream)) {
+            continue;
+        }
+
+        BufferPtr propose_tokens = propose_stream->getSPOutputBuffer()->tokens;
         vector<int> propose_tokens_vec;
         for (int i = 0; i < propose_tokens->shape()[1]; ++i) {
             propose_tokens_vec.push_back(propose_tokens->data<int>()[i]);

@@ -606,8 +618,7 @@ absl::Status SpeculativeEngine::prefillMtpStep(std::list<GenerateStreamPtr>& str
             RTP_LLM_LOG_DEBUG("stream [%ld] set setNeedRemoteGenerate", stream->streamId());
             stream->setNeedRemoteGenerate(true);
         }
-        auto score_stream   = stream->getScoreStream();
-        auto propose_stream = stream->getProposeStream();
+        auto score_stream = stream->getScoreStream();
         if (score_stream) {
             score_stream->setLastHiddenStates(nullptr);
             score_stream->setSPOutputBuffer(nullptr);

@@ -735,16 +746,31 @@ absl::Status SpeculativeEngine::mtpStep(std::list<GenerateStreamPtr>& streams) {
         }
     }

+    // check propose stream status
+    for (const GenerateStreamPtr& stream : streams) {
+        GenerateStreamPtr propose_stream = stream->getProposeStream();
+        checkStopAndSetError(propose_stream, stream);
+    }
+
     // base model score propose new tokens.
     {
         RTP_LLM_LOG_DEBUG("score step");
        score_begin_time_us = autil::TimeUtility::currentTimeInMicroSeconds();
         THROW_IF_STATUS_ERROR(score_executor_->score(streams));

+        std::list<GenerateStreamPtr> sample_streams;
+        for (const GenerateStreamPtr& stream : streams) {
+            GenerateStreamPtr score_stream = stream->getScoreStream();
+            if (checkStopAndSetError(score_stream, stream)) {
+                continue;
+            }
+            sample_streams.emplace_back(stream);
+        }
+
         if (device_->getDeviceProperties().tp_rank == 0) {
             RTP_LLM_LOG_DEBUG("sample step");
             sampler_begin_time_us = autil::TimeUtility::currentTimeInMicroSeconds();
-            CHECK_AND_RETURN_REF(sampler_output, speculative_sampler_->sample(streams));
+            CHECK_AND_RETURN_REF(sampler_output, speculative_sampler_->sample(sample_streams));
             RTP_LLM_LOG_DEBUG("speculative sample done");

             metrics_.propose_token_num += sampler_output.propose_token_num;

@@ -806,4 +832,22 @@ KVCacheInfo SpeculativeEngine::getCacheStatusInfo(int64_t latest_version, bool n
     return resource_context_.cache_manager->getKVCacheInfo(latest_version, need_cache_keys);
 }

+bool SpeculativeEngine::checkStopAndSetError(const GenerateStreamPtr& check_stream,
+                                             const GenerateStreamPtr& target_stream) {
+    if (target_stream->stopped()) {
+        return true;
+    }
+
+    if (check_stream && check_stream->stopped()) {
+        ErrorInfo error_info = check_stream->statusInfo();
+        if (error_info.hasError()) {
+            target_stream->setStop(error_info.code(), error_info.ToString());
+            RTP_LLM_LOG_ERROR(
+                "stream [%ld] stopped with error: %s", target_stream->streamId(), error_info.ToString().c_str());
+        }
+        return true;
+    }
+    return false;
+}
+
 } // namespace rtp_llm
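checkStopAndSetError gives every propose/score call site one uniform rule: skip streams that are already stopped, and when a child (propose or score) stream died with an error, copy that error onto the parent before skipping it, so stopped streams never reach the sampler (hence the sample_streams filtering above). A self-contained toy of that propagation contract (sketch types, not the real GenerateStream):

#include <iostream>
#include <memory>
#include <string>

// Minimal stand-in for GenerateStream: a stop flag plus an error message.
struct StreamSketch {
    bool        stopped = false;
    std::string error;
    void setStop(const std::string& err) { stopped = true; error = err; }
};
using StreamPtr = std::shared_ptr<StreamSketch>;

// Mirrors the helper's contract: true means "skip this stream".
bool checkStopAndSetError(const StreamPtr& check_stream, const StreamPtr& target_stream) {
    if (target_stream->stopped) {
        return true;  // already stopped, nothing to do
    }
    if (check_stream && check_stream->stopped) {
        if (!check_stream->error.empty()) {
            target_stream->setStop(check_stream->error);  // propagate child error
        }
        return true;
    }
    return false;  // healthy: keep processing
}

int main() {
    auto parent = std::make_shared<StreamSketch>();
    auto child  = std::make_shared<StreamSketch>();
    child->setStop("propose step failed");  // hypothetical child failure
    if (checkStopAndSetError(child, parent)) {
        std::cout << "parent stopped with: " << parent->error << "\n";
    }
}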

rtp_llm/cpp/speculative_engine/SpeculativeEngine.h

Lines changed: 2 additions & 0 deletions

@@ -142,6 +142,8 @@ class SpeculativeEngine: public EngineBase {

     bool updateEplbConfig(const EPLBConfig& config) override;

+    bool checkStopAndSetError(const GenerateStreamPtr& check_stream, const GenerateStreamPtr& target_stream);
+
 private:
     kmonitor::MetricsReporterPtr                  metrics_reporter_ = nullptr;
     std::unique_ptr<ProposeModelEngineInitParams> propose_model_params_;
