Skip to content

Commit cc405f6

Browse files
[Distributed] fix recreate nccl comm bug (#73625) (#74168)
1 parent 4f714b7 commit cc405f6

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

paddle/fluid/distributed/collective/process_group_nccl.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1000,8 +1000,8 @@ void ProcessGroupNCCL::Restart() {
10001000
phi::distributed::P2POption p2p_opts = place_to_p2p_opts_.at(place_key);
10011001
phi::distributed::CommContextManager::RecreateNCCLComm(
10021002
store_, store_key, rank_, std::to_string(create_count_), &p2p_opts);
1003-
create_count_++;
10041003
}
1004+
create_count_++;
10051005
}
10061006
phi::CUDAStream ProcessGroupNCCL::GetStream(const Place& place) {
10071007
const auto& place_key = GetKeyFromPlace(place);

paddle/fluid/distributed/collective/process_group_nccl.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#pragma once
1616

1717
#include <chrono>
18+
#include <map>
1819
#include <memory>
1920
#include <string>
2021
#include <unordered_map>
@@ -287,7 +288,7 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream {
287288

288289
uint64_t comm_seq_{0};
289290
std::unordered_map<std::string, uint64_t> p2p_comm_seq_;
290-
std::unordered_map<std::string, std::string> place_to_group_key_;
291+
std::map<std::string, std::string> place_to_group_key_;
291292

292293
// TODO(sunyilun): attrs below will be removed later
293294
std::mutex mutex_;

paddle/phi/core/distributed/comm_context_manager.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ void CommContextManager::CreateNCCLCommContext(
131131
void CommContextManager::RecreateNCCLComm(const std::shared_ptr<Store>& store,
132132
const std::string& unique_comm_key,
133133
int rank,
134-
const std::string& hash_key,
134+
const std::string& recreate_key,
135135
const P2POption* p2p_opt) {
136136
auto& comm_context_manager = CommContextManager::GetInstance();
137137

@@ -140,7 +140,8 @@ void CommContextManager::RecreateNCCLComm(const std::shared_ptr<Store>& store,
140140
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id));
141141
}
142142

143-
std::string unique_key = "NCCLCommContext/" + unique_comm_key + hash_key;
143+
std::string unique_key =
144+
"NCCLCommContext/" + unique_comm_key + "/" + recreate_key;
144145
if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) {
145146
std::vector<uint8_t> nccl_id_wrapper(
146147
reinterpret_cast<uint8_t*>(&nccl_id),

0 commit comments

Comments
 (0)