-
Notifications
You must be signed in to change notification settings - Fork 80
Symmetric memory pytorch backends #6023
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
14fd212
5646c03
14816aa
6996d05
49d669c
8962475
62c6945
67181c8
eea57d8
a9ddffd
f9cac71
8e62ccc
1be0134
3596301
9b05915
6147139
b5a2418
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,7 +14,9 @@ | |
| #include <numeric> | ||
|
|
||
| #ifdef NVFUSER_DISTRIBUTED | ||
| #include <torch/csrc/distributed/c10d/GroupRegistry.hpp> | ||
| #include <torch/csrc/distributed/c10d/PrefixStore.hpp> | ||
| #include <torch/csrc/distributed/c10d/ProcessGroup.hpp> | ||
| #include <torch/csrc/distributed/c10d/exception.h> | ||
| #ifdef USE_C10D_NCCL | ||
| #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp> | ||
|
|
@@ -121,7 +123,8 @@ bool parseEnv( | |
| } | ||
|
|
||
| // retrieves master port | ||
| if ((env = std::getenv("NVFUSER_MASTER_PORT")) != nullptr) { | ||
| env = std::getenv("NVFUSER_MASTER_PORT"); | ||
| if (env != nullptr) { | ||
| master_port = std::atoi(env); | ||
| } else { | ||
| LOG(INFO) << "The environment variable NVFUSER_MASTER_PORT has not been " | ||
|
|
@@ -248,10 +251,10 @@ void waitForDebuggerAtRanks( | |
| std::cerr << "Process " << pid | ||
| << " is waiting for the debugger. To continue debugging, " | ||
| << "start gdb, `attach " << pid | ||
| << "`, `set var waiting=false`, and `fini`." << std::endl; | ||
| << "`, `set var waiting=false`, and `fini`.\n"; | ||
| while (waiting) { // Please change `waiting` in the debugger. | ||
| } | ||
| std::cerr << "Process " << getpid() << " finished waiting." << std::endl; | ||
| std::cerr << "Process " << getpid() << " finished waiting.\n"; | ||
| } | ||
|
|
||
| if (communicator->is_available()) { | ||
|
|
@@ -349,19 +352,25 @@ void Communicator::cleanup() { | |
|
|
||
| store_ = nullptr; | ||
|
|
||
| #if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL) | ||
| #if defined(NVFUSER_DISTRIBUTED) | ||
| #if defined(USE_C10D_NCCL) | ||
| // Sort backends to work around a NCCL bug (nvbugs/4889623). Closing backends | ||
| // in different orders between ranks have been causing a hang. | ||
| std::vector<std::pair<std::string, c10::intrusive_ptr<c10d::Backend>>> | ||
| keyed_backends(backends_.begin(), backends_.end()); | ||
| std::sort(keyed_backends.begin(), keyed_backends.end()); | ||
| std::ranges::sort(keyed_backends.begin(), keyed_backends.end()); | ||
| for (auto& [key, backend] : keyed_backends) { | ||
| // Call shutdown before destructing a ProcessGroupNCCL as instructed by | ||
| // https://github.com/pytorch/pytorch/blob/e62073d7997c9e63896cb5289ffd0874a8cc1838/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1164-L1170. | ||
| if (auto* pg_nccl = dynamic_cast<c10d::ProcessGroupNCCL*>(backend.get())) { | ||
| pg_nccl->shutdown(); | ||
| } | ||
| } | ||
| #endif | ||
| for (const auto& entry : process_groups_) { | ||
| c10d::unregister_process_group(entry.first); | ||
| } | ||
| process_groups_.clear(); | ||
| #endif | ||
| backends_.clear(); | ||
| } | ||
|
|
@@ -388,7 +397,7 @@ c10d::Backend* Communicator::getBackendForTeam( | |
| #ifdef NVFUSER_DISTRIBUTED | ||
| backends_[team_key] = [&]() -> c10::intrusive_ptr<c10d::Backend> { | ||
| // check that the caller's rank belongs to the requested team | ||
| auto rank_it = std::find(team.begin(), team.end(), deviceId()); | ||
| auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); | ||
| if (rank_it == team.end()) { | ||
| return nullptr; | ||
| } | ||
|
|
@@ -402,6 +411,28 @@ c10d::Backend* Communicator::getBackendForTeam( | |
| }(); | ||
| #else | ||
| backends_[team_key] = nullptr; | ||
| #endif | ||
| #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) | ||
| std::optional<c10d::ProcessGroup::BackendType> pg_backend = | ||
| (b == CommunicatorBackend::kNccl) | ||
| ? std::optional<c10d::ProcessGroup::BackendType>( | ||
| c10d::ProcessGroup::BackendType::NCCL) | ||
| : std::nullopt; | ||
| if (backends_[team_key] != nullptr && pg_backend.has_value()) { | ||
| auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); | ||
| RankType team_rank = std::distance(team.begin(), rank_it); | ||
|
|
||
| auto pg = c10::make_intrusive<c10d::ProcessGroup>( | ||
| c10::make_intrusive<c10d::PrefixStore>(team_key, store_), | ||
| team_rank, | ||
| static_cast<int>(team.size())); | ||
| pg->setBackend(c10::DeviceType::CUDA, *pg_backend, backends_[team_key]); | ||
| pg->setDefaultBackend(*pg_backend); | ||
| pg->setGroupName(team_key); | ||
|
|
||
| c10d::register_process_group(team_key, pg); | ||
| process_groups_[team_key] = std::move(pg); | ||
| } | ||
|
Comment on lines
+415
to
+435
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The ProcessGroup registration should not be gated solely on first-time backend creation. Consider also checking that no ProcessGroup has been registered for the team yet: if (backends_[team_key] != nullptr && pg_backend.has_value()
&& process_groups_.count(team_key) == 0) {
// ... create and register ProcessGroup
} |
||
| #endif | ||
| } | ||
| return backends_.at(team_key).get(); | ||
|
|
@@ -424,4 +455,13 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) { | |
| getWorld(backend)->barrier(options)->wait(); | ||
| } | ||
|
|
||
| std::string Communicator::getSymmMemGroupKey( | ||
| std::optional<CommunicatorBackend> backend) { | ||
| std::vector<RankType> all_ranks(size_); | ||
| std::iota(all_ranks.begin(), all_ranks.end(), 0); | ||
| CommunicatorBackend b = backend.value_or(default_backend_); | ||
| (void)getBackendForTeam(all_ranks, b); | ||
| return getTeamKey(all_ranks, b); | ||
| } | ||
|
|
||
| } // namespace nvfuser | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,8 +11,9 @@ | |
| #include <ATen/core/ivalue.h> | ||
| #include <c10/util/intrusive_ptr.h> | ||
|
|
||
| #ifdef NVFUSER_DISTRIBUTED | ||
| #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) | ||
| #include <torch/csrc/distributed/c10d/Backend.hpp> | ||
| #include <torch/csrc/distributed/c10d/ProcessGroup.hpp> | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This header should always be present, no? |
||
| #include <torch/csrc/distributed/c10d/TCPStore.hpp> | ||
| #include <torch/csrc/distributed/c10d/Work.hpp> | ||
| #else | ||
|
|
@@ -110,6 +111,10 @@ class NVF_API Communicator { | |
| c10d::Backend* getWorld( | ||
| std::optional<CommunicatorBackend> backend = std::nullopt); | ||
|
|
||
| // Returns the world process-group name for the given backend. | ||
| std::string getSymmMemGroupKey( | ||
| std::optional<CommunicatorBackend> backend = std::nullopt); | ||
|
|
||
| // returns if a backend is available for creation | ||
| bool isBackendAvailable(CommunicatorBackend backend) const { | ||
| if (backend == CommunicatorBackend::kUcc) { | ||
|
|
@@ -153,6 +158,11 @@ class NVF_API Communicator { | |
| c10::intrusive_ptr<c10d::TCPStore> store_; | ||
| // cache for the created backends. The keys are strings generated from Teams | ||
| std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_; | ||
| // c10d process-group wrappers registered for symmetric-memory rendezvous. | ||
| #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) | ||
| std::unordered_map<std::string, c10::intrusive_ptr<c10d::ProcessGroup>> | ||
| process_groups_; | ||
| #endif | ||
| }; | ||
|
|
||
| } // namespace nvfuser | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`process_groups_` cleanup guard mismatch — compile error when `NVFUSER_DISTRIBUTED` is set without `USE_DISTRIBUTED`.
`process_groups_` is declared in `communicator.h` under `#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)`, but the cleanup loop here lives under the broader `#if defined(NVFUSER_DISTRIBUTED)` (without the `USE_DISTRIBUTED` guard). When a build defines `NVFUSER_DISTRIBUTED` but not `USE_DISTRIBUTED`, `process_groups_` does not exist as a member, yet this code tries to iterate over it — a hard compile error. `c10d::unregister_process_group` (from `GroupRegistry.hpp`) is already included under `#ifdef NVFUSER_DISTRIBUTED`, so fixing just the guard on these lines is sufficient:
    #if defined(USE_DISTRIBUTED)
      for (const auto& entry : process_groups_) {
        c10d::unregister_process_group(entry.first);
      }
      process_groups_.clear();
    #endif
(The surrounding `#if defined(NVFUSER_DISTRIBUTED)` / `#endif` already provides the outer distributed guard.)