Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
cf77bdb
first working dispatch and combine primitive for k=1
samnordmann Jan 21, 2026
66e7811
add comments and cleanup
samnordmann Jan 21, 2026
dda9aa7
add kernel based a2av and cuda backend for d/c
samnordmann Jan 22, 2026
7aa2de8
unstable - add nixl backend
x41lakazam Feb 25, 2026
9a8a377
unstable
x41lakazam Feb 26, 2026
0f21528
add python build changes for nixl
x41lakazam Feb 26, 2026
6144827
fix typo
x41lakazam Feb 26, 2026
04a9133
merge main
x41lakazam Feb 26, 2026
b32587a
restore main:
x41lakazam Feb 26, 2026
f8a94fc
fix bug where zero-length buffer was passed to nixl
x41lakazam Feb 26, 2026
a6b6f87
Reduce probe size to 1
x41lakazam Feb 26, 2026
95460af
Address PR comments.
x41lakazam Mar 1, 2026
41ec0ac
typos
x41lakazam Mar 4, 2026
d63ffd7
set getAgentName to inline
x41lakazam Mar 4, 2026
86e5028
fix comments in nixl.cpp
x41lakazam Mar 4, 2026
7283aa8
clean ifdef USE_NIXL statements
x41lakazam Mar 4, 2026
a085c54
inline exchangeMetadata inside registerTensors
x41lakazam Mar 8, 2026
13ae58f
include deviceId (rank) inside TensorDesc
x41lakazam Mar 8, 2026
149c15a
remove useless handleImpl.isPrepared
x41lakazam Mar 8, 2026
1b41788
add thread yield in wait transfer loop
x41lakazam Mar 8, 2026
2eccaa5
remove remote_rank from prepare transfer
x41lakazam Mar 8, 2026
10d010a
add nixlbackend::impl when use_nixl is false
x41lakazam Mar 8, 2026
e9062a4
Move exchangeMetadata to private when USE_NIXL is false
x41lakazam Mar 9, 2026
9047991
fix CI: clang-format, clang-tidy, and trailing newline
x41lakazam Mar 9, 2026
b3b5fbd
fix ci
x41lakazam Mar 9, 2026
b283c2a
fix CI
x41lakazam Mar 9, 2026
c4726d0
Separate device and rank in tensordesc for more clarity
x41lakazam Mar 9, 2026
7ff4aae
fix linter
x41lakazam Mar 10, 2026
0739240
Update cmake config
x41lakazam Mar 10, 2026
5560230
Fix CI
x41lakazam Mar 10, 2026
ec01db9
Fix no-headers in NIXL install instructions (Cmake config)
x41lakazam Mar 10, 2026
67a92f0
Replace NVFUSER_STANDALONE_BUILD_WITH_NIXL by NVFUSER_BUILD_WITH_NIXL
x41lakazam Mar 10, 2026
dae35aa
move nixl linkage from CmakeList to handle_nixl.cmake
x41lakazam Mar 10, 2026
a9fb56d
move TPL locs to handle_nixl.cmake
x41lakazam Mar 10, 2026
e364949
Fix - move nixl linkage to handle_nixl.cmake
x41lakazam Mar 10, 2026
f74078f
fix linter
x41lakazam Mar 10, 2026
9847a11
Add NIXL to CI image
x41lakazam Mar 11, 2026
40758d6
remove import nixl from install-nixl.sh
x41lakazam Mar 11, 2026
bf471ea
Add transitive shared libs deps for nixl
x41lakazam Mar 12, 2026
df6c384
Add nixl*.mesonpy.libs and nixl*.libs as shared lib dirs in CI's inst…
x41lakazam Mar 12, 2026
1c10779
try to make nixl tests work
x41lakazam Mar 16, 2026
5eca79f
remove nixl from clang build
x41lakazam Mar 17, 2026
deddb46
Merge branch 'main' into dispatch_combine/nixl_backend
x41lakazam Mar 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ set(NVFUSER_CUTLASS "${NVFUSER_ROOT}/cutlass")
set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")

option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
option(NVFUSER_BUILD_WITH_NIXL "" OFF)
option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF)
option(NVFUSER_ENABLE_DEPENDENCY_REPORT "Enable Python-based dependency reporting and log capture" ON)

Expand Down Expand Up @@ -76,6 +77,7 @@ include(cmake/deps/handle_torch.cmake)
include(cmake/deps/handle_pybind11.cmake)
include(cmake/deps/handle_llvm.cmake)
include(cmake/deps/handle_nvmmh.cmake)
include(cmake/deps/handle_nixl.cmake)
include(cmake/deps/handle_git_submodules.cmake)

# Initialize success flag
Expand All @@ -95,6 +97,7 @@ handle_torch() # Must come AFTER python and cudatoolkit.
handle_pybind11()
handle_llvm()
handle_nvmmh() # Must come AFTER python to query correct site-packages
handle_nixl() # Must come AFTER python and cudatoolkit for CUDA version check

if(NVFUSER_ENABLE_DEPENDENCY_REPORT)
stop_capture(DEP_LOGS)
Expand Down Expand Up @@ -248,6 +251,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/multidevice/ipc_utils.cpp
${NVFUSER_SRCS_DIR}/multidevice/device_mesh.cpp
${NVFUSER_SRCS_DIR}/multidevice/executor.cpp
${NVFUSER_SRCS_DIR}/multidevice/nixl.cpp
${NVFUSER_SRCS_DIR}/multidevice/execution_utils.cpp
${NVFUSER_SRCS_DIR}/multidevice/propagation.cpp
${NVFUSER_SRCS_DIR}/multidevice/resharding.cpp
Expand Down Expand Up @@ -1001,6 +1005,7 @@ if(BUILD_TEST)
${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication_cuda.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_matmul.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_nixl.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_pipeline.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_sharding.cpp
${NVFUSER_ROOT}/tests/cpp/test_multidevice_stream_parallel_type.cpp
Expand Down Expand Up @@ -1299,6 +1304,12 @@ if(NVFUSER_STANDALONE_BUILD_WITH_UCC)
message(STATUS " UCX_DIR : $ENV{UCX_DIR}")
endif()
message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
message(STATUS " NVFUSER_BUILD_WITH_NIXL : ${NVFUSER_BUILD_WITH_NIXL}")
message(STATUS " NIXL_FOUND : ${NIXL_FOUND}")
if(NIXL_FOUND)
message(STATUS " NIXL_INCLUDE_DIR: ${NIXL_INCLUDE_DIR}")
message(STATUS " NIXL_LIBRARY : ${NIXL_LIBRARY}")
endif()
Comment on lines +1308 to +1312
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please report this in cmake/deps/handle_nixl.cmake

message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")
Expand Down
3 changes: 3 additions & 0 deletions cmake/DependencyRequirements.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,8 @@ set(NVFUSER_REQUIREMENT_LLVM_VERSION_MIN "18.1")
# NVMMH
set(NVFUSER_REQUIREMENT_NVMMH_OPTIONAL "TRUE")

# NIXL
set(NVFUSER_REQUIREMENT_NIXL_OPTIONAL "TRUE")

# Git Submodules (required for build)
# No version requirement - just checks if submodules are initialized
87 changes: 87 additions & 0 deletions cmake/deps/handle_nixl.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# ------------------------------------------------------------------------------
# NIXL Handler
# ------------------------------------------------------------------------------

# Locate the optional NIXL dependency and, when found, expose it through the
# `__nvfuser_nixl` INTERFACE target.
#
# Inputs:
#   NVFUSER_BUILD_WITH_NIXL - when OFF the search is skipped and NIXL_FOUND is
#                             set to FALSE.
#   ENV{NIXL_PREFIX}        - hint for the NIXL install directory.
#   Python_FOUND / Python_EXECUTABLE, CUDAToolkit_FOUND /
#   CUDAToolkit_VERSION_MAJOR - used only for the CUDA major version check.
#
# Outputs (this is a macro, so every variable is set in the caller's scope):
#   NIXL_FOUND                  - TRUE when both nixl.h and the nixl library
#                                 were located.
#   NIXL_INCLUDE_DIR, NIXL_LIBRARY, NIXL_BUILD_LIBRARY - find_* results.
#   NIXL_CUDA_VERSION           - CUDA major reported by the nixl Python
#                                 package, when it could be determined.
#   NIXL_CUDA_constraint_status - "match", "mismatch" or "not_available"
#                                 (left unset when NIXL is disabled), together
#                                 with the accompanying
#                                 NIXL_CUDA_constraint_{found,required,version}.
macro(handle_nixl)
  message("")
  message("Finding NIXL...")

  if(NOT NVFUSER_BUILD_WITH_NIXL)
    set(NIXL_FOUND FALSE)
    message(STATUS "NIXL disabled (NVFUSER_BUILD_WITH_NIXL=OFF)")
  else()
    # User may need to set NIXL_PREFIX to the NIXL install directory.
    find_path(NIXL_INCLUDE_DIR nixl.h
      HINTS $ENV{NIXL_PREFIX}/include ENV CPATH
    )
    find_library(NIXL_LIBRARY nixl
      HINTS $ENV{NIXL_PREFIX}/lib $ENV{NIXL_PREFIX}/lib64
            $ENV{NIXL_PREFIX}/lib/x86_64-linux-gnu
            $ENV{NIXL_PREFIX}/lib/aarch64-linux-gnu
    )
    # nixl_build is optional: it is linked when present but is not required
    # for NIXL_FOUND.
    find_library(NIXL_BUILD_LIBRARY nixl_build
      HINTS $ENV{NIXL_PREFIX}/lib $ENV{NIXL_PREFIX}/lib64
            $ENV{NIXL_PREFIX}/lib/x86_64-linux-gnu
            $ENV{NIXL_PREFIX}/lib/aarch64-linux-gnu
    )

    if(NIXL_INCLUDE_DIR AND NIXL_LIBRARY)
      set(NIXL_FOUND TRUE)
      message(STATUS "Found NIXL: ${NIXL_LIBRARY} (include: ${NIXL_INCLUDE_DIR})")
      if(NIXL_BUILD_LIBRARY)
        message(STATUS "Found NIXL build lib: ${NIXL_BUILD_LIBRARY}")
      endif()

      add_library(__nvfuser_nixl INTERFACE)
      target_include_directories(__nvfuser_nixl INTERFACE ${NIXL_INCLUDE_DIR})

      # Add the directory containing the nixl library to the link search path
      # and to -rpath-link for consumers of this target.
      get_filename_component(NIXL_LIB_DIR "${NIXL_LIBRARY}" DIRECTORY)
      target_link_directories(__nvfuser_nixl INTERFACE ${NIXL_LIB_DIR})
      target_link_options(__nvfuser_nixl INTERFACE "LINKER:-rpath-link,${NIXL_LIB_DIR}")

      target_link_libraries(__nvfuser_nixl INTERFACE ${NIXL_LIBRARY})
      if(NIXL_BUILD_LIBRARY)
        target_link_libraries(__nvfuser_nixl INTERFACE ${NIXL_BUILD_LIBRARY})
      endif()
    else()
      set(NIXL_FOUND FALSE)
      message(WARNING "NIXL not found – building without NIXL support. Set NIXL_PREFIX to the NIXL install directory.")
    endif()

    # CUDA major version constraint check
    if(NIXL_FOUND AND Python_FOUND AND CUDAToolkit_FOUND)
      # Take whatever follows the last "_cu" in the name of the package that
      # the nixl Python module wraps.
      execute_process(
        COMMAND "${Python_EXECUTABLE}" -c "import nixl; print(nixl._pkg.__name__.split('_cu')[-1])"
        OUTPUT_VARIABLE nixl_cuda_major
        OUTPUT_STRIP_TRAILING_WHITESPACE
        ERROR_QUIET
        RESULT_VARIABLE nixl_cuda_result
      )

      # split('_cu')[-1] yields the whole package name when there is no "_cu"
      # suffix, so only accept a purely numeric result. Anything else (empty
      # output included) is reported as "not_available" instead of producing
      # a bogus mismatch warning.
      if(nixl_cuda_result EQUAL 0 AND "${nixl_cuda_major}" MATCHES "^[0-9]+$")
        set(NIXL_CUDA_VERSION "${nixl_cuda_major}")
        set(cuda_toolkit_major "${CUDAToolkit_VERSION_MAJOR}")

        # Both operands are quoted so they are compared as literal strings and
        # never re-interpreted as variable names.
        if(NOT "${nixl_cuda_major}" STREQUAL "${cuda_toolkit_major}")
          set(NIXL_CUDA_constraint_status "mismatch")
          set(NIXL_CUDA_constraint_found "${nixl_cuda_major}")
          set(NIXL_CUDA_constraint_required "${cuda_toolkit_major}")
          message(WARNING "NIXL CUDA major version mismatch: NIXL built for CUDA ${nixl_cuda_major}, but CUDAToolkit major is ${cuda_toolkit_major}")
        else()
          set(NIXL_CUDA_constraint_status "match")
          set(NIXL_CUDA_constraint_version "${nixl_cuda_major}")
        endif()
      else()
        set(NIXL_CUDA_constraint_status "not_available")
      endif()
    else()
      set(NIXL_CUDA_constraint_status "not_available")
    endif()
  endif()

  set_dependency_report_status(NIXL)
endmacro()
22 changes: 16 additions & 6 deletions csrc/multidevice/communicator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include <netdb.h>

#include <algorithm>
#include <cstdlib>
#include <map>
#include <numeric>
Expand Down Expand Up @@ -41,6 +42,9 @@ std::ostream& operator<<(std::ostream& out, const CommunicatorBackend& cb) {
case CommunicatorBackend::kCuda:
out << "CUDA";
break;
case CommunicatorBackend::kNixl:
out << "NIXL";
break;
}
return out;
}
Expand Down Expand Up @@ -121,7 +125,8 @@ bool parseEnv(
}

// retrieves master port
if ((env = std::getenv("NVFUSER_MASTER_PORT")) != nullptr) {
env = std::getenv("NVFUSER_MASTER_PORT");
if (env != nullptr) {
master_port = std::atoi(env);
} else {
LOG(INFO) << "The environment variable NVFUSER_MASTER_PORT has not been "
Expand Down Expand Up @@ -183,7 +188,8 @@ Communicator::Communicator(
master_port_(
c10d::TCPStoreOptions::kDefaultPort + 42), // to avoid collision
ucc_available_(false),
nccl_available_(false) {
nccl_available_(false),
nixl_available_(false) {
if (isOptionDisabled(DisableOption::Multidevice)) {
TORCH_WARN(
"Multi-device support is disabled. All communication operations will "
Expand Down Expand Up @@ -236,6 +242,10 @@ Communicator::Communicator(
#ifdef USE_C10D_NCCL
nccl_available_ = true;
#endif

#ifdef USE_NIXL
nixl_available_ = true;
#endif
}

namespace {
Expand All @@ -248,10 +258,10 @@ void waitForDebuggerAtRanks(
std::cerr << "Process " << pid
<< " is waiting for the debugger. To continue debugging, "
<< "start gdb, `attach " << pid
<< "`, `set var waiting=false`, and `fini`." << std::endl;
<< "`, `set var waiting=false`, and `fini`." << '\n';
while (waiting) { // Please change `waiting` in the debugger.
}
std::cerr << "Process " << getpid() << " finished waiting." << std::endl;
std::cerr << "Process " << getpid() << " finished waiting." << '\n';
}

if (communicator->is_available()) {
Expand Down Expand Up @@ -354,7 +364,7 @@ void Communicator::cleanup() {
// in different orders between ranks have been causing a hang.
std::vector<std::pair<std::string, c10::intrusive_ptr<c10d::Backend>>>
keyed_backends(backends_.begin(), backends_.end());
std::sort(keyed_backends.begin(), keyed_backends.end());
std::ranges::sort(keyed_backends);
for (auto& [key, backend] : keyed_backends) {
// Call shutdown before destructing a ProcessGroupNCCL as instructed by
// https://github.com/pytorch/pytorch/blob/e62073d7997c9e63896cb5289ffd0874a8cc1838/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1164-L1170.
Expand Down Expand Up @@ -388,7 +398,7 @@ c10d::Backend* Communicator::getBackendForTeam(
#ifdef NVFUSER_DISTRIBUTED
backends_[team_key] = [&]() -> c10::intrusive_ptr<c10d::Backend> {
// check that the caller's rank belongs to the requested team
auto rank_it = std::find(team.begin(), team.end(), deviceId());
auto rank_it = std::ranges::find(team, deviceId());
if (rank_it == team.end()) {
return nullptr;
}
Expand Down
3 changes: 3 additions & 0 deletions csrc/multidevice/communicator.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ class NVF_API Communicator {
return ucc_available_;
} else if (backend == CommunicatorBackend::kNccl) {
return nccl_available_;
} else if (backend == CommunicatorBackend::kNixl) {
return nixl_available_;
}
return false;
}
Expand Down Expand Up @@ -149,6 +151,7 @@ class NVF_API Communicator {
int master_port_;
bool ucc_available_;
bool nccl_available_;
bool nixl_available_;
// stores the world's store used for the backend init
c10::intrusive_ptr<c10d::TCPStore> store_;
// cache for the created backends. The keys are strings generated from Teams
Expand Down
3 changes: 2 additions & 1 deletion csrc/multidevice/multidevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#pragma once

#include <cstdint>
#include <vector>

#include <c10/core/Device.h>
Expand All @@ -19,5 +20,5 @@ using DeviceType = c10::Device;
using Team = std::vector<DeviceIdxType>;

// Supported backends.
enum class CommunicatorBackend { kNccl, kUcc, kCuda };
enum class CommunicatorBackend : std::uint8_t { kNccl, kUcc, kCuda, kNixl };
} // namespace nvfuser
Loading
Loading