Skip to content

Commit f94b5b2

Browse files
authored
Libfabric: update required minimum version to v1.21.0 (#961)
1 parent 4211b39 commit f94b5b2

File tree

14 files changed

+110
-112
lines changed

14 files changed

+110
-112
lines changed

.gitlab/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ EXTRA_BUILD_ARGS=${3:-""}
2929
# UCX_VERSION is the version of UCX to build override default with env variable.
3030
UCX_VERSION=${UCX_VERSION:-v1.19.0}
3131
# LIBFABRIC_VERSION is the version of libfabric to build override default with env variable.
32-
LIBFABRIC_VERSION=${LIBFABRIC_VERSION:-v2.3.0}
32+
LIBFABRIC_VERSION=${LIBFABRIC_VERSION:-v1.21.0}
3333
# LIBFABRIC_INSTALL_DIR can be set via environment variable, defaults to INSTALL_DIR
3434
LIBFABRIC_INSTALL_DIR=${LIBFABRIC_INSTALL_DIR:-$INSTALL_DIR}
3535

benchmark/nixlbench/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,8 @@ sudo ldconfig
279279

280280
**LibFabric:**
281281
```bash
282-
wget https://github.com/ofiwg/libfabric/releases/download/v2.3.0/libfabric-2.3.0.tar.bz2
283-
tar xjf libfabric-2.3.0.tar.bz2 && cd libfabric-2.3.0
282+
wget https://github.com/ofiwg/libfabric/releases/download/v1.21.0/libfabric-1.21.0.tar.bz2
283+
tar xjf libfabric-1.21.0.tar.bz2 && cd libfabric-1.21.0
284284
./configure --prefix=/usr/local --with-cuda=/usr/local/cuda --enable-cuda-dlopen --enable-efa
285285
make -j$(nproc) && sudo make install
286286
```

benchmark/nixlbench/contrib/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ ARG DEFAULT_PYTHON_VERSION
114114
ARG WHL_PYTHON_VERSIONS="3.12"
115115
ARG WHL_PLATFORM="manylinux_2_39_$ARCH"
116116
ARG BUILD_TYPE="release"
117-
ARG LIBFABRIC_VERSION="v2.3.0"
117+
ARG LIBFABRIC_VERSION="v1.21.0"
118118
ARG NPROC
119119

120120
WORKDIR /workspace

contrib/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ ARG NIXL_PREFIX="/usr/local/nixl"
3030
ARG NIXL_PLUGIN_DIR="$NIXL_PREFIX/lib/$ARCH-linux-gnu/plugins"
3131
ARG NPROC
3232
ARG WHL_DEFAULT_PYTHON_VERSIONS="3.12"
33-
ARG LIBFABRIC_VERSION="v2.3.0"
33+
ARG LIBFABRIC_VERSION="v1.21.0"
3434
ARG LIBFABRIC_INSTALL_PATH="/usr/local"
3535

3636
# Install build dependencies from Ubuntu repository

contrib/Dockerfile.manylinux

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}
2121
ARG DEFAULT_PYTHON_VERSION="3.12"
2222
ARG ARCH="x86_64"
2323
ARG UCX_REF="v1.19.0"
24-
ARG LIBFABRIC_VERSION="v2.3.0"
24+
ARG LIBFABRIC_VERSION="v1.21.0"
2525

2626
RUN yum groupinstall -y 'Development Tools' && \
2727
dnf install -y almalinux-release-synergy && \

contrib/aws-efa/aws_test.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,14 @@ setup_cmd="set -x && \
6464
git clone ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY} && \
6565
cd nixl && \
6666
${GIT_CHECKOUT_CMD}"
67-
efa_validation_cmd="fi_info -p efa"
6867
build_cmd=".gitlab/build.sh \${NIXL_INSTALL_DIR} \${UCX_INSTALL_DIR}"
6968

7069
# Add timeout only if TEST_TIMEOUT is set (expects minutes)
7170
if [ -n "$TEST_TIMEOUT" ]; then
7271
test_cmd="timeout ${TEST_TIMEOUT}m ${test_cmd}"
7372
fi
7473

75-
export AWS_CMD="${setup_cmd} && ${build_cmd} && ${efa_validation_cmd} && ${test_cmd}"
74+
export AWS_CMD="${setup_cmd} && ${build_cmd} && ${test_cmd}"
7675

7776
# Generate AWS job properties json from template
7877
envsubst < aws_vars.template > aws_vars.json

src/plugins/libfabric/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ EFA Specific **Topology-Aware Optimization**: Hardware-aware GPU-to-EFA and NUMA
1919
### Required Dependencies
2020

2121
- **Libfabric**
22-
- Many system will have installed libfabric already. If not, custom libfabric installation is available via https://ofiwg.github.io/libfabric/ - Minimum required version: v2.3.0rc2
23-
- For EFA enabled AWS instances, it is recommanded to install through AWS EFA installer: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html - Minimum required version: 1.43.2
22+
- Many systems will already have libfabric installed. If not, a custom libfabric installation is available via https://ofiwg.github.io/libfabric/ - Minimum required version: v1.21.0
23+
- For EFA-enabled AWS instances, it is recommended to install through the AWS EFA installer: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html - Using the latest version is recommended
2424

2525
- **hwloc**
2626
- hwloc is used to understand the underlying architecture to optimize application performance. Suggested version: 2.10.0 or newer
@@ -30,7 +30,7 @@ EFA Specific **Topology-Aware Optimization**: Hardware-aware GPU-to-EFA and NUMA
3030
Validated compatibility with:
3131
- **AWS EFA** (Elastic Fabric Adapter)
3232

33-
Any other Libfabric providers that support heterogeneous memory (FI_HMEM) should also work but have not been validated in production environments. Community validation and feedback are highly appreciated!
33+
Any other Libfabric providers should also work but have not been validated in production environments. Community validation and feedback are highly appreciated!
3434

3535
## Build Instructions
3636

src/plugins/libfabric/libfabric_backend.cpp

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -173,13 +173,13 @@ nixlLibfabricBackendH::nixlLibfabricBackendH(nixl_xfer_op_t op, const std::strin
173173
// Initialize BinaryNotification
174174
binary_notif.clear();
175175

176-
NIXL_DEBUG << "constructor called, this: " << this
176+
NIXL_DEBUG << " handle constructor called, address: " << this
177177
<< " total_requests_used=" << submitted_requests_.load()
178178
<< " BinaryNotification initialized";
179179
}
180180

181181
nixlLibfabricBackendH::~nixlLibfabricBackendH() {
182-
NIXL_DEBUG << "destructor called, this: " << this;
182+
NIXL_DEBUG << "handle destructor called, address: " << this;
183183
}
184184

185185
// Multi-request completion tracking methods
@@ -425,7 +425,7 @@ nixlLibfabricEngine::getConnInfo(std::string &str) const {
425425

426426
NIXL_DEBUG << "Rail Manager serialized connection info for " << rail_manager.getNumDataRails()
427427
<< " rails, " << rail_manager.getNumControlRails() << " control rails, "
428-
<< "total size: " << str.length();
428+
<< "total size=" << str.length();
429429

430430
return NIXL_SUCCESS;
431431
}
@@ -436,7 +436,7 @@ nixlLibfabricEngine::loadRemoteConnInfo(const std::string &remote_agent,
436436
std::lock_guard<std::mutex> lock(connection_state_mutex_);
437437

438438
NIXL_DEBUG << "Loading remote info for agent: " << remote_agent
439-
<< ", info length: " << remote_conn_info.length()
439+
<< ", info length=" << remote_conn_info.length()
440440
<< ", info (hex): " << LibfabricUtils::hexdump(remote_conn_info.data());
441441

442442
if (remote_conn_info.empty()) {
@@ -475,13 +475,13 @@ nixlLibfabricEngine::connect(const std::string &remote_agent) {
475475
std::lock_guard<std::mutex> lock(connection_state_mutex_);
476476

477477
NIXL_DEBUG << "Connecting to agent: " << remote_agent
478-
<< ", connections_ size: " << connections_.size();
478+
<< ", connections_ size=" << connections_.size();
479479

480480
// Check if connection is already established
481481
auto it = connections_.find(remote_agent);
482482
if (it != connections_.end() && it->second->overall_state_ == ConnectionState::CONNECTED) {
483483
NIXL_DEBUG << "Connection already established for " << remote_agent
484-
<< ", fi_addr: " << it->second->rail_remote_addr_list_[0][0];
484+
<< ", fi_addr=" << it->second->rail_remote_addr_list_[0][0];
485485
return NIXL_SUCCESS;
486486
}
487487

@@ -519,7 +519,7 @@ nixlLibfabricEngine::disconnect(const std::string &remote_agent) {
519519
// Connection exists - check if already disconnected
520520
if (it->second->overall_state_ == ConnectionState::DISCONNECTED) {
521521
NIXL_DEBUG << "Connection already established for " << remote_agent
522-
<< ", fi_addr: " << it->second->rail_remote_addr_list_[0][0];
522+
<< ", fi_addr=" << it->second->rail_remote_addr_list_[0][0];
523523
return NIXL_SUCCESS;
524524
}
525525
// TODO: Implement disconnect logic to cleanup the AV Address Entries from both local and remote
@@ -997,7 +997,7 @@ nixlLibfabricEngine::postXfer(const nixl_xfer_op_t &operation,
997997
backend_handle->binary_notif.expected_completions =
998998
0; // Will be incremented during transfer submission
999999

1000-
NIXL_DEBUG << "Using pre-allocated BinaryNotification with XFER_ID: "
1000+
NIXL_DEBUG << "Using pre-allocated BinaryNotification with XFER_ID="
10011001
<< backend_handle->binary_notif.xfer_id;
10021002

10031003
nixlLibfabricReq::OpType op_type;
@@ -1032,8 +1032,8 @@ nixlLibfabricEngine::postXfer(const nixl_xfer_op_t &operation,
10321032
int gpu_id = local[desc_idx].devId;
10331033

10341034
NIXL_DEBUG << "Processing descriptor " << desc_idx << " GPU " << gpu_id
1035-
<< " local_addr: " << transfer_addr << " size: " << transfer_size
1036-
<< " remote_addr: " << (void *)remote[desc_idx].addr;
1035+
<< " local_addr: " << transfer_addr << " size=" << transfer_size
1036+
<< " remote_addr=" << (void *)remote[desc_idx].addr;
10371037

10381038
NIXL_DEBUG << "DEBUG: remote_agent='" << remote_agent << "' localAgent='" << localAgent
10391039
<< "'";
@@ -1091,7 +1091,7 @@ nixlLibfabricEngine::postXfer(const nixl_xfer_op_t &operation,
10911091
NIXL_ERROR << "Failed to send notification";
10921092
return notif_status;
10931093
}
1094-
NIXL_DEBUG << "Notification sent immediately with xfer_id: "
1094+
NIXL_DEBUG << "Notification sent immediately with XFER_ID="
10951095
<< backend_handle->binary_notif.xfer_id << ", expected_completions: "
10961096
<< backend_handle->binary_notif.expected_completions;
10971097
}
@@ -1259,25 +1259,24 @@ nixlLibfabricEngine::getNotifs(notif_list_t &notif_list) {
12591259
// Background progress function that continuously processes completions on all rails
12601260
nixl_status_t
12611261
nixlLibfabricEngine::cmThread() {
1262-
NIXL_DEBUG << "ConnectionManagement thread started successfully";
1263-
NIXL_DEBUG << "Initial receives already posted in main thread, entering progress loop";
1262+
NIXL_DEBUG << "CM: Thread started successfully";
12641263

12651264
// Main progress loop - continuously process completions on all rails
12661265
while (!cm_thread_stop_.load()) {
12671266

12681267
nixl_status_t status = rail_manager.progressAllControlRails();
12691268
if (status == NIXL_SUCCESS) {
1270-
NIXL_DEBUG << "Processed completions on control rails";
1269+
NIXL_DEBUG << "CM: Processed completions on control rails";
12711270
} else if (status != NIXL_IN_PROG && status != NIXL_SUCCESS) {
1272-
NIXL_ERROR << "Failed to process completions on control rails";
1271+
NIXL_ERROR << "CM: Failed to process completions on control rails";
12731272
return NIXL_ERR_BACKEND;
12741273
}
12751274
// Sleep briefly to avoid spinning too aggressively when blocking cq read is not used
12761275
if (!rail_manager.getControlRail(0).blocking_cq_sread_supported) {
12771276
std::this_thread::sleep_for(std::chrono::nanoseconds(10));
12781277
}
12791278
}
1280-
NIXL_DEBUG << "ConnectionManagement thread exiting cleanly";
1279+
NIXL_DEBUG << "CM: Thread exiting cleanly";
12811280
return NIXL_SUCCESS;
12821281
}
12831282

@@ -1288,24 +1287,24 @@ nixlLibfabricEngine::cmThread() {
12881287
// Progress thread that continuously processes completions only on data rails
12891288
nixl_status_t
12901289
nixlLibfabricEngine::progressThread() {
1291-
NIXL_DEBUG << "Progress thread started successfully for data rails only";
1290+
NIXL_DEBUG << "PT: Thread started successfully for data rails only";
12921291
// Main progress loop - continuously process completions only on data rails
12931292
while (!progress_thread_stop_.load()) {
12941293
// Process completions only on data rails (non-blocking)
12951294
bool any_completions = false;
12961295
nixl_status_t status = rail_manager.progressActiveDataRails();
12971296
if (status == NIXL_SUCCESS) {
12981297
any_completions = true;
1299-
NIXL_DEBUG << "Processed completions on data rails";
1298+
NIXL_DEBUG << "PT: Processed completions on data rails";
13001299
} else if (status != NIXL_IN_PROG && status != NIXL_SUCCESS) {
1301-
NIXL_ERROR << "Failed to process completions on data rails";
1300+
NIXL_ERROR << "PT: Failed to process completions on data rails";
13021301
// Don't return error, continue for robustness
13031302
}
13041303
if (!any_completions) {
13051304
std::this_thread::sleep_for(progress_thread_delay_);
13061305
}
13071306
}
1308-
NIXL_DEBUG << "Progress thread exiting cleanly";
1307+
NIXL_DEBUG << "PT: Thread exiting cleanly";
13091308
return NIXL_SUCCESS;
13101309
}
13111310

@@ -1356,11 +1355,11 @@ void
13561355
nixlLibfabricEngine::processNotification(const std::string &serialized_notif) {
13571356
// Only handle binary notification format
13581357
// Check if this is a binary notification (fixed size)
1359-
NIXL_DEBUG << "Received notification size: " << serialized_notif.size()
1358+
NIXL_DEBUG << "Received notification size=" << serialized_notif.size()
13601359
<< ", sizeof(Notification): " << sizeof(BinaryNotification);
13611360

13621361
if (serialized_notif.size() != sizeof(BinaryNotification)) {
1363-
NIXL_ERROR << "Invalid notification size: " << serialized_notif.size()
1362+
NIXL_ERROR << "Invalid notification size=" << serialized_notif.size()
13641363
<< ", expected: " << sizeof(BinaryNotification);
13651364
return;
13661365
}
@@ -1375,7 +1374,7 @@ nixlLibfabricEngine::processNotification(const std::string &serialized_notif) {
13751374
uint32_t expected_completions = binary_notif->expected_completions;
13761375

13771376
NIXL_TRACE << "Received notification from " << remote_name << " msg: " << msg
1378-
<< " xfer_id: " << xfer_id << " expected_completions: " << expected_completions;
1377+
<< " XFER_ID=" << xfer_id << " expected_completions: " << expected_completions;
13791378

13801379
// Check if this is a transfer notification that needs completions matching
13811380
if (expected_completions > 0) {
@@ -1426,7 +1425,7 @@ nixlLibfabricEngine::processConnectionAck(uint16_t agent_idx,
14261425
ConnectionState state) {
14271426
std::string remote_agent_name = agent_names_[agent_idx];
14281427
NIXL_DEBUG << "Connection state callback for agent " << remote_agent_name
1429-
<< " agent_idx: " << agent_idx;
1428+
<< " agent_idx=" << agent_idx;
14301429
std::lock_guard<std::mutex> lock(connections_[remote_agent_name]->conn_state_mutex_);
14311430
connections_[remote_agent_name]->overall_state_ = ConnectionState::CONNECTED;
14321431
connections_[remote_agent_name]->cv_.notify_all();
@@ -1478,7 +1477,7 @@ nixlLibfabricEngine::processConnectionRequest(uint16_t agent_idx,
14781477

14791478
NIXL_DEBUG << "Successfully inserted addresses for " << data_fi_addrs.size()
14801479
<< " data rails and " << control_fi_addrs.size() << " control rails"
1481-
<< ", initiator_control_fi_addr: " << initiator_control_fi_addr;
1480+
<< ", initiator_control_fi_addr=" << initiator_control_fi_addr;
14821481

14831482
// Send acknowledgement back to the initiator using the rail manager
14841483
size_t ep_name_len = sizeof(rail->ep_name);

src/plugins/libfabric/libfabric_backend.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,9 @@ class nixlLibfabricConnection : public nixlBackendConnMD {
114114
size_t agent_index_; // Unique agent identifier in agent_names vector
115115
std::string remoteAgent_; // Remote agent name
116116
std::unordered_map<size_t, std::vector<fi_addr_t>>
117-
rail_remote_addr_list_; // Data rail libfabric addresses. Key: data rail id.
117+
rail_remote_addr_list_; // Data rail libfabric addresses. key=data rail id.
118118
std::unordered_map<size_t, std::vector<fi_addr_t>>
119-
control_rail_remote_addr_list_; // Control rail libfabric addresses. Key: control rail id.
119+
control_rail_remote_addr_list_; // Control rail libfabric addresses. key=control rail id.
120120
std::vector<char *> src_ep_names_; // Data rail endpoint names
121121
std::vector<char *> control_ep_names_; // Control rail endpoint names
122122
ConnectionState overall_state_; // Current connection state

src/utils/libfabric/libfabric_common.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,9 @@ getAvailableNetworkDevices() {
6565
std::string device_name = cur->domain_attr->name;
6666
std::string provider_name = cur->fabric_attr->prov_name;
6767

68-
NIXL_TRACE << "Found device - domain: " << device_name
69-
<< ", provider: " << provider_name << ", ep_type: " << cur->ep_attr->type
70-
<< ", caps: 0x" << std::hex << cur->caps << std::dec;
68+
NIXL_TRACE << "Found device - domain: " << device_name << ", provider=" << provider_name
69+
<< ", ep_type=" << cur->ep_attr->type << ", caps=" << std::hex << cur->caps
70+
<< std::dec;
7171

7272
if (provider_device_map.find(provider_name) == provider_device_map.end()) {
7373
provider_device_map[provider_name] = {};
@@ -81,7 +81,7 @@ getAvailableNetworkDevices() {
8181

8282
for (auto device_list : provider_device_map) {
8383
for (auto device : device_list.second) {
84-
NIXL_TRACE << "Provider: " << device_list.first << ", Device: " << device;
84+
NIXL_TRACE << "provider=" << device_list.first << ", device=" << device;
8585
}
8686
}
8787

0 commit comments

Comments
 (0)