Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion health-checks/modal_bw_efa.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import json
import math
import os
import subprocess

from utils import read_port_counters
import modal
import modal.experimental

Expand All @@ -27,6 +29,7 @@
"curl -fsSL -o /tmp/fabtests-2.3.1.tar.bz2 https://github.com/ofiwg/libfabric/releases/download/v2.3.1/fabtests-2.3.1.tar.bz2",
"tar -xjf /tmp/fabtests-2.3.1.tar.bz2 -C /opt && rm -f /tmp/fabtests-2.3.1.tar.bz2",
)
.add_local_python_source("utils")
)
app = modal.App("rdma-bandwidth-efa", image=image)

Expand Down Expand Up @@ -59,7 +62,20 @@ def efa_bandwidth_test(server_ip_dict: modal.Dict):
# Get current node rank
cluster_info = modal.experimental.get_cluster_info()
container_rank: int = cluster_info.rank
print(f"[rank {container_rank}] Starting rdma_bandwidth_test", flush=True)
print(f"[rank {container_rank}] Initializing cluster info", flush=True)

# Read initial RDMA counters
initial_counters = read_port_counters(
"/sys/class/infiniband/*/ports/1/hw_counters/*",
{
"rdma_write_bytes": 0,
"rdma_write_recv_bytes": 0,
},
)
print(
f"[rank {container_rank}] Initial counters: {json.dumps(initial_counters)} bytes",
flush=True,
)

# Get local ib devices (sorted by device index from 0 to 7)
local_efa_domains = get_local_efa_domains()
Expand Down Expand Up @@ -175,6 +191,20 @@ def efa_bandwidth_test(server_ip_dict: modal.Dict):
client_args = failed
time.sleep(RETRY_DELAY)

# Read final RDMA counters and print the delta
final_counters = read_port_counters(
"/sys/class/infiniband/*/ports/1/hw_counters/*",
{
"rdma_write_bytes": 0,
"rdma_write_recv_bytes": 0,
},
)
delta = {k: final_counters[k] - initial_counters.get(k, 0) for k in final_counters}
print(
f"[rank {container_rank}] Counter delta: {json.dumps(delta)} bytes",
flush=True,
)


# Run fi_rma_bw command for server
def run_efa_write_server(domain: str, port: int, gpu_id: int) -> subprocess.Popen:
Expand Down
44 changes: 39 additions & 5 deletions health-checks/modal_bw_ib.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import json
import os
import re
import subprocess
import time

from utils import read_port_counters
import modal
import modal.experimental

import time
import json
import re

cuda_version = "12.4.0" # Should be no greater than host CUDA version
flavor = "devel" # Includes full CUDA toolkit
operating_sys = "ubuntu22.04"
Expand Down Expand Up @@ -34,6 +35,7 @@
"rm -f /tmp/perftest-25.10.0-0.128.tar.gz",
"cd /opt/perftest && ./autogen.sh && ./configure --prefix=/usr/local/ && make -j && make install",
)
.add_local_python_source("utils")
)
app = modal.App("rdma-bandwidth-ib", image=image)

Expand All @@ -58,10 +60,27 @@ def infiniband_bandwidth_test(server_ip_dict: modal.Dict):
waits until the Modal dict has 8 IPs, then runs ib_write_bw.
"""

# Get current cloud provider
print(f"Running on {os.environ['MODAL_CLOUD_PROVIDER'].split('_')[2]}", flush=True)

# Get current node rank
cluster_info = modal.experimental.get_cluster_info()
container_rank: int = cluster_info.rank
print(f"[rank {container_rank}] Starting rdma_bandwidth_test", flush=True)
print(f"[rank {container_rank}] Initializing cluster info", flush=True)

# Read initial port counters
initial_counters = read_port_counters(
"/sys/class/infiniband/*/ports/*/counters/*",
{
"port_xmit_data": 0,
"port_rcv_data": 0,
},
multiplier=4,
)
print(
f"[rank {container_rank}] Initial counters: {json.dumps(initial_counters)} bytes",
flush=True,
)

# Get local ib devices (sorted by device index from 0 to 7)
local_ib_devices = get_local_ib_devices()
Expand Down Expand Up @@ -168,6 +187,21 @@ def infiniband_bandwidth_test(server_ip_dict: modal.Dict):
)
process.wait()

# Read final port counters and print the delta
final_counters = read_port_counters(
"/sys/class/infiniband/*/ports/*/counters/*",
{
"port_xmit_data": 0,
"port_rcv_data": 0,
},
multiplier=4,
)
delta = {k: final_counters[k] - initial_counters[k] for k in initial_counters}
print(
f"[rank {container_rank}] Counter delta: {json.dumps(delta)} bytes",
flush=True,
)


# Run ib_write_bw command for server
def run_ib_write_server(device: str, port: int) -> subprocess.Popen:
Expand Down
14 changes: 14 additions & 0 deletions health-checks/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import glob
import os


# Reads RDMA port counters from sysfs for all devices.
def read_port_counters(
    path_pattern: str, counters_dict: dict[str, float], multiplier: float = 1
) -> dict[str, float]:
    """Sum the counter files matching *path_pattern* into per-metric totals.

    Every path produced by ``glob.glob(path_pattern)`` whose file name is a
    key of *counters_dict* is read, parsed as a number, scaled by
    *multiplier*, and added to that key's running total. Files whose name
    is not a key are ignored, and keys with no matching file keep their
    starting value.

    Args:
        path_pattern: Glob pattern selecting the counter files to read.
        counters_dict: Metric names to collect, mapped to the value each
            total starts from (callers normally pass 0). This mapping is
            not modified.
        multiplier: Factor applied to each value read before it is added
            (callers use it to scale raw counter units).

    Returns:
        A new dict with the same keys as *counters_dict* holding the
        accumulated totals.

    Raises:
        OSError: If a matching counter file cannot be opened or read.
        ValueError: If a counter file does not contain a number.
    """
    # Accumulate into a copy so a dict the caller reuses across calls (for
    # example a shared template for "initial" and "final" snapshots) is not
    # silently double-counted or aliased with the returned snapshot.
    totals = dict(counters_dict)
    for path in glob.glob(path_pattern):
        metric = os.path.basename(path)
        if metric in totals:
            with open(path, "r") as f:
                totals[metric] += float(f.read().strip()) * multiplier
    return totals