Skip to content

Commit 04f900a

Browse files
authored
Specify nodes for gpu metrics collection and split data to each rank (#320)
* Specify nodes for gpu metrics collection and split data to each rank

  Signed-off-by: Aishwarya Bhandare <[email protected]>

* Fix unit test

  Signed-off-by: Aishwarya Bhandare <[email protected]>

---------

Signed-off-by: Aishwarya Bhandare <[email protected]>
1 parent 21e4c61 commit 04f900a

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

nemo_run/core/execution/slurm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ def get_nsys_entrypoint(self) -> str:
560 560       launcher = self.get_launcher()
561 561       entrypoint, postfix = "nsys", ""
562 562       if launcher.nsys_gpu_metrics:
563     -         entrypoint = 'bash -c \'GPU_METRICS_FLAG=""; if [ "$SLURM_PROCID" -eq 0 ]; then GPU_METRICS_FLAG="--gpu-metrics-devices=all"; fi; nsys'
    563 +         entrypoint = 'bash -c \'GPU_METRICS_FLAG=""; if echo "${GPU_METRICS_NODES}" | grep -q -w "${SLURM_NODEID}"; then GPU_METRICS_FLAG="--gpu-metrics-devices=${SLURM_LOCALID}"; fi; nsys'
564 564           postfix = "'"
565 565       return (entrypoint, postfix)
566 566

test/core/execution/test_slurm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def test_get_nsys_entrypoint(self):
198 198
199 199       with patch.object(executor, "get_launcher", return_value=launcher_mock):
200 200           assert executor.get_nsys_entrypoint() == (
201     -             'bash -c \'GPU_METRICS_FLAG=""; if [ "$SLURM_PROCID" -eq 0 ]; then GPU_METRICS_FLAG="--gpu-metrics-devices=all"; fi; nsys',
    201 +             'bash -c \'GPU_METRICS_FLAG=""; if echo "${GPU_METRICS_NODES}" | grep -q -w "${SLURM_NODEID}"; then GPU_METRICS_FLAG="--gpu-metrics-devices=${SLURM_LOCALID}"; fi; nsys',
202 202               "'",
203 203           )
204 204

0 commit comments

Comments
 (0)