Skip to content

Commit f3e69ee

Browse files
ashbhandare and Aishwarya Bhandare authored
Enable Nsys gpu device metrics (#257)
* GPU metrics only on rank 0
  Signed-off-by: ashbhandare <[email protected]>
* Add unit tests
  Signed-off-by: ashbhandare <[email protected]>
---------
Signed-off-by: ashbhandare <[email protected]>
Co-authored-by: Aishwarya Bhandare <[email protected]>
1 parent 5cfcd7c commit f3e69ee

File tree

6 files changed

+51
-3
lines changed

6 files changed

+51
-3
lines changed

nemo_run/core/execution/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,9 @@ def get_launcher_prefix(self) -> Optional[list[str]]:
165165
os.makedirs(os.path.join(self.job_dir, launcher.nsys_folder), exist_ok=True)
166166
return launcher.get_nsys_prefix(profile_dir=self.job_dir)
167167

168+
def get_nsys_entrypoint(self) -> tuple[str, str]:
    """Return the entrypoint and closing postfix used to launch ``nsys``.

    Returns:
        An ``(entrypoint, postfix)`` pair. This implementation always returns
        ``("nsys", "")``: the plain ``nsys`` command and an empty postfix.
    """
    # The annotation is a 2-tuple (not ``str``) because callers unpack the
    # result into an entrypoint and a postfix.
    return ("nsys", "")
170+
168171
def supports_launcher_transform(self) -> bool:
169172
return False
170173

nemo_run/core/execution/launcher.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class Launcher(ConfigurableMixin):
2424
"--cuda-event-trace=false",
2525
]
2626
)
27+
nsys_gpu_metrics: bool = False
2728

2829
def get_nsys_prefix(self, profile_dir: str) -> Optional[list[str]]:
2930
"""Make a command prefix for nsys profiling"""

nemo_run/core/execution/slurm.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,18 @@ def assign(
547547
def get_launcher_prefix(self) -> Optional[list[str]]:
    """Build the ``nsys`` argument prefix for the configured launcher.

    Returns:
        ``None`` when the launcher does not enable ``nsys_profile``.
        Otherwise the list produced by ``launcher.get_nsys_prefix`` for the
        profile directory ``/{RUNDIR_NAME}``, with the literal
        ``"$GPU_METRICS_FLAG"`` appended when ``nsys_gpu_metrics`` is enabled.
    """
    launcher = self.get_launcher()
    if not launcher.nsys_profile:
        return None

    nsys_prefix = launcher.get_nsys_prefix(profile_dir=f"/{RUNDIR_NAME}")
    # ``get_nsys_prefix`` is typed Optional, so only extend a real list.
    if nsys_prefix is not None and launcher.nsys_gpu_metrics:
        # Build a new list rather than using ``+=`` so the list handed back by
        # the launcher is never mutated in place; otherwise repeated calls
        # could accumulate the flag if the launcher reuses that list.
        nsys_prefix = [*nsys_prefix, "$GPU_METRICS_FLAG"]
    return nsys_prefix
554+
555+
def get_nsys_entrypoint(self) -> tuple[str, str]:
    """Return the entrypoint and closing postfix used to launch ``nsys``.

    When the launcher enables ``nsys_gpu_metrics``, ``nsys`` is wrapped in a
    ``bash -c '...'`` command. That command sets ``GPU_METRICS_FLAG`` to
    ``--gpu-metrics-devices=all`` only when ``SLURM_PROCID`` equals 0 and
    leaves it empty otherwise.

    Returns:
        An ``(entrypoint, postfix)`` pair. Without GPU metrics this is
        ``("nsys", "")``. With GPU metrics the entrypoint is the opening part
        of the ``bash -c`` command and the postfix is the single quote that
        closes it.
    """
    launcher = self.get_launcher()
    if not launcher.nsys_gpu_metrics:
        return ("nsys", "")

    # The opening single quote is left unbalanced on purpose; the returned
    # postfix supplies the closing quote.
    entrypoint = 'bash -c \'GPU_METRICS_FLAG=""; if [ "$SLURM_PROCID" -eq 0 ]; then GPU_METRICS_FLAG="--gpu-metrics-devices=all"; fi; nsys'
    return (entrypoint, "'")
551562

552563
def supports_launcher_transform(self) -> bool:
553564
return True if isinstance(self.get_launcher(), SlurmTemplate) else False

nemo_run/run/torchx_backend/packaging.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,8 @@ def _get_details_from_script(fn_or_script: Script, serialize_configs: bool):
225225
nsys_prefix = executor.get_launcher_prefix()
226226
if nsys_prefix:
227227
role.args = [role.entrypoint] + role.args
228-
role.entrypoint = "nsys"
229-
role.args = nsys_prefix + role.args
228+
role.entrypoint, nsys_postfix = executor.get_nsys_entrypoint()
229+
role.args = nsys_prefix + role.args + [nsys_postfix]
230230

231231
if metadata:
232232
if USE_WITH_RAY_CLUSTER_KEY in metadata:

test/core/execution/test_base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,11 @@ def test_get_launcher_str(self):
126126
executor = Executor(launcher="torchrun")
127127
assert isinstance(executor.get_launcher(), Torchrun)
128128

129+
def test_get_nsys_entrypoint(self):
    """The base executor reports plain ``nsys`` with an empty postfix."""
    executor = Executor(launcher=Launcher())
    entrypoint_and_postfix = executor.get_nsys_entrypoint()
    assert entrypoint_and_postfix == ("nsys", "")
133+
129134
def test_cleanup(self):
130135
executor = Executor()
131136
assert executor.cleanup("handle") is None

test/core/execution/test_slurm.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,10 +170,38 @@ def test_get_launcher_prefix(self):
170170
launcher_mock = MagicMock()
171171
launcher_mock.nsys_profile = True
172172
launcher_mock.get_nsys_prefix.return_value = ["nsys", "profile"]
173+
launcher_mock.nsys_gpu_metrics = False
173174

174175
with patch.object(executor, "get_launcher", return_value=launcher_mock):
175176
assert executor.get_launcher_prefix() == ["nsys", "profile"]
176177

178+
def test_get_launcher_prefix_with_gpu_metrics(self):
    """get_launcher_prefix appends ``$GPU_METRICS_FLAG`` when GPU metrics are on."""
    executor = SlurmExecutor(account="test")

    # Launcher stub with both nsys profiling and GPU metrics enabled.
    launcher_mock = MagicMock(nsys_profile=True, nsys_gpu_metrics=True)
    launcher_mock.get_nsys_prefix.return_value = ["nsys", "profile"]

    expected_prefix = ["nsys", "profile", "$GPU_METRICS_FLAG"]
    with patch.object(executor, "get_launcher", return_value=launcher_mock):
        actual_prefix = executor.get_launcher_prefix()
        assert actual_prefix == expected_prefix
190+
191+
def test_get_nsys_entrypoint(self):
    """get_nsys_entrypoint wraps ``nsys`` in ``bash -c`` when GPU metrics are on."""
    executor = SlurmExecutor(account="test")

    # Launcher stub with GPU metrics enabled.
    launcher_mock = MagicMock(nsys_gpu_metrics=True)

    expected_entrypoint = 'bash -c \'GPU_METRICS_FLAG=""; if [ "$SLURM_PROCID" -eq 0 ]; then GPU_METRICS_FLAG="--gpu-metrics-devices=all"; fi; nsys'
    expected_postfix = "'"
    with patch.object(executor, "get_launcher", return_value=launcher_mock):
        result = executor.get_nsys_entrypoint()
        assert result == (expected_entrypoint, expected_postfix)
204+
177205
def test_supports_launcher_transform(self):
178206
"""Test the supports_launcher_transform method."""
179207
executor = SlurmExecutor(account="test")

0 commit comments

Comments
 (0)