Skip to content

Commit ffebc26

Browse files
author
Copybara
committed
Copybara import of gpu-recipes:
- 93b9ae83d22603013f1d542fe282b611d82628a1 Enabling profiling
GitOrigin-RevId: 93b9ae83d22603013f1d542fe282b611d82628a1
1 parent b990765 commit ffebc26

File tree

1 file changed

+7
-1
lines changed

1 file changed

+7
-1
lines changed

src/helm-charts/a3ultra/nemo-training/templates/nemo-launcher-job.yaml

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -280,10 +280,16 @@ spec:
280280
echo "Job logs will go to ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/$JOB_IDENTIFIER/."
281281
282282
mkdir -p "${GCS_MOUNT_PATH_0}/index_mapping_dir"
283+
mkdir -p ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/$JOB_IDENTIFIER
283284
284285
sleep 10 # <- Allow some time for service to boot
285286
286-
OMP_NUM_THREADS=12 torchrun \
287+
OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \
288+
/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \
289+
-o ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/$JOB_IDENTIFIER/noderank-$NODE_RANK \
290+
--session-new "nemo-rank$NODE_RANK"-$RANDOM \
291+
--wait all \
292+
torchrun \
287293
--nproc-per-node="$GPUS_PER_NODE" \
288294
--nnodes="$NNODES" \
289295
--node_rank="$NODE_RANK" \

0 commit comments

Comments
 (0)