File tree Expand file tree Collapse file tree 1 file changed +7
-1
lines changed
src/helm-charts/a3ultra/nemo-training/templates Expand file tree Collapse file tree 1 file changed +7
-1
lines changed Original file line number Diff line number Diff line change @@ -280,10 +280,16 @@ spec:
280280 echo "Job logs will go to ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/$JOB_IDENTIFIER/."
281281
282282 mkdir -p "${GCS_MOUNT_PATH_0}/index_mapping_dir"
283+ mkdir -p "${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/$JOB_IDENTIFIER" # <- Directory used below as the nsys -o output location; quoted so the path is not word-split or globbed
283284
284285 sleep 10 # <- Allow some time for service to boot
285286
286- OMP_NUM_THREADS=12 torchrun \
287+ OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \
288+ /usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \
289+ -o ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/$JOB_IDENTIFIER/noderank-$NODE_RANK \
290+ --session-new "nemo-rank$NODE_RANK"-$RANDOM \
291+ --wait all \
292+ torchrun \
287293 --nproc-per-node="$GPUS_PER_NODE" \
288294 --nnodes="$NNODES" \
289295 --node_rank="$NODE_RANK" \
You can’t perform that action at this time.
0 commit comments