# Default container image for all jobs (GitLab registry, NGC PyTorch with pytest-cov).
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov

stages:
  - test
  - cleanup

# Pipeline-wide variables; anchored so individual jobs can merge them in via `<<: [*VARS]`.
variables: &VARS
  SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
  DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
  PYTORCH_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov
  PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
  TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: L0  # Can specify levels
  TESTS_TO_RUN_AFTER_MERGING: L0  # Can specify levels
  TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
  TEST_REGEX_ON_THIS_COMMIT: NONE  # https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/
  DISPLAY_OUTPUT: "True"  # Set to true for new tests to copy the logs for creating golden truth file

# Unit tests: run pytest under torchrun (8 local GPUs) with coverage over megatron/core.
unit_tests:
  tags:
    - docker_local_runner
  stage: test
  script:
    - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
  # Extract the "TOTAL ... NN%" line from the pytest-cov terminal report.
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    paths:
      - coverage
    expire_in: 30 days
  only:
    - merge_requests

# Hidden template: submits a resume-from-checkpoint functional test to Selene via sbatch,
# waits for the Slurm job, then validates results with pytest. Jobs merge this in via the
# &selene-test-resume-checkpoint-launcher anchor and set RUN_MODEL/TP_SIZE/PP_SIZE/NUM_NODES.
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-resume-launcher-script
    - echo "Running selene resume from checkpoint test."
    - pwd
    - export BUILD_DIR=`pwd`
    - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes
    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
    - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    # Fresh per-pipeline working directories on Lustre (checkpoints/logs/results).
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES`
    # Parse the Slurm job id from "Submitted batch job <id>".
    - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
      " ----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
      " ---------------------------------------------------\n"
      " $(scontrol show job=${SLURM_JOBID})\n"
      " ---------------------------------------------------\n"
    # Gitlab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - export SLURM_STATE=$(sacct -j "${SLURM_JOBID}" --format State --parsable2 --noheader |& head -n 1)
    - echo "Slurm job state $SLURM_STATE"
    - if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
    - source $PYTHON_VIRTUAL_ENV
    - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py || echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
      when: always
  allow_failure: false

# Hidden template: submits a training functional test to Selene via sbatch, waits for the
# Slurm job, dumps its logs, and (for non-TE runs) checks metrics against a golden-truth
# JSON with pytest. Jobs merge this in via the &selene-test-launcher anchor and set
# RUN_MODEL/USE_TE/TP_SIZE/PP_SIZE/NUM_NODES/MAX_STEPS (optionally VP_SIZE/MBS/GBS).
.selene_test_launcher: &selene-test-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-launcher-script
    - echo "Running selene test"
    - echo "$CI_MERGE_REQUEST_APPROVED"
    - pwd
    - export BUILD_DIR=`pwd`
    - RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
    - if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi
    - export $RUN_NAME
    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
    - export USE_TE TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE
    - export MBS GBS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    # Fresh per-pipeline working directories on Lustre (checkpoints/logs/results).
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS`
    # Parse the Slurm job id from "Submitted batch job <id>".
    - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
      " ----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
      " ---------------------------------------------------\n"
      " $(scontrol show job=${SLURM_JOBID})\n"
      " ---------------------------------------------------\n"
    # Gitlab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - echo "Slurm log dump start ------------------------------------------------------------"
    - cat $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
    - echo "Slurm log dump end --------------------------------------------------------------"
    - python3 $BUILD_DIR/tests/functional_tests/python_test_utils/check_slurm_job_completion.py $SLURM_JOBID
    - if [ $? -ne 0 ]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
    - source $PYTHON_VIRTUAL_ENV
    - |
      if [[ "$DISPLAY_OUTPUT" == "True" ]]; then
        python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
      fi
    - |
      if [[ $USE_TE -ne 1 ]]; then
        echo "Checking against ground truth file"
        export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
        pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py || echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
      fi
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
      when: always
  allow_failure: false

# GPT-3 345M with Transformer Engine, TP=2/PP=2, 1 node, 50 steps.
train.te_gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    USE_TE: 1
    TP_SIZE: 2
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "50:00"
    TEST_LEVEL: L0

# GPT-3 345M, TP=4/PP=1, 1 node, 50 steps.
train.gpt3.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    USE_TE: 0
    TP_SIZE: 4
    PP_SIZE: 1
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

# GPT-3 345M, TP=2/PP=2, 1 node, 50 steps.
train.gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    USE_TE: 0
    TP_SIZE: 2
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

# GPT-3 345M, TP=1/PP=2, 1 node, 50 steps.
train.gpt3.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    USE_TE: 0
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

# GPT-3 345M, TP=1/PP=4 with virtual pipeline (VP=1), 1 node, 50 steps.
train.gpt3.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    USE_TE: 0
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 1
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

# GPT-3 345M resume-from-checkpoint test, TP=1/PP=2, 1 node.
resume.checkpoint.gpt3.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    TIME_LIMIT: "30:00"
    TEST_LEVEL: L0

# BERT 345M, TP=4/PP=1, 1 node, 50 steps.
train.bert.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 4
    PP_SIZE: 1
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

# BERT 345M, TP=2/PP=2, 1 node, 50 steps.
train.bert.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 2
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

# BERT 345M, TP=1/PP=2, 1 node, 50 steps.
train.bert.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

# BERT 345M, TP=1/PP=4 with virtual pipeline (VP=2), 1 node, 50 steps.
train.bert.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

# BERT 345M resume-from-checkpoint test, TP=1/PP=2, 1 node.
resume.checkpoint.bert.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    TIME_LIMIT: "30:00"
    TEST_LEVEL: L0

# Cleanup stage: prune CI working directories on Selene older than 20 days
# (excluding anything whose path contains "data"). Best-effort: `set +e` and
# allow_failure keep the pipeline green even if the cleanup hits errors.
cleanup.selene:
  tags:
    - ssh_selene_runner
  stage: cleanup
  variables:
    <<: [*VARS]
  script:
    - set +e
    - NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | wc -l`
    - find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | xargs rm -rf
    - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene"
  allow_failure: true
  rules:
    - when: always