Skip to content

Commit 1397ac0

Browse files
authored
Merge branch 'multi-query-attention' into loubnabnl-patch-1
2 parents 22de429 + 3e22c9f commit 1397ac0

File tree

272 files changed

+15778
-6624
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

272 files changed

+15778
-6624
lines changed

.coveragerc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[html]
2+
directory = coverage
3+
4+
[run]
5+
data_file = .coverage_$LOCAL_RANK

.gitignore

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,8 @@
11
__pycache__
2-
2+
*.so
3+
build
4+
.coverage_*
5+
*.egg-info
6+
*~
7+
slurm*
8+
logs

.gitlab-ci.yml

Lines changed: 296 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,299 @@
1-
image: gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel
1+
image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov
22

3-
test:
3+
stages:
4+
- test
5+
- cleanup
6+
7+
variables: &VARS
8+
SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
9+
DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
10+
PYTORCH_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov
11+
PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
12+
TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: L0 # Can specify levels
13+
TESTS_TO_RUN_AFTER_MERGING: L0 # Can specify levels
14+
TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
15+
TEST_REGEX_ON_THIS_COMMIT: NONE # Regex matched against the job name in the rules below; syntax per https://github.com/google/re2/wiki/Syntax, e.g. /.*gpt3.*/
16+
DISPLAY_OUTPUT: "True" # Must be exactly "True" (the launcher script compares against this string) to dump test results from the tensorboard logs, e.g. when creating the golden truth file for new tests
17+
18+
unit_tests:
19+
tags:
20+
- docker_local_runner
21+
stage: test
422
script:
5-
- pytest --junitxml=report.xml tests
23+
- torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
24+
coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
625
artifacts:
7-
when: always
8-
reports:
9-
junit: report.xml
10-
26+
paths:
27+
- coverage
28+
expire_in: 30 days
29+
only:
30+
- merge_requests
31+
32+
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
33+
tags:
34+
- ssh_selene_runner
35+
stage: test
36+
script: &selene-test-resume-launcher-script
37+
- echo "Running selene resume from checkpoint test. "
38+
- pwd
39+
- export BUILD_DIR=`pwd`
40+
- export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes
41+
- echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
42+
- export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
43+
- export DATA_DIR=$DATA_DIR
44+
- echo "Run name is $RUN_NAME"
45+
- mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
46+
- mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
47+
- mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
48+
- rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
49+
- rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
50+
- rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
51+
- export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
52+
- export LOGS_DIR=$BASE_DIR/logs
53+
- export RESULTS_DIR=$BASE_DIR/results
54+
- export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
55+
- echo "Submitting job"
56+
- sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES`
57+
- export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
58+
- bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
59+
- \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
60+
"----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
61+
"---------------------------------------------------\n"
62+
"$(scontrol show job=${SLURM_JOBID})\n"
63+
"---------------------------------------------------\n"
64+
# Gitlab logs collapsible section markers
65+
- echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
66+
# Follow output of the job
67+
- echo "Finished job"
68+
- export SLURM_STATE=$(sacct -j "${SLURM_JOBID}" --format State --parsable2 --noheader |& head -n 1)
69+
- echo "Slurm job state $SLURM_STATE"
70+
- if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
71+
- source $PYTHON_VIRTUAL_ENV
72+
# Fail the job when pytest fails; a bare `|| echo` would mask the failure and let the job pass.
- pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py || { echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."; exit 1; }
73+
- echo "Completed the job"
74+
rules:
75+
- if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
76+
when: always
77+
- if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
78+
when: always
79+
- if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
80+
when: always
81+
allow_failure: false
82+
83+
.selene_test_launcher: &selene-test-launcher
84+
tags:
85+
- ssh_selene_runner
86+
stage: test
87+
script: &selene-test-launcher-script
88+
- echo "Running selene test"
89+
- echo "$CI_MERGE_REQUEST_APPROVED"
90+
- pwd
91+
- export BUILD_DIR=`pwd`
92+
- RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
93+
- if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi
94+
# Export the variable by name; `export $RUN_NAME` would instead export a variable named after RUN_NAME's value.
- export RUN_NAME
95+
- echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
96+
- export USE_TE TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE
97+
- export MBS GBS
98+
- export DATA_DIR=$DATA_DIR
99+
- echo "Run name is $RUN_NAME"
100+
- mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
101+
- mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
102+
- mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
103+
- rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
104+
- rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
105+
- rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
106+
- export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
107+
- export LOGS_DIR=$BASE_DIR/logs
108+
- export RESULTS_DIR=$BASE_DIR/results
109+
- export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
110+
- echo "Submitting job"
111+
- sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS`
112+
- export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
113+
- bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
114+
- \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
115+
"----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
116+
"---------------------------------------------------\n"
117+
"$(scontrol show job=${SLURM_JOBID})\n"
118+
"---------------------------------------------------\n"
119+
# Gitlab logs collapsible section markers
120+
- echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
121+
# Follow output of the job
122+
- echo "Finished job"
123+
- echo "Slurm log dump start ------------------------------------------------------------"
124+
- cat $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
125+
- echo "Slurm log dump end --------------------------------------------------------------"
126+
- python3 $BUILD_DIR/tests/functional_tests/python_test_utils/check_slurm_job_completion.py $SLURM_JOBID
127+
- if [ $? -ne 0 ]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
128+
- source $PYTHON_VIRTUAL_ENV
129+
- |
130+
if [[ "$DISPLAY_OUTPUT" == "True" ]]; then
131+
python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
132+
fi
133+
- |
134+
if [[ $USE_TE -ne 1 ]]; then
135+
echo "Checking against ground truth file"
136+
export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
137+
# Propagate a pytest failure so the job fails instead of only logging a message.
pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py || { echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."; exit 1; }
138+
fi
139+
- echo "Completed the job"
140+
rules:
141+
- if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
142+
when: always
143+
- if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
144+
when: always
145+
- if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
146+
when: always
147+
allow_failure: false
148+
149+
train.te_gpt3.345m_tp2_pp2_1node_50steps:
150+
<<: *selene-test-launcher
151+
variables:
152+
<<: [*VARS]
153+
RUN_MODEL: gpt3
154+
USE_TE: 1
155+
TP_SIZE: 2
156+
PP_SIZE: 2
157+
NUM_NODES: 1
158+
MAX_STEPS: 50
159+
TIME_LIMIT: "50:00"
160+
TEST_LEVEL: L0
161+
162+
train.gpt3.345m_tp4_pp1_1node_50steps:
163+
<<: *selene-test-launcher
164+
variables:
165+
<<: [*VARS]
166+
RUN_MODEL: gpt3
167+
USE_TE: 0
168+
TP_SIZE: 4
169+
PP_SIZE: 1
170+
NUM_NODES: 1
171+
MAX_STEPS: 50
172+
TIME_LIMIT: "20:00"
173+
TEST_LEVEL: L0
174+
175+
train.gpt3.345m_tp2_pp2_1node_50steps:
176+
<<: *selene-test-launcher
177+
variables:
178+
<<: [*VARS]
179+
RUN_MODEL: gpt3
180+
USE_TE: 0
181+
TP_SIZE: 2
182+
PP_SIZE: 2
183+
NUM_NODES: 1
184+
MAX_STEPS: 50
185+
TIME_LIMIT: "20:00"
186+
TEST_LEVEL: L0
187+
188+
train.gpt3.345m_tp1_pp2_1node_50steps:
189+
<<: *selene-test-launcher
190+
variables:
191+
<<: [*VARS]
192+
RUN_MODEL: gpt3
193+
USE_TE: 0
194+
TP_SIZE: 1
195+
PP_SIZE: 2
196+
NUM_NODES: 1
197+
MAX_STEPS: 50
198+
TIME_LIMIT: "20:00"
199+
TEST_LEVEL: L0
200+
201+
train.gpt3.345m_tp1_pp4_1node_50steps:
202+
<<: *selene-test-launcher
203+
variables:
204+
<<: [*VARS]
205+
RUN_MODEL: gpt3
206+
USE_TE: 0
207+
TP_SIZE: 1
208+
PP_SIZE: 4
209+
VP_SIZE: 1
210+
NUM_NODES: 1
211+
MAX_STEPS: 50
212+
TIME_LIMIT: "20:00"
213+
TEST_LEVEL: L0
214+
215+
resume.checkpoint.gpt3.345m_tp1_pp2_1node:
216+
<<: *selene-test-resume-checkpoint-launcher
217+
variables:
218+
<<: [*VARS]
219+
RUN_MODEL: gpt3
220+
TP_SIZE: 1
221+
PP_SIZE: 2
222+
NUM_NODES: 1
223+
TIME_LIMIT: "30:00"
224+
TEST_LEVEL: L0
225+
226+
train.bert.345m_tp4_pp1_1node_50steps:
227+
<<: *selene-test-launcher
228+
variables:
229+
<<: [*VARS]
230+
RUN_MODEL: bert
231+
TP_SIZE: 4
232+
PP_SIZE: 1
233+
NUM_NODES: 1
234+
MAX_STEPS: 50
235+
TIME_LIMIT: "20:00"
236+
TEST_LEVEL: L0
237+
238+
train.bert.345m_tp2_pp2_1node_50steps:
239+
<<: *selene-test-launcher
240+
variables:
241+
<<: [*VARS]
242+
RUN_MODEL: bert
243+
TP_SIZE: 2
244+
PP_SIZE: 2
245+
NUM_NODES: 1
246+
MAX_STEPS: 50
247+
TIME_LIMIT: "20:00"
248+
TEST_LEVEL: L0
249+
250+
train.bert.345m_tp1_pp2_1node_50steps:
251+
<<: *selene-test-launcher
252+
variables:
253+
<<: [*VARS]
254+
RUN_MODEL: bert
255+
TP_SIZE: 1
256+
PP_SIZE: 2
257+
NUM_NODES: 1
258+
MAX_STEPS: 50
259+
TIME_LIMIT: "20:00"
260+
TEST_LEVEL: L0
261+
262+
train.bert.345m_tp1_pp4_1node_50steps:
263+
<<: *selene-test-launcher
264+
variables:
265+
<<: [*VARS]
266+
RUN_MODEL: bert
267+
TP_SIZE: 1
268+
PP_SIZE: 4
269+
VP_SIZE: 2
270+
NUM_NODES: 1
271+
MAX_STEPS: 50
272+
TIME_LIMIT: "20:00"
273+
TEST_LEVEL: L0
274+
275+
resume.checkpoint.bert.345m_tp1_pp2_1node:
276+
<<: *selene-test-resume-checkpoint-launcher
277+
variables:
278+
<<: [*VARS]
279+
RUN_MODEL: bert
280+
TP_SIZE: 1
281+
PP_SIZE: 2
282+
NUM_NODES: 1
283+
TIME_LIMIT: "30:00"
284+
TEST_LEVEL: L0
285+
286+
cleanup.selene:
287+
tags:
288+
- ssh_selene_runner
289+
stage: cleanup
290+
variables:
291+
<<: [*VARS]
292+
script:
293+
- set +e
294+
- NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | wc -l`
295+
# ${VAR:?} aborts the command if the CI path is unset or empty, so the glob can never expand to "/*" and feed rm -rf.
- find ${SELENE_ADLR_CI_PATH:?}/* -type d -ctime +20 | grep -v data | xargs rm -rf
296+
- echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene"
297+
allow_failure: true
298+
rules:
299+
- when: always

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
The following applies to all files unless otherwise noted:
22

3-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
44
#
55
# Redistribution and use in source and binary forms, with or without
66
# modification, are permitted provided that the following conditions

0 commit comments

Comments
 (0)