Skip to content

Commit d8b13cb

Browse files
authored
Disable FP8 in Mcore integration test on older GPUs (NVIDIA#1357)
Debug Mcore integration test. Avoid FP8 on Ampere and older GPUs. Generate synthetic data instead of depending on external data. Signed-off-by: Tim Moon <[email protected]>
1 parent d978e80 commit d8b13cb

File tree

3 files changed

+21
-4
lines changed

3 files changed

+21
-4
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Megatron-LM
2+
vocab.json
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
#version: 0.2

qa/L1_pytorch_mcore_integration/test.sh

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,27 @@ set -e
88
: ${TE_PATH:=/opt/transformerengine}
99
: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}
1010

11+
# Check whether FP8 is supported
12+
DEVICE_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | sed 's/[^0-9]//g')
13+
if [[ ${DEVICE_ARCH} -ge 89 ]]; then
14+
WITH_FP8=1
15+
fi
16+
1117
# Download Megatron-LM if needed
1218
if [ ! -d "${MCORE_PATH}" ]; then
1319
pushd $(dirname ${MCORE_PATH})
1420
git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
1521
popd
1622
fi
1723

24+
# Create mock vocab
25+
VOCAB_FILE=${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json
26+
printf "" > ${VOCAB_FILE}
27+
printf "{" >> ${VOCAB_FILE}
28+
printf "\"<|endoftext|>\": 0" >> ${VOCAB_FILE}
29+
seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }' >> ${VOCAB_FILE}
30+
printf "}" >> ${VOCAB_FILE}
31+
1832
# Megatron-LM invocation
1933
COMMAND="
2034
NVTE_TORCH_COMPILE=0
@@ -40,17 +54,17 @@ ${MCORE_PATH}/pretrain_gpt.py
4054
--hidden-size 128
4155
--num-attention-heads 8
4256
--seq-length 128
43-
--max-position-embeddings 2048
57+
--max-position-embeddings 128
4458
--micro-batch-size 1
4559
--global-batch-size 8
4660
--train-iters 10
4761
--eval-iters 10
4862
--lr 1e-4
4963
--mock-data
50-
--vocab-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-vocab.json
51-
--merge-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-merges.txt
64+
--vocab-file ${VOCAB_FILE}
65+
--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
5266
--transformer-impl transformer_engine
53-
--fp8-format hybrid
67+
${WITH_FP8:+--fp8-format hybrid}
5468
"
5569
COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')
5670

0 commit comments

Comments
 (0)