Commit a067bd0

Authored by anandhu-eng, github-actions[bot], and arjunsuresh
Support uploaded quantised models for SCC25 (#612)
* Add support for using existing quantized llama2 70b models
* Use r2-downloader tool for model download
* Fix path issue for r2-downloader
* Copy llama2 checkpoint to scratch space
* Update preprocessed openorca dataset path
* Make sure target data path is created before softlink
* Initialize git lfs when building TRT-LLM
* Update download URL from staging to production
* Update meta.yaml

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Arjun Suresh <[email protected]>
1 parent 4eca4b8 commit a067bd0

File tree

7 files changed: +95 additions, -70 deletions

script/app-mlperf-inference-nvidia/customize.py

Lines changed: 15 additions & 47 deletions
@@ -318,18 +318,6 @@ def preprocess(i):
             model_path = fp8_model_path
 
         elif "llama2" in env["MLC_MODEL"]:
-            # path to which the data file is present
-            target_data_path = os.path.join(
-                env['MLPERF_SCRATCH_PATH'],
-                'data',
-                'llama2-70b')
-            # path to the dataset file
-            target_data_file_path = os.path.join(
-                env['MLPERF_SCRATCH_PATH'],
-                'data',
-                'llama2-70b',
-                'open_orca_gpt4_tokenized_llama.sampled_24576.pkl')
-
             preprocessed_data_for_accuracy_checker = os.path.join(
                 env['MLPERF_SCRATCH_PATH'],
                 'preprocessed_data',
@@ -345,41 +333,13 @@ def preprocess(i):
 
             tmp_tp_size = env['MLC_NVIDIA_TP_SIZE']
             tmp_pp_size = env['MLC_NVIDIA_PP_SIZE']
-            if tmp_tp_size == "1":
-                fp8_model_path = os.path.join(
-                    env['MLPERF_SCRATCH_PATH'],
-                    'models',
-                    'Llama2',
-                    'fp8-quantized-ammo',
-                    f'llama2-70b-chat-hf-tp{tmp_tp_size}pp1-fp8-02072024')
-            else:
-                fp8_model_path = os.path.join(
-                    env['MLPERF_SCRATCH_PATH'],
-                    'models',
-                    'Llama2',
-                    'fp8-quantized-ammo',
-                    f'llama2-70b-chat-hf-tp{tmp_tp_size}pp{tmp_pp_size}-fp8')
-
-            # check the presence of validation dataset
-            if not os.path.exists(target_data_file_path):
-                if env.get('MLC_DATASET_OPENORCA_PREPROCESSED_PATH', '') == '':
-                    return {
-                        'return': 1, 'error': 'Llama2 70B validation dataset not present.'}
-                if not os.path.exists(target_data_path):
-                    cmds.append(f"mkdir -p {target_data_path}")
-                cmds.append(
-                    f"ln -sf {env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH']} {target_data_file_path}")
 
-            # check the presence of calibration dataset
-            if not env.get('LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH'):
-                if not os.path.exists(target_calibration_data_file_path):
-                    if env.get('MLC_DATASET_OPENORCA_CALIBRATION_PATH', '') == '':
-                        return {
-                            'return': 1, 'error': 'Llama2 70B calibration dataset not present.'}
-                    if not os.path.exists(target_data_path):
-                        cmds.append(f"mkdir -p {target_data_path}")
-                    cmds.append(
-                        f"ln -sf {env['MLC_DATASET_OPENORCA_CALIBRATION_PATH']} {target_calibration_data_file_path}")
+            fp8_model_path = os.path.join(
+                env['MLPERF_SCRATCH_PATH'],
+                'models',
+                'Llama2',
+                'fp8-quantized-ammo',
+                f'llama-2-70b-chat-hf-tp{tmp_tp_size}pp{tmp_pp_size}-fp8')
 
             if not os.path.exists(preprocessed_data_for_accuracy_checker):
@@ -476,7 +436,15 @@ def preprocess(i):
                 'open_orca',
                 'input_ids_padded.npy')
             if not os.path.exists(target_preprocessed_data_path):
-                cmds.append(f"make preprocess_data BENCHMARKS='{model_name}'")
+                cmds.append(
+                    f"mkdir -p {os.path.dirname(target_preprocessed_data_path)}")
+                if env.get('MLC_DATASET_OPENORCA_PREPROCESSED_PATH'):
+                    cmds.append(
+                        f"ln -sf {env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH']} {os.path.join(env['MLPERF_SCRATCH_PATH'], 'preprocessed_data', 'open_orca')}"
+                    )
+                else:
+                    cmds.append(
+                        f"make preprocess_data BENCHMARKS='{model_name}'")
             else:
                 cmds.append(f"make preprocess_data BENCHMARKS='{model_name}'")
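Taken together, these customize.py changes drop the per-dataset symlink bookkeeping, collapse the tp-size special case into one parameterized FP8 checkpoint path, and reuse an already-preprocessed OpenOrca directory when one is supplied instead of always rebuilding it. A minimal sketch of the resulting control flow, with hypothetical env values (the real script queues shell commands in cmds for later execution):

    import os

    def plan_llama2_preprocessing(env, model_name="llama2-70b"):
        """Sketch of the post-commit logic: returns the FP8 model path and
        the shell commands the harness would queue (keys mirror the diff)."""
        cmds = []
        scratch = env['MLPERF_SCRATCH_PATH']
        # One parameterized FP8 checkpoint directory for every tp/pp combination.
        fp8_model_path = os.path.join(
            scratch, 'models', 'Llama2', 'fp8-quantized-ammo',
            f"llama-2-70b-chat-hf-tp{env['MLC_NVIDIA_TP_SIZE']}"
            f"pp{env['MLC_NVIDIA_PP_SIZE']}-fp8")
        target = os.path.join(scratch, 'preprocessed_data', 'open_orca',
                              'input_ids_padded.npy')
        if not os.path.exists(target):
            # Create the parent first so the softlink below cannot dangle.
            cmds.append(f"mkdir -p {os.path.dirname(target)}")
            if env.get('MLC_DATASET_OPENORCA_PREPROCESSED_PATH'):
                # Reuse an existing preprocessed dataset instead of rebuilding.
                cmds.append(
                    f"ln -sf {env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH']} "
                    f"{os.path.join(scratch, 'preprocessed_data', 'open_orca')}")
            else:
                cmds.append(f"make preprocess_data BENCHMARKS='{model_name}'")
        return fp8_model_path, cmds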

script/build-mlperf-inference-server-nvidia/run.sh

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,10 @@ CUR=$PWD
 
 cd ${MLC_MLPERF_INFERENCE_NVIDIA_CODE_PATH}
 
+if [[ ${BUILD_TRTLLM} == "1" ]]; then
+    git lfs install
+fi
+
 if [[ ${MLC_MAKE_CLEAN} == "yes" ]]; then
     make clean
 fi
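The guard runs git lfs install once before a TensorRT-LLM build so that LFS-tracked files resolve when the build later performs git lfs pull. A hedged sketch of how a caller might opt in (the BUILD_TRTLLM value and the direct invocation below are assumptions for illustration, not part of this commit; MLC normally sets these variables itself):

    import os
    import subprocess

    # Assumed invocation: export BUILD_TRTLLM=1 so run.sh initializes git-lfs
    # before the build step runs.
    env = dict(os.environ, BUILD_TRTLLM="1", MLC_MAKE_CLEAN="no")
    subprocess.run(["bash", "run.sh"], env=env, check=True)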

script/get-ml-model-llama2/customize.py

Lines changed: 23 additions & 2 deletions
@@ -8,7 +8,7 @@ def preprocess(i):
     os_info = i['os_info']
     env = i['env']
 
-    if env.get('MLC_TMP_ML_MODEL_PROVIDER', '') == 'nvidia':
+    if env.get('MLC_TMP_ML_MODEL_QUANTIZE_LOCALLY', '') == 'nvidia':
         if is_true(env.get('MLC_ML_MODEL_QUANTIZE_LOCALLY')):
             i['run_script_input']['script_name'] = 'run-nvidia'
             gpu_arch = int(
@@ -18,7 +18,28 @@ def preprocess(i):
             env['MLC_GPU_ARCH'] = gpu_arch
             env['MLC_TMP_REQUIRE_DOWNLOAD'] = 'no'
         else:
-            run_cmd = f"ln -sf {env['LLAMA2_CHECKPOINT_PATH']} {env['MLC_NVIDIA_MLPERF_SCRATCH_PATH']}/models/Llama2/fp8-quantized-ammo/llama-2-70b-chat-hf-tp{env['MLC_NVIDIA_TP_SIZE']}pp{env['MLC_NVIDIA_PP_SIZE']}-{env['MLC_ML_MODEL_PRECISION']}"
+            target_quantized_model_dir = os.path.join(
+                env['MLC_NVIDIA_MLPERF_SCRATCH_PATH'],
+                "models",
+                "Llama2",
+                "fp8-quantized-ammo",
+                f"llama-2-70b-chat-hf-tp{env['MLC_NVIDIA_TP_SIZE']}pp{env['MLC_NVIDIA_PP_SIZE']}-{env['MLC_ML_MODEL_PRECISION']}"
+            )
+
+            target_model_dir = os.path.join(
+                env['MLC_NVIDIA_MLPERF_SCRATCH_PATH'],
+                "models",
+                "Llama2",
+                "Llama-2-70b-chat-hf"
+            )
+
+            # Ensure target directory exists
+            os.makedirs(target_quantized_model_dir, exist_ok=True)
+            os.makedirs(target_model_dir, exist_ok=True)
+
+            run_cmd = f"cp -r {env['LLAMA2_QUANTIZED_CHECKPOINT_PATH']}/* {env['MLC_NVIDIA_MLPERF_SCRATCH_PATH']}/models/Llama2/fp8-quantized-ammo/llama-2-70b-chat-hf-tp{env['MLC_NVIDIA_TP_SIZE']}pp{env['MLC_NVIDIA_PP_SIZE']}-{env['MLC_ML_MODEL_PRECISION']}"
+            run_cmd += f" && cp -r {env['LLAMA2_CHECKPOINT_PATH']}/* {env['MLC_NVIDIA_MLPERF_SCRATCH_PATH']}/models/Llama2/Llama-2-70b-chat-hf"
 
             env['MLC_RUN_CMD'] = run_cmd
     else:
         path = env.get('LLAMA2_CHECKPOINT_PATH', '').strip()
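For the pre-quantized path, the script now stages two directories in the NVIDIA scratch space and copies the downloaded checkpoints into them rather than symlinking a single path: a copy survives cleanup of the download cache, where a symlink would dangle. A self-contained sketch with hypothetical env values (keys mirror the diff above):

    import os

    env = {
        'MLC_NVIDIA_MLPERF_SCRATCH_PATH': '/scratch',          # hypothetical
        'MLC_NVIDIA_TP_SIZE': '2',
        'MLC_NVIDIA_PP_SIZE': '1',
        'MLC_ML_MODEL_PRECISION': 'fp8',
        'LLAMA2_QUANTIZED_CHECKPOINT_PATH': '/downloads/llama2-70b-fp8',
        'LLAMA2_CHECKPOINT_PATH': '/downloads/Llama-2-70b-chat-hf',
    }
    quantized_dir = os.path.join(
        env['MLC_NVIDIA_MLPERF_SCRATCH_PATH'], 'models', 'Llama2',
        'fp8-quantized-ammo',
        f"llama-2-70b-chat-hf-tp{env['MLC_NVIDIA_TP_SIZE']}"
        f"pp{env['MLC_NVIDIA_PP_SIZE']}-{env['MLC_ML_MODEL_PRECISION']}")
    model_dir = os.path.join(env['MLC_NVIDIA_MLPERF_SCRATCH_PATH'],
                             'models', 'Llama2', 'Llama-2-70b-chat-hf')
    os.makedirs(quantized_dir, exist_ok=True)   # cp needs existing targets
    os.makedirs(model_dir, exist_ok=True)
    run_cmd = (f"cp -r {env['LLAMA2_QUANTIZED_CHECKPOINT_PATH']}/* {quantized_dir}"
               f" && cp -r {env['LLAMA2_CHECKPOINT_PATH']}/* {model_dir}")
    print(run_cmd)  # the harness executes this via MLC_RUN_CMD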

script/get-ml-model-llama2/meta.yaml

Lines changed: 31 additions & 8 deletions
@@ -51,13 +51,33 @@ tests:
 variations:
   pre-quantized:
     group: quantization
+    deps:
+      - enable_if_env:
+          MLC_TMP_ML_MODEL_PRE_QUANTIZED:
+            - 'yes'
+        skip_if_env:
+          LLAMA2_QUANTIZED_CHECKPOINT_PATH:
+            - 'yes'
+        env:
+          MLC_DOWNLOAD_FINAL_ENV_NAME: LLAMA2_QUANTIZED_CHECKPOINT_PATH
+          MLC_EXTRACT_FINAL_ENV_NAME: LLAMA2_QUANTIZED_CHECKPOINT_PATH
+        force_cache: true
+        extra_cache_tags: llama2,llama2-model,llama2-checkpoint,llama2-70b
+        names:
+          - dae-quantized-models
+        tags: download-and-extract
+        force_env_keys:
+          - MLC_OUTDIRNAME
+        update_tags_from_env_with_prefix:
+          _url.:
+            - MLC_DOWNLOAD_URL
     env:
-      MLC_ML_MODEL_PRE_QUANTIZED: 'yes'
+      MLC_TMP_ML_MODEL_PRE_QUANTIZED: 'yes'
   quantize-locally:
     default: true
     group: quantization
     env:
-      MLC_ML_MODEL_QUANTIZE_LOCALLY: 'yes'
+      MLC_TMP_ML_MODEL_QUANTIZE_LOCALLY: 'yes'
   amd:
     default_env:
       MLC_LLAMA2_QUANTIZATION_DEVICE: ''
@@ -157,8 +177,11 @@ variations:
     env:
       MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama-2-7b-chat-hf.uri
   mlc,r2-downloader,70b,pre-quantized,fp8:
+    add_deps_recursive:
+      dae-quantized-models:
+        tags: _r2-downloader
     env:
-      MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama-2-70b-chat-hf-tp<<MLC_NVIDIA_TP_SIZE>>pp<<MLC_NVIDIA_PP_SIZE>>-<<<MLC_ML_MODEL_PRECISION>>>.uri
+      MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama2-70b-chat-hf-tp<<<MLC_NVIDIA_TP_SIZE>>>pp<<<MLC_NVIDIA_PP_SIZE>>>-<<<MLC_ML_MODEL_PRECISION>>>.uri
   hf:
     group: download-source
     env:
@@ -212,6 +235,10 @@ variations:
       - tags: get,nvidia,scratch,space
         names:
          - mlperf-inference-nvidia-scratch-space
+      - env: {}
+        force_new_env_keys:
+          - LLAMA2_CHECKPOINT_PATH
+        tags: get,ml-model,llama2-70b,_fp32,_pytorch
     group: model-provider
   pytorch:
     default: true
@@ -242,7 +269,7 @@ variations:
       - tags: get,generic-python-lib,_package.compressed_tensors
   pytorch,fp32:
     env: {}
-  pytorch,nvidia,v5.0:
+  pytorch,nvidia,v5.0,quantize-locally:
     deps:
       - env:
           MLC_GIT_CHECKOUT_PATH_ENV_NAME: MLC_TENSORRT_LLM_CHECKOUT_PATH
@@ -258,10 +285,6 @@ variations:
           - cuda
         tags: get,cuda
       - tags: get,cuda-devices,_with-pycuda
-      - env: {}
-        force_new_env_keys:
-          - LLAMA2_CHECKPOINT_PATH
-        tags: get,ml-model,llama2-70b,_fp32,_pytorch
       - names:
           - nvidia-inference-common-code
         tags: get,nvidia,inference,common-code
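The URL fix matters because placeholders written with triple angle brackets are the ones that get expanded; the old tp/pp fields used double brackets and would have been passed through literally, and the object prefix changes from llama-2-70b to llama2-70b to match the production bucket. A rough sketch of the substitution semantics (illustrative only, not mlcflow's actual implementation):

    import re

    def expand(template: str, env: dict) -> str:
        """Replace <<<VAR>>> placeholders with env values (illustrative)."""
        return re.sub(r'<<<(\w+)>>>', lambda m: env[m.group(1)], template)

    url = ("https://llama2.mlcommons-storage.org/metadata/"
           "llama2-70b-chat-hf-tp<<<MLC_NVIDIA_TP_SIZE>>>"
           "pp<<<MLC_NVIDIA_PP_SIZE>>>-<<<MLC_ML_MODEL_PRECISION>>>.uri")
    env = {'MLC_NVIDIA_TP_SIZE': '2', 'MLC_NVIDIA_PP_SIZE': '1',
           'MLC_ML_MODEL_PRECISION': 'fp8'}
    print(expand(url, env))
    # https://llama2.mlcommons-storage.org/metadata/llama2-70b-chat-hf-tp2pp1-fp8.uri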

script/get-ml-model-llama2/run-nvidia.sh

Lines changed: 3 additions & 7 deletions
@@ -15,13 +15,9 @@ cd ${MLC_TENSORRT_LLM_CHECKOUT_PATH}
 make -C docker build
 test $? -eq 0 || exit $?
 
-if [ "${MLC_NVIDIA_TP_SIZE}" -eq 1 ]; then
-    RUN_CMD="bash -c 'git lfs install && git lfs pull && python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --use_ccache --benchmarks --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp${MLC_NVIDIA_PP_SIZE}-fp8-02072024 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE} --pp_size ${MLC_NVIDIA_PP_SIZE} --calib_dataset=/calib_dataset'"
-    echo "$RUN_CMD"
-else
-    RUN_CMD="bash -c 'git lfs install && git lfs pull && python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --use_ccache --benchmarks --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp${MLC_NVIDIA_PP_SIZE}-fp8 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE} --pp_size ${MLC_NVIDIA_PP_SIZE} --calib_dataset=/calib_dataset'"
-    echo "$RUN_CMD"
-fi
+RUN_CMD="bash -c 'git lfs install && git lfs pull && python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --use_ccache --benchmarks --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama-2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp${MLC_NVIDIA_PP_SIZE}-fp8 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE} --pp_size ${MLC_NVIDIA_PP_SIZE} --calib_dataset=/calib_dataset'"
+echo "$RUN_CMD"
+
 # TODO: check whether --device nvidia.com/gpu=all would work for docker
 DOCKER_RUN_ARGS=" -v ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}:/mnt -v ${MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH}:/calib_dataset -u $(id -u):$(id -g) --userns=keep-id --device nvidia.com/gpu=all -e NVIDIA_VISIBLE_DEVICES=all"
 export DOCKER_RUN_ARGS="$DOCKER_RUN_ARGS"
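Dropping the date-stamped -02072024 suffix and renaming the output from llama2-70b to llama-2-70b lets a single template name the quantizer output for every tp/pp combination, matching the fp8_model_path that customize.py now builds. A small illustrative check of the unified naming (values assumed):

    # Illustrative only: the unified output directory template used above.
    def output_dir(tp: int, pp: int) -> str:
        return (f"/mnt/models/Llama2/fp8-quantized-ammo/"
                f"llama-2-70b-chat-hf-tp{tp}pp{pp}-fp8")

    for tp, pp in [(1, 1), (2, 1), (4, 2)]:
        print(output_dir(tp, pp))  # e.g. .../llama-2-70b-chat-hf-tp1pp1-fp8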

script/get-preprocessed-dataset-openorca/customize.py

Lines changed: 16 additions & 6 deletions
@@ -51,22 +51,32 @@ def postprocess(i):
         env['PREPROCESSED_DATA_DIR'] = os.path.dirname(
             env['MLC_OPENORCA_PREPROCESSED_ROOT'])
     if is_true(env.get('MLC_DATASET_CALIBRATION', '')):
-        env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
-            env['MLC_OPENORCA_PREPROCESSED_ROOT'],
-            "open_orca_gpt4_tokenized_llama.calibration_1000.pkl")
         if env.get('MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER',
                    '') == "nvidia":
             env['MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH'] = os.path.join(
                 env['MLC_OPENORCA_PREPROCESSED_ROOT'],
                 "preprocessed_data",
                 "mlperf_llama2_openorca_calibration_1k")
-        env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
-        env['MLC_DATASET_OPENORCA_CALIBRATION_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
+        else:
+            env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
+                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+                "open_orca_gpt4_tokenized_llama.calibration_1000.pkl")
+            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
+            env['MLC_DATASET_OPENORCA_CALIBRATION_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
     else:
         env['MLC_DATASET_PREPROCESSED_PATH'] = os.path.join(
             env['MLC_OPENORCA_PREPROCESSED_ROOT'],
             "open_orca_gpt4_tokenized_llama.sampled_24576.pkl")
-        env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
+        if env.get('MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER',
+                   '') == "nvidia":
+            env['MLC_DATASET_OPENORCA_NVIDIA_PREPROCESSED_PATH'] = os.path.join(
+                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+                "preprocessed")
+            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_OPENORCA_NVIDIA_PREPROCESSED_PATH']
+        else:
+            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
+        # The openorca mlc preprocessed dataset is used in nvidia
+        # implementation for checking accuracy
         env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
 
     return {'return': 0}
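The postprocess step now picks the cached path by provider: NVIDIA preprocessing exposes directory-level artifacts, while the default flow keeps pointing at the pickled files. A condensed, runnable sketch of the new branching (env values hypothetical; is_true simplified to a literal comparison):

    import os

    def resolve_dataset_paths(env: dict) -> dict:
        """Sketch of the post-commit branching; env keys mirror the diff."""
        root = env['MLC_OPENORCA_PREPROCESSED_ROOT']
        nvidia = env.get('MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER', '') == 'nvidia'
        if env.get('MLC_DATASET_CALIBRATION') == 'yes':
            if nvidia:
                env['MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH'] = os.path.join(
                    root, 'preprocessed_data', 'mlperf_llama2_openorca_calibration_1k')
            else:
                env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
                    root, 'open_orca_gpt4_tokenized_llama.calibration_1000.pkl')
                env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
        else:
            env['MLC_DATASET_PREPROCESSED_PATH'] = os.path.join(
                root, 'open_orca_gpt4_tokenized_llama.sampled_24576.pkl')
            if nvidia:
                env['MLC_DATASET_OPENORCA_NVIDIA_PREPROCESSED_PATH'] = os.path.join(
                    root, 'preprocessed')
                env['MLC_GET_DEPENDENT_CACHED_PATH'] = \
                    env['MLC_DATASET_OPENORCA_NVIDIA_PREPROCESSED_PATH']
            else:
                env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
        return env

    print(resolve_dataset_paths({
        'MLC_OPENORCA_PREPROCESSED_ROOT': '/cache/openorca',       # hypothetical
        'MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER': 'nvidia',
    })['MLC_GET_DEPENDENT_CACHED_PATH'])  # /cache/openorca/preprocessed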

script/get-preprocessed-dataset-openorca/meta.yaml

Lines changed: 3 additions & 0 deletions
@@ -144,3 +144,6 @@ variations:
   nvidia,calibration:
     new_env_keys:
       - MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH
+  nvidia,validation:
+    new_env_keys:
+      - MLC_DATASET_OPENORCA_NVIDIA_PREPROCESSED_PATH
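Here new_env_keys acts as an export whitelist: only listed keys leave the script and become visible to dependents, so the new nvidia,validation entry mirrors the calibration one and lets MLC_DATASET_OPENORCA_NVIDIA_PREPROCESSED_PATH cross the script boundary. A toy sketch of that filtering (an assumption about the flow's behavior, for illustration only):

    def export_env(produced_env: dict, new_env_keys: list) -> dict:
        """Keep only whitelisted keys (illustrative; not the flow's code)."""
        return {k: v for k, v in produced_env.items()
                if any(k.startswith(key) for key in new_env_keys)}

    exported = export_env(
        {'MLC_DATASET_OPENORCA_NVIDIA_PREPROCESSED_PATH': '/cache/preprocessed',
         'MLC_TMP_SCRATCH': '/tmp/x'},                             # hypothetical
        ['MLC_DATASET_OPENORCA_NVIDIA_PREPROCESSED_PATH'])
    print(exported)  # only the whitelisted path survives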
