diff --git a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile
index 987eefc5..97785937 100644
--- a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile
+++ b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile
@@ -1,8 +1,8 @@
 # Copyright (c) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:1.20.0-543
-
+# NOTE(review): ":latest" is a floating tag; pin the exact 1.21.0 build tag (as the 1.20.0 image did) for reproducible builds.
+FROM vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
 # Need node to build doc HTML. Taken from https://stackoverflow.com/a/67491580
 RUN apt-get update && apt-get install -y --no-install-recommends \
     software-properties-common \
@@ -12,15 +12,17 @@ RUN npm install n -g && \
 
 RUN python3 -m pip install --no-cache-dir --upgrade pip
 RUN python3 -m pip install --upgrade-strategy eager optimum[habana]
-RUN python3 -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
+RUN python3 -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.21.0
 RUN mkdir -p /workspace
 WORKDIR /workspace
-RUN git clone https://github.com/huggingface/optimum-habana && cd optimum-habana && git checkout v1.16.0
+RUN git clone https://github.com/huggingface/optimum-habana && cd optimum-habana && git checkout v1.18.0
 
 WORKDIR /workspace/optimum-habana/examples/text-generation
 RUN python3 -m pip install -r requirements.txt
 RUN python3 -m pip install -r requirements_lm_eval.txt
 COPY . .
-COPY Gaudi_1-20.json Gaudi.json
-COPY HQT_1-20.zip HQT.zip
+COPY Gaudi_1-21.json Gaudi.json
+COPY HQT_1-21.zip HQT.zip
 RUN python3 -m pip install -r requirements_bm.txt
+# NOTE(review): presumably read by the benchmark driver to skip the Llama2_70b entries in Gaudi.json - confirm against the consumer.
+ENV skip_llama2_70b=1
diff --git a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile.1-20 b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile.1-20
new file mode 100644
index 00000000..987eefc5
--- /dev/null
+++ b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Dockerfile.1-20
@@ -0,0 +1,26 @@
+# Copyright (c) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:1.20.0-543
+
+# Need node to build doc HTML. Taken from https://stackoverflow.com/a/67491580
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    software-properties-common \
+    npm
+RUN npm install n -g && \
+    n latest
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+RUN python3 -m pip install --upgrade-strategy eager optimum[habana]
+RUN python3 -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
+RUN mkdir -p /workspace
+WORKDIR /workspace
+RUN git clone https://github.com/huggingface/optimum-habana && cd optimum-habana && git checkout v1.16.0
+
+WORKDIR /workspace/optimum-habana/examples/text-generation
+RUN python3 -m pip install -r requirements.txt
+RUN python3 -m pip install -r requirements_lm_eval.txt
+COPY . .
+COPY Gaudi_1-20.json Gaudi.json +COPY HQT_1-20.zip HQT.zip +RUN python3 -m pip install -r requirements_bm.txt diff --git a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json new file mode 100644 index 00000000..7e82cdc3 --- /dev/null +++ b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/Gaudi_1-21.json @@ -0,0 +1,636 @@ +{ + "Gaudi3": [ + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 1 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "128", + "output_len": "128", + "bs": "1536", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1536 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "24631" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "128", + "output_len": "2048", + "bs": 
"768", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "21793" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "2048", + "output_len": "128", + "bs": "256", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2883" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "2048", + "output_len": "2048", + "bs": "256", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "9125" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + 
"input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "128", + "bs": "2048", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "5471" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "2048", + "bs": "450", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 450 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + 
"HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "6484" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "128", + "bs": "223", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 223 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "682" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "2048", + "bs": "175", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 175 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2931" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal 
--trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "128", + "bs": "4000", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "18317" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "2048", + "bs": "768", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "20943" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "128", + "bs": "512", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 
run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2303" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "2048", + "bs": "600", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --attn_batch_split 2 --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 600 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "10604" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + 
"ref_perf": "" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "128", + "bs": "3986", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 3986 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "16883" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "2048", + "bs": "2048", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 2048 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "25071" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "128", + "bs": "774", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 774 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + 
"env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "1890" + }, + { + "model": "Llama3.3_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "2048", + "bs": "719", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.3-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 719 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "11087" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "128", + "output_len": "128", + "bs": "2996", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 
--bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2996 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "3489" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "128", + "output_len": "2048", + "bs": "460", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 460 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "4974" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "2048", + "output_len": "128", + "bs": "195", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 195 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "397" + }, + { + "model": "Llama3.1_405b", + "num_cards": "8", + "input_len": "2048", + "output_len": "2048", + "bs": "180", + 
"run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-405B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 180 --flash_attention_causal_mask --book_source --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2212" + } + ], + "Gaudi2": [ + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-2-70b-hf --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "128", + "bs": "1750", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1750 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": 
"./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2935" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "2048", + "bs": "256", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "3963" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "128", + "bs": "95", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 95 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "318" + }, + { + "model": "Llama2_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "2048", + "bs": "159", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-2-70b-hf --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 159 
--flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "1767" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 1 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "128", + "output_len": "128", + "bs": "2816", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 2816 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 3", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "19907" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "128", + "output_len": "2048", + "bs": "512", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 
--bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 512 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "14866" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "2048", + "output_len": "128", + "bs": "179", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 179 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 3", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "2099" + }, + { + "model": "Llama3.1_8b", + "num_cards": "1", + "input_len": "2048", + "output_len": "2048", + "bs": "256", + "run_cmd": "python3 run_generation.py --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "6060" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache 
--trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "128", + "bs": "1792", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 1792 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "3421" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "128", + "output_len": "2048", + "bs": "256", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 256 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "3827" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "128", + "bs": "142", + "run_cmd": "python3 
../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 142 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "462" + }, + { + "model": "Llama3.1_70b", + "num_cards": "2", + "input_len": "2048", + "output_len": "2048", + "bs": "139", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 2 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 139 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "1665" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "", + "output_len": "", + "bs": "", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py -o acc_llama_quant.json --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --warmup 0 --use_hpu_graphs --use_kv_cache --trim_logits --batch_size 1 --bucket_size=128 --bucket_internal --trust_remote_code --tasks hellaswag lambada_openai piqa winogrande --bf16 --attn_softmax_bf16 --use_flash_attention --flash_attention_recompute --flash_attention_causal_mask", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_measure.json", + 
"TQDM_DISABLE": "1" + }, + "ref_perf": "" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "128", + "bs": "4000", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 128 --bf16 --batch_size 4000 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "10404" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "128", + "output_len": "2048", + "bs": "768", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 128 --bf16 --batch_size 768 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "13639" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "128", + "bs": "383", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 128 --max_input_tokens 2048 --bf16 --batch_size 383 --flash_attention_causal_mask 
--use_flash_attention --flash_attention_recompute --attn_batch_split 2", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "1553" + }, + { + "model": "Llama3.1_70b", + "num_cards": "8", + "input_len": "2048", + "output_len": "2048", + "bs": "476", + "run_cmd": "python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py --model_name_or_path meta-llama/Llama-3.1-70B-Instruct --attn_softmax_bf16 --trim_logits --warmup 2 --use_kv_cache --use_hpu_graphs --limit_hpu_graphs --bucket_size=128 --bucket_internal --max_new_tokens 2048 --max_input_tokens 2048 --bf16 --batch_size 476 --flash_attention_causal_mask --use_flash_attention --flash_attention_recompute", + "env_vars": { + "HF_DATASETS_TRUST_REMOTE_CODE": "true", + "QUANT_CONFIG": "./quantization_config/maxabs_quant.json", + "TQDM_DISABLE": "1" + }, + "ref_perf": "6715" + } + ] +} diff --git a/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/HQT_1-21.zip b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/HQT_1-21.zip new file mode 100644 index 00000000..e31ebce5 Binary files /dev/null and b/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8/HQT_1-21.zip differ