Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .cd/tests/test_vllm_autocalc.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def test_build_context(monkeypatch, minimal_config):

def test_overwrite_params(monkeypatch, minimal_config):
monkeypatch.setenv("MODEL", "TEST_MODEL")
monkeypatch.setenv("PT_HPU_LAZY_MODE", "1")
monkeypatch.setenv("PT_HPU_LAZY_MODE", "0")
vg = VarsGenerator(**minimal_config)
vg.overwrite_params()
assert vg.context["PT_HPU_LAZY_MODE"] == 1
assert vg.context["PT_HPU_LAZY_MODE"] == 0


def test_context(monkeypatch, minimal_config):
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/create-release-branch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ jobs:
--build-arg VLLM_COMMIT_ARG=${{ needs.prepare-release-branch.outputs.commit_id }} \
-t hpu-plugin-v1-${{ needs.prepare-release-branch.outputs.tag_name }} \
-f - . <<EOF
FROM vault.habana.ai/gaudi-docker/1.23.0/ubuntu24.04/habanalabs/pytorch-installer-2.9.0:latest
FROM vault.habana.ai/gaudi-docker/1.23.0/ubuntu24.04/habanalabs/pytorch-upstream-installer-2.9.0:latest

COPY ./ /workspace/vllm-gaudi
WORKDIR /workspace
Expand Down Expand Up @@ -291,7 +291,6 @@ jobs:
-e HABANA_VISIBLE_DEVICES=all \
-e HF_HOME=/workspace/hf_cache \
-e VLLM_SKIP_WARMUP=true \
-e PT_HPU_LAZY_MODE=1 \
-e HF_TOKEN=${{ secrets.HF_TOKEN }} \
-v /mnt/hf_cache:/workspace/hf_cache \
hpu-plugin-v1-${{ needs.prepare-release-branch.outputs.tag_name }} \
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/hourly-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ jobs:
run: |
echo "Attempting to build Docker image..."
docker build --no-cache -t hpu-plugin-v1-test-env-hourly-ci -f - . <<EOF
FROM vault.habana.ai/gaudi-docker/1.23.0/ubuntu24.04/habanalabs/pytorch-installer-2.9.0:latest
FROM vault.habana.ai/gaudi-docker/1.23.0/ubuntu24.04/habanalabs/pytorch-upstream-installer-2.9.0:latest

COPY ./ /workspace/vllm-gaudi
WORKDIR /workspace
Expand Down Expand Up @@ -230,7 +230,6 @@ jobs:
-e HABANA_VISIBLE_DEVICES=all \
-e HF_HOME=/workspace/hf_cache \
-e VLLM_SKIP_WARMUP=true \
-e PT_HPU_LAZY_MODE=1 \
-e HF_TOKEN=${{ secrets.HF_TOKEN }} \
-v /mnt/hf_cache:/workspace/hf_cache \
hpu-plugin-v1-test-env-hourly-ci \
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/pre-merge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ jobs:
--build-arg VLLM_COMMIT_ARG=${{ env.TEST_VLLM_COMMIT }} \
-t hpu-plugin-v1-test-env-pre-merge-${{ needs.retrieve_head_sha.outputs.head_sha }} \
-f - . <<EOF
FROM vault.habana.ai/gaudi-docker/1.23.0/ubuntu24.04/habanalabs/pytorch-installer-2.9.0:latest
FROM vault.habana.ai/gaudi-docker/1.23.0/ubuntu24.04/habanalabs/pytorch-upstream-installer-2.9.0:latest

ARG VLLM_COMMIT_ARG

Expand Down Expand Up @@ -536,7 +536,6 @@ jobs:
-e HABANA_VISIBLE_DEVICES=all \
-e HF_HOME=/workspace/hf_cache \
-e VLLM_SKIP_WARMUP=true \
-e PT_HPU_LAZY_MODE=1 \
-e HF_TOKEN=${{ secrets.HF_TOKEN }} \
-v /mnt/hf_cache:/workspace/hf_cache \
hpu-plugin-v1-test-env-pre-merge-${{ needs.retrieve_head_sha.outputs.head_sha }} \
Expand Down
29 changes: 8 additions & 21 deletions .jenkins/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,18 @@ stages:
- name: gsm8k_small_g3_tp1
flavor: g3
command: >-
export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 1
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 1
- name: gsm8k_small_g3_tp2
flavor: g3.s
command: >-
export PT_HPU_LAZY_MODE=1 &&
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 2
- name: gsm8k_small_g2_tp1
flavor: g2
command: >-
export PT_HPU_LAZY_MODE=1 &&
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 1
- name: gsm8k_small_g2_tp2
flavor: g2.s
command: >-
export PT_HPU_LAZY_MODE=1 &&
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-small.txt -t 2
- name: full_test_suite
steps:
Expand All @@ -36,88 +33,78 @@ stages:
- name: gsm8k_medium_g3_tp1
flavor: g3
command: >-
export PT_HPU_LAZY_MODE=1 && cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-medium.txt -t 1
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-medium.txt -t 1
- name: gsm8k_medium_g3_tp2
flavor: g3.s
command: >-
export PT_HPU_LAZY_MODE=1 &&
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-medium.txt -t 2
# Chendi: Failed on G2, possibly because Qwen3-30B is too large
# - name: gsm8k_medium_g2_tp1
# flavor: g2
# command: >-
# export PT_HPU_LAZY_MODE=1 &&
# cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-medium.txt -t 1
- name: gsm8k_medium_g2_tp2
flavor: g2.s
command: >-
export PT_HPU_LAZY_MODE=1 &&
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-medium.txt -t 2
- name: gsm8k_large_g3_tp2
flavor: g3.s
command: >-
export PT_HPU_LAZY_MODE=1 &&
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 2
- name: gsm8k_large_g2_tp4
flavor: g2.m
command: >-
export PT_HPU_LAZY_MODE=1 &&
cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 4
# Chendi: commented out for now since this one takes the longest time and might fail for lack of resources
# - name: gsm8k_huge_g3_tp8
# flavor: g3.l
# command: >-
# export PT_HPU_LAZY_MODE=1 &&
# cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-huge.txt -t 8
- name: gsm8k_small_g3_tp1_fp8
flavor: g3
command: >-
cd .jenkins/lm-eval-harness &&
PT_HPU_LAZY_MODE=1
bash run-tests.sh -c configs/models-fp8-g3-tp1.txt -t 1
- name: gsm8k_small_g3_tp2_fp8
flavor: g3.s
command: >-
cd .jenkins/lm-eval-harness &&
PT_HPU_LAZY_MODE=1
cd .jenkins/lm-eval-harness &&
bash run-tests.sh -c configs/models-fp8.txt -t 2
- name: gsm8k_small_g3_tp1_fp8_unified_attn
flavor: g3
command: >-
cd .jenkins/lm-eval-harness &&
PT_HPU_LAZY_MODE=1
cd .jenkins/lm-eval-harness &&
VLLM_UNIFIED_ATTN=true
bash run-tests.sh -c configs/models-fp8-g3-tp1.txt -t 1
- name: gsm8k_small_g3_tp2_fp8_unified_attn
flavor: g3.s
command: >-
cd .jenkins/lm-eval-harness &&
PT_HPU_LAZY_MODE=1
VLLM_UNIFIED_ATTN=true
bash run-tests.sh -c configs/models-fp8.txt -t 2
- name: gsm8k_fp8_llama4_scout_g3_tp2_compressed_tensor
flavor: g3.s
command: >-
cd .jenkins/lm-eval-harness &&
VLLM_WEIGHT_LOAD_FORCE_SYNC=1 VLLM_CONTIGUOUS_PA=False PT_HPU_LAZY_MODE=1
VLLM_WEIGHT_LOAD_FORCE_SYNC=1 VLLM_CONTIGUOUS_PA=False
bash run-tests.sh -c configs/models-fp8-compressedtensor.txt -t 2
# Chendi: crash on model weight loading, need to fix
# - name: gsm8k_fp8_qwen3_30B_g3_tp1_block_scale_dynamic
# flavor: g3
# command: >-
# cd .jenkins/lm-eval-harness &&
# VLLM_CONTIGUOUS_PA=False PT_HPU_LAZY_MODE=1
# VLLM_CONTIGUOUS_PA=False
# bash run-tests.sh -c configs/models-fp8-blockfp8.txt -t 1
# Chendi: crash on model weight loading, need to fix
# - name: gsm8k_fp8_qwen3_30B_g3_tp1_block_scale_dequant
# flavor: g3
# command: >-
# cd .jenkins/lm-eval-harness &&
# VLLM_CONTIGUOUS_PA=False PT_HPU_LAZY_MODE=1 VLLM_HPU_FORCE_CHANNEL_FP8=0
# VLLM_CONTIGUOUS_PA=False VLLM_HPU_FORCE_CHANNEL_FP8=0
# bash run-tests.sh -c configs/models-fp8-blockfp8.txt -t 1
- name: multimodal_llama4_scout_g3_tp2_ep
flavor: g3.s
command: >-
cd .jenkins/vision &&
PT_HPU_LAZY_MODE=1 VLLM_WEIGHT_LOAD_FORCE_SYNC=1
VLLM_WEIGHT_LOAD_FORCE_SYNC=1
bash run-tests.sh -c configs/models-llama4-scout.txt -t 2
6 changes: 1 addition & 5 deletions calibration/calibrate_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,7 @@ create_quant_config() {

#note(kwisniewski98): mixtral models have attention masked (self_attn is blocklisted) to avoid a regression in accuracy
if [[ $model_name_lower =~ ^mixtral ]]; then
if [[ $PT_HPU_LAZY_MODE == 0 ]]; then
tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\", \"scale_format\": \"CONST\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"self_attn\", \"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}"
else
tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"self_attn\", \"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}"
fi
tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\", \"scale_format\": \"CONST\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"self_attn\", \"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}"
elif [[ $model_name_lower =~ ^deepseek ]]; then
tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\", \"scale_format\": \"scalar\", \"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"lm_head\", \"mlp\\\.gate\\\b\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}"
else
Expand Down
2 changes: 1 addition & 1 deletion docs/configuration/calibration/calibration_one_node.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Before you start, familiarize with [notes and recommendations](calibration.md#no
# OR
./calibrate_model.sh -m facebook/opt-125m -d dataset-processed.pkl -o inc/
# OR Calibrate DeepSeek models with dataset NeelNanda/pile-10k
PT_HPU_LAZY_MODE=1 ./calibrate_model.sh -m deepseek-ai/DeepSeek-R1 -d NeelNanda/pile-10k -o inc/ -t 8
./calibrate_model.sh -m deepseek-ai/DeepSeek-R1 -d NeelNanda/pile-10k -o inc/ -t 8
```

Where:
Expand Down
2 changes: 1 addition & 1 deletion docs/configuration/warm-up/managing_warm-up.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ VLLM_DECODE_BS_BUCKET_STEP=128 \
VLLM_DECODE_BS_BUCKET_MAX=128 \
VLLM_PROMPT_SEQ_BUCKET_MAX=1024 \
VLLM_DECODE_BLOCK_BUCKET_MAX=1024 \
PT_HPU_WEIGHT_SHARING=0 PT_HPU_MAX_COMPOUND_OP_SIZE=30 PT_HPU_LAZY_MODE=1 PT_HPU_ENABLE_LAZY_COLLECTIVES=true vllm serve meta-llama/Llama-3.1-8B-instruct -tp 1 --weights-load-device cpu --max-model-len 8192
PT_HPU_WEIGHT_SHARING=0 PT_HPU_MAX_COMPOUND_OP_SIZE=30 PT_HPU_ENABLE_LAZY_COLLECTIVES=true vllm serve meta-llama/Llama-3.1-8B-instruct -tp 1 --weights-load-device cpu --max-model-len 8192
```

This results in the following:
Expand Down
1 change: 0 additions & 1 deletion examples/nixl/run_accuracy_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ MODELS=(
#)

export VLLM_SKIP_WARMUP="true"
export PT_HPU_LAZY_MODE=1

NIXL_BUFFER_DEVICE=${NIXL_BUFFER_DEVICE:-"cpu"}
VLLM_NIXL_BACKEND=${VLLM_NIXL_BACKEND:-"UCX"}
Expand Down
1 change: 0 additions & 1 deletion examples/nixl/run_benchmark_profile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ MODELS=(
CMD="hl-prof-config --use-template profile_api --hw-trace off"
eval "$CMD &"
export VLLM_SKIP_WARMUP=True
export PT_HPU_LAZY_MODE=1
export HABANA_PROFILE=1
# Enable full vLLM Profiler and instruct where to save the profiling:
export VLLM_PROFILER_ENABLED=1
Expand Down
1 change: 0 additions & 1 deletion examples/nixl/run_benchmark_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ MODELS=(
"/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/"
)
#export VLLM_SKIP_WARMUP=True
export PT_HPU_LAZY_MODE=1
export VLLM_EXPONENTIAL_BUCKETING=False
#export VLLM_PROMPT_BS_BUCKET_MIN=1
#export VLLM_PROMPT_SEQ_BUCKET_MIN=1
Expand Down
1 change: 0 additions & 1 deletion examples/nixl/run_hpu_disagg_accuracy_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ MODELS=(
)

export VLLM_SKIP_WARMUP="true"
#export PT_HPU_LAZY_MODE=1

# Number of prefill and decode instances to create
NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1
Expand Down
Loading
Loading