
Commit 0cc8bb6

[CI] update lm_eval with latest version for vllm args update (#42)
Bring back the lm_eval CI by:
1. using the latest lm_eval version, needed because of the vLLM API update (EleutherAI/lm-evaluation-harness#3176)
2. updating test_common.py to work with the latest lm_eval

Signed-off-by: Chendi.Xue <[email protected]>
1 parent f1d3f04 commit 0cc8bb6

4 files changed: +39 -36 lines changed

.github/workflows/hourly-ci.yaml

Lines changed: 2 additions & 1 deletion
@@ -88,7 +88,8 @@ jobs:
 RUN git checkout main

 # Pinning versions in requirements might be good practice for CI consistency
-RUN pip install lm_eval pytest pytest_asyncio
+RUN pip install pytest pytest_asyncio
+RUN pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

.github/workflows/pre-merge.yaml

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@ jobs:
 RUN git checkout main

 # Pinning versions in requirements might be good practice for CI consistency
-RUN pip install lm_eval pytest pytest_asyncio
+RUN pip install pytest pytest_asyncio
+RUN pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

tests/full_tests/ci_gsm8k_tests.sh

Lines changed: 34 additions & 34 deletions
@@ -50,39 +50,39 @@ if [ $? -ne 0 ]; then
 fi
 echo "Test with deepseek_v2 + inc passed"

-# # gsm8k test
-# # used to check HPUattn + MLP
-# echo "Testing GSM8K on ganite-8b"
-# echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/granite-8b.yaml
-# VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/granite-8b.yaml
-# if [ $? -ne 0 ]; then
-# echo "Error: Test failed for granite-8b" >&2
-# exit -1
-# fi
-# echo "Test with granite-8b passed"
+# gsm8k test
+# used to check HPUattn + MLP
+echo "Testing GSM8K on ganite-8b"
+echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/granite-8b.yaml
+VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/granite-8b.yaml
+if [ $? -ne 0 ]; then
+echo "Error: Test failed for granite-8b" >&2
+exit -1
+fi
+echo "Test with granite-8b passed"

-# # used to check MLA + MOE
-# echo "Testing GSM8K on deepseek v2 lite"
-# # deepseek-R1
-# echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml
-# VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml
-# if [ $? -ne 0 ]; then
-# echo "Error: Test failed for deepseek R1" >&2
-# exit -1
-# fi
-# echo "Test with deepseek R1 passed"
+# used to check MLA + MOE
+echo "Testing GSM8K on deepseek v2 lite"
+# deepseek-R1
+echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml
+VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml
+if [ $? -ne 0 ]; then
+echo "Error: Test failed for deepseek R1" >&2
+exit -1
+fi
+echo "Test with deepseek R1 passed"

-# # used to check HPUATTN + MOE + ExpertParallel
-# echo "Testing GSM8K on QWEN3-30B-A3B"
-# echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
-# VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
-# if [ $? -ne 0 ]; then
-# echo "Error: Test failed for QWEN3-30B-A3B" >&2
-# exit -1
-# fi
-# echo "Test with QWEN3-30B-A3B passed"
+# used to check HPUATTN + MOE + ExpertParallel
+echo "Testing GSM8K on QWEN3-30B-A3B"
+echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
+VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
+if [ $? -ne 0 ]; then
+echo "Error: Test failed for QWEN3-30B-A3B" >&2
+exit -1
+fi
+echo "Test with QWEN3-30B-A3B passed"

tests/models/language/generation/test_common.py

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ def launch_lm_eval(eval_config):
         'batch_size': max_num_seqs,
         'enable_expert_parallel': eval_config.get('enable_expert_parallel',
                                                   False),
+        'chat_template_args': eval_config.get('chat_template_args', {}),
     }
     if kv_cache_dtype is not None:
         model_args['kv_cache_dtype'] = kv_cache_dtype
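
The new 'chat_template_args' entry simply rides along in the model_args dict that launch_lm_eval assembles. Below is a hedged sketch of how such a dict is typically handed to lm_eval's Python API with the vllm backend: the eval_config values and the model name are illustrative placeholders rather than values from the commit, and exactly which kwargs the vllm wrapper accepts depends on the installed lm-evaluation-harness version, which is why the workflows now install it from git.

import lm_eval

# Illustrative stand-in for an eval_config parsed from a model card YAML;
# the model name and numbers are placeholders, not taken from the commit.
eval_config = {
    "model_name": "org/placeholder-model",
    "max_num_seqs": 32,
    "enable_expert_parallel": False,
    "chat_template_args": {},
}

model_args = {
    "pretrained": eval_config["model_name"],
    "batch_size": eval_config["max_num_seqs"],
    "enable_expert_parallel": eval_config.get("enable_expert_parallel", False),
    # New in this commit: forwarded unchanged to lm_eval's vllm wrapper.
    "chat_template_args": eval_config.get("chat_template_args", {}),
}

# simple_evaluate instantiates the registered "vllm" backend with model_args;
# a mismatch between lm_eval and the vLLM engine args is the failure mode
# this commit works around by tracking the latest lm_eval.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=model_args,
    tasks=["gsm8k"],
)
print(results["results"]["gsm8k"])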
