
Commit 0cc8bb6

[CI] update lm_eval with latest version for vllm args update (#42)
Bring back the lm_eval CI by:
1. using the latest lm_eval version, needed because of the vLLM API update (EleutherAI/lm-evaluation-harness#3176)
2. updating test_common.py to work with the latest lm_eval

Signed-off-by: Chendi.Xue <[email protected]>
1 parent f1d3f04 commit 0cc8bb6

4 files changed: +39 -36 lines changed

.github/workflows/hourly-ci.yaml

Lines changed: 2 additions & 1 deletion
@@ -88,7 +88,8 @@ jobs:
 RUN git checkout main

 # Pinning versions in requirements might be good practice for CI consistency
-RUN pip install lm_eval pytest pytest_asyncio
+RUN pip install pytest pytest_asyncio
+RUN pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

.github/workflows/pre-merge.yaml

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@ jobs:
 RUN git checkout main

 # Pinning versions in requirements might be good practice for CI consistency
-RUN pip install lm_eval pytest pytest_asyncio
+RUN pip install pytest pytest_asyncio
+RUN pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

tests/full_tests/ci_gsm8k_tests.sh

Lines changed: 34 additions & 34 deletions
@@ -50,39 +50,39 @@ if [ $? -ne 0 ]; then
 fi
 echo "Test with deepseek_v2 + inc passed"

-# # gsm8k test
-# # used to check HPUattn + MLP
-# echo "Testing GSM8K on ganite-8b"
-# echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/granite-8b.yaml
-# VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/granite-8b.yaml
-# if [ $? -ne 0 ]; then
-# echo "Error: Test failed for granite-8b" >&2
-# exit -1
-# fi
-# echo "Test with granite-8b passed"
+# gsm8k test
+# used to check HPUattn + MLP
+echo "Testing GSM8K on ganite-8b"
+echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/granite-8b.yaml
+VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/granite-8b.yaml
+if [ $? -ne 0 ]; then
+echo "Error: Test failed for granite-8b" >&2
+exit -1
+fi
+echo "Test with granite-8b passed"

-# # used to check MLA + MOE
-# echo "Testing GSM8K on deepseek v2 lite"
-# # deepseek-R1
-# echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml
-# VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml
-# if [ $? -ne 0 ]; then
-# echo "Error: Test failed for deepseek R1" >&2
-# exit -1
-# fi
-# echo "Test with deepseek R1 passed"
+# used to check MLA + MOE
+echo "Testing GSM8K on deepseek v2 lite"
+# deepseek-R1
+echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml
+VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/DeepSeek-V2-Lite-chat.yaml
+if [ $? -ne 0 ]; then
+echo "Error: Test failed for deepseek R1" >&2
+exit -1
+fi
+echo "Test with deepseek R1 passed"

-# # used to check HPUATTN + MOE + ExpertParallel
-# echo "Testing GSM8K on QWEN3-30B-A3B"
-# echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
-# VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
-# pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
-# if [ $? -ne 0 ]; then
-# echo "Error: Test failed for QWEN3-30B-A3B" >&2
-# exit -1
-# fi
-# echo "Test with QWEN3-30B-A3B passed"
+# used to check HPUATTN + MOE + ExpertParallel
+echo "Testing GSM8K on QWEN3-30B-A3B"
+echo VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
+VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 TP_SIZE=2 \
+pytest -v -s vllm-gaudi/tests/models/language/generation/test_common.py --model_card_path vllm-gaudi/tests/full_tests/model_cards/Qwen3-30B-A3B.yaml
+if [ $? -ne 0 ]; then
+echo "Error: Test failed for QWEN3-30B-A3B" >&2
+exit -1
+fi
+echo "Test with QWEN3-30B-A3B passed"

tests/models/language/generation/test_common.py

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ def launch_lm_eval(eval_config):
         'batch_size': max_num_seqs,
         'enable_expert_parallel': eval_config.get('enable_expert_parallel',
                                                   False),
+        'chat_template_args': eval_config.get('chat_template_args', {}),
     }
     if kv_cache_dtype is not None:
         model_args['kv_cache_dtype'] = kv_cache_dtype
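
The new 'chat_template_args' entry simply rides along in the model_args dict that launch_lm_eval assembles. Below is a hedged sketch of how such a dict is typically handed to lm_eval's Python API with the vllm backend: the eval_config values and the model name are illustrative placeholders rather than values from the commit, and exactly which kwargs the vllm wrapper accepts depends on the installed lm-evaluation-harness version, which is why the workflows now install it from git.

import lm_eval

# Illustrative stand-in for an eval_config parsed from a model card YAML;
# the model name and numbers are placeholders, not taken from the commit.
eval_config = {
    "model_name": "org/placeholder-model",
    "max_num_seqs": 32,
    "enable_expert_parallel": False,
    "chat_template_args": {},
}

model_args = {
    "pretrained": eval_config["model_name"],
    "batch_size": eval_config["max_num_seqs"],
    "enable_expert_parallel": eval_config.get("enable_expert_parallel", False),
    # New in this commit: forwarded unchanged to lm_eval's vllm wrapper.
    "chat_template_args": eval_config.get("chat_template_args", {}),
}

# simple_evaluate instantiates the registered "vllm" backend with model_args;
# a mismatch between lm_eval and the vLLM engine args is the failure mode
# this commit works around by tracking the latest lm_eval.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=model_args,
    tasks=["gsm8k"],
)
print(results["results"]["gsm8k"])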
