
Commit 3927ffe

[testing] reduce runtime of HunYuanMoEV1IntegrationTest:test_model_generation (huggingface#41373)
* fix
* fix
* fix

---------

Co-authored-by: ydshieh <[email protected]>
1 parent 7164924 commit 3927ffe

File tree

1 file changed

+6
-3
lines changed


tests/models/hunyuan_v1_moe/test_modeling_hunyuan_v1_moe.py

Lines changed: 6 additions & 3 deletions
@@ -16,6 +16,7 @@
 import unittest
 
 import pytest
+import torch
 from parameterized import parameterized
 
 from transformers import is_torch_available
@@ -99,10 +100,12 @@ def tearDown(self):
     def test_model_generation(self):
         # we will compele this when model file change over
         # pass
-        EXPECTED_ANSWER = "\nOkay, I need to write a short summary about the benefits of regular exercise. Let me start by recalling what I know. First,"
+        EXPECTED_ANSWER = "\nOkay, I need to write a"
         prompt = "Write a short summary of the benefits of regular exercise"
         tokenizer = AutoTokenizer.from_pretrained("tencent/Hunyuan-A13B-Instruct")
-        model = AutoModelForCausalLM.from_pretrained("tencent/Hunyuan-A13B-Instruct", device_map="auto")
+        model = AutoModelForCausalLM.from_pretrained(
+            "tencent/Hunyuan-A13B-Instruct", device_map="auto", dtype=torch.bfloat16
+        )
         messages = [
             {"role": "user", "content": prompt},
         ]
@@ -112,7 +115,7 @@ def test_model_generation(self):
             add_generation_prompt=True,
             return_tensors="pt",
         )
-        generated_ids = model.generate(tokenized_chat.to(model.device), max_new_tokens=30, top_k=1)
+        generated_ids = model.generate(tokenized_chat.to(model.device), max_new_tokens=10, top_k=1)
         text = tokenizer.decode(generated_ids[0])
         output = text.split("<think>")[1]
         self.assertEqual(EXPECTED_ANSWER, output)
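For reference, below is a minimal standalone sketch of the speedup pattern this commit applies: load the checkpoint in bfloat16 and cap max_new_tokens so the greedy decode only produces the short prefix being checked. The model id, prompt, and generate arguments mirror the test; the final print stands in for the test's assertion, and running it assumes enough accelerator memory for the A13B checkpoint.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "tencent/Hunyuan-A13B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# bfloat16 roughly halves memory and bandwidth versus a float32 load,
# which is where most of the runtime saving comes from
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", dtype=torch.bfloat16
)

messages = [
    {"role": "user", "content": "Write a short summary of the benefits of regular exercise"},
]
tokenized_chat = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

# top_k=1 keeps decoding greedy and deterministic; 10 new tokens are enough
# to check the expected prefix, so the test no longer pays for 30
generated_ids = model.generate(tokenized_chat.to(model.device), max_new_tokens=10, top_k=1)
print(tokenizer.decode(generated_ids[0]))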
