Skip to content

Commit a32563a

Browse files
author
wangzaijun
committed
fix
1 parent 0f7dedb commit a32563a

File tree

3 files changed

+15
-11
lines changed

3 files changed

+15
-11
lines changed

test/kernel/llama_gqa_decode_vsm_tuning.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -93,12 +93,14 @@ def inner_alloc_func(shape, dtype=torch.float32, device="cuda"):
9393

9494
graph.replay()
9595

96-
torch.cuda.synchronize()
97-
start = time.time()
98-
# graph.replay()
99-
torch.cuda.synchronize()
96+
start_event = torch.cuda.Event(enable_timing=True)
97+
end_event = torch.cuda.Event(enable_timing=True)
98+
start_event.record()
99+
graph.replay()
100+
end_event.record()
101+
end_event.synchronize()
100102

101-
cost_time = (time.time() - start) * 1000
103+
cost_time = start_event.elapsed_time(end_event=end_event)
102104

103105
logger.info(f"fp16 {test_seq_len} cost time: {cost_time} ms")
104106
return cost_time

test/kernel/llama_gqa_diverse_decode_stage1_tuning.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -122,12 +122,14 @@ def test_decode_attentions(
122122

123123
graph.replay()
124124

125-
torch.cuda.synchronize()
126-
start = time.time()
127-
# graph.replay()
128-
torch.cuda.synchronize()
125+
start_event = torch.cuda.Event(enable_timing=True)
126+
end_event = torch.cuda.Event(enable_timing=True)
127+
start_event.record()
128+
graph.replay()
129+
end_event.record()
130+
end_event.synchronize()
129131

130-
cost_time = (time.time() - start) * 1000
132+
cost_time = start_event.elapsed_time(end_event=end_event)
131133

132134
logger.info(f"fp16 {test_seq_len} cost time: {cost_time} ms")
133135
return cost_time

unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse_stage1.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ def setup_tensors():
1515
max_batch_group_size = 4
1616
quant_group_size = 8
1717

18-
test_dtype = torch.float32
18+
test_dtype = torch.bfloat16
1919

2020
kv_shape = (batch_size * seq_len, kv_head_num, head_dim)
2121
kv_scale_shape = (batch_size * seq_len, kv_head_num, head_dim // quant_group_size)

0 commit comments

Comments (0)