
Commit 4c6fc32

minor fix

Signed-off-by: fhl <[email protected]>
1 parent: d5943f0

4 files changed, 6 insertions(+), 4 deletions(-)


vllm/compilation/cuda_piecewise_backend.py

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,8 @@ class ConcreteSizeEntry:
     # during capture, and check if they are the same during replay
     input_addresses: Optional[list[int]] = None
 
+    usage_type: Optional[str] = None
+
 
 class CUDAPiecewiseBackend:
 
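ConcreteSizeEntry is a dataclass, so the new usage_type field simply defaults to None until some caller tags the entry. A minimal standalone sketch of the pattern, assuming a hypothetical tagging call site (the label value is illustrative, not from the diff):

from dataclasses import dataclass
from typing import Optional

@dataclass
class ConcreteSizeEntry:
    # Only the fields visible in the diff; the real class has more.
    input_addresses: Optional[list[int]] = None
    usage_type: Optional[str] = None  # added by this commit

# Hypothetical call site: label the entry so later debug output can
# report what this cudagraph entry is used for.
entry = ConcreteSizeEntry()
entry.usage_type = "piecewise"  # illustrative label, an assumption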
vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py

Lines changed: 2 additions & 2 deletions
@@ -66,7 +66,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
-            breakpoint()
+
             x.data = ops.gptq_marlin_repack(x.data.contiguous(),
                                             perm=layer.g_idx_sort_indices,
                                             size_k=c.partition_weight_shape[0],
@@ -105,7 +105,7 @@ def transform_w_s(x):
                 num_bits=c.weight_type.size_bits))
         else:
             setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))
-        breakpoint()
+
         self._transform_param(layer, self.w_q_name, transform_w_q)
         self._transform_param(layer, self.w_s_name, transform_w_s)
 
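Both removed lines are leftover breakpoint() debug traps; left in place, every weight-loading pass would drop into pdb and hang non-interactive runs. If a trap ever has to survive on a shared branch, Python's PEP 553 escape hatch makes it switchable; a minimal sketch:

import os

# The default sys.breakpointhook consults PYTHONBREAKPOINT on every
# call (PEP 553): "0" turns breakpoint() into a no-op.
os.environ.setdefault("PYTHONBREAKPOINT", "0")

breakpoint()  # no-op unless PYTHONBREAKPOINT is unset or overridden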
vllm/model_executor/layers/quantization/utils/marlin_utils.py

Lines changed: 1 addition & 1 deletion
@@ -252,7 +252,7 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
                           group_size: int) -> torch.Tensor:
 
     scale_perm, scale_perm_single = get_scale_perms()
-    breakpoint()
+
     if group_size < size_k and group_size != -1:
         s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
     else:
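The line kept by this hunk is a plain fancy-indexing permutation: reshape the scales into rows of len(scale_perm) columns, then reorder the columns. A self-contained sketch with a toy permutation standing in for get_scale_perms() (the real permutation encodes Marlin's scale layout):

import torch

scale_perm = [1, 0, 3, 2]  # toy stand-in for get_scale_perms()

s = torch.arange(8.0)  # pretend: 8 group scales
s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
print(s)
# tensor([[1., 0., 3., 2.],
#         [5., 4., 7., 6.]])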

vllm/v1/attention/backends/flash_attn.py

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ def _get_sliding_window_configs(
 
 class FlashAttentionMetadataBuilder(
         AttentionMetadataBuilder[FlashAttentionMetadata]):
-    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3
+    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() >= 2
 
     def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
                  block_table: BlockTable):
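full_cudagraph_supported is a ClassVar, so the version check runs once when the class body is executed and can be read without building an instance; the commit widens the gate from FlashAttention 3 only (== 3) to versions 2 and up (>= 2). A minimal sketch of the pattern, with the version helper stubbed out and a hypothetical check site:

from typing import ClassVar

def get_flash_attn_version() -> int:
    return 2  # stub; the real helper probes the installed flash-attn

class FlashAttentionMetadataBuilder:
    # Evaluated once at class-definition time, as in the diff.
    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() >= 2

# Hypothetical check site: gate full CUDA graph capture without
# instantiating the builder.
if FlashAttentionMetadataBuilder.full_cudagraph_supported:
    print("full cudagraph capture enabled")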
