
Commit 4c6fc32

minor fix

Signed-off-by: fhl <[email protected]>
1 parent: d5943f0

4 files changed, 6 insertions(+), 4 deletions(-)


vllm/compilation/cuda_piecewise_backend.py

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,8 @@ class ConcreteSizeEntry:
     # during capture, and check if they are the same during replay
     input_addresses: Optional[list[int]] = None
 
+    usage_type: Optional[str] = None
+
 
 class CUDAPiecewiseBackend:
 
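ConcreteSizeEntry is a dataclass, so the new usage_type field simply defaults to None until some caller tags the entry. A minimal standalone sketch of the pattern, assuming a hypothetical tagging call site (the label value is illustrative, not from the diff):

from dataclasses import dataclass
from typing import Optional

@dataclass
class ConcreteSizeEntry:
    # Only the fields visible in the diff; the real class has more.
    input_addresses: Optional[list[int]] = None
    usage_type: Optional[str] = None  # added by this commit

# Hypothetical call site: label the entry so later debug output can
# report what this cudagraph entry is used for.
entry = ConcreteSizeEntry()
entry.usage_type = "piecewise"  # illustrative label, an assumption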
vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py

Lines changed: 2 additions & 2 deletions
@@ -66,7 +66,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         def transform_w_q(x):
             assert isinstance(x, BasevLLMParameter)
             permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
-            breakpoint()
+
             x.data = ops.gptq_marlin_repack(x.data.contiguous(),
                                             perm=layer.g_idx_sort_indices,
                                             size_k=c.partition_weight_shape[0],
@@ -105,7 +105,7 @@ def transform_w_s(x):
                 num_bits=c.weight_type.size_bits))
         else:
             setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))
-        breakpoint()
+
         self._transform_param(layer, self.w_q_name, transform_w_q)
         self._transform_param(layer, self.w_s_name, transform_w_s)
 
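Both removed lines are leftover breakpoint() debug traps; left in place, every weight-loading pass would drop into pdb and hang non-interactive runs. If a trap ever has to survive on a shared branch, Python's PEP 553 escape hatch makes it switchable; a minimal sketch:

import os

# The default sys.breakpointhook consults PYTHONBREAKPOINT on every
# call (PEP 553): "0" turns breakpoint() into a no-op.
os.environ.setdefault("PYTHONBREAKPOINT", "0")

breakpoint()  # no-op unless PYTHONBREAKPOINT is unset or overridden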
vllm/model_executor/layers/quantization/utils/marlin_utils.py

Lines changed: 1 addition & 1 deletion
@@ -252,7 +252,7 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
                           group_size: int) -> torch.Tensor:
 
     scale_perm, scale_perm_single = get_scale_perms()
-    breakpoint()
+
     if group_size < size_k and group_size != -1:
         s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
     else:
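The line kept by this hunk is a plain fancy-indexing permutation: reshape the scales into rows of len(scale_perm) columns, then reorder the columns. A self-contained sketch with a toy permutation standing in for get_scale_perms() (the real permutation encodes Marlin's scale layout):

import torch

scale_perm = [1, 0, 3, 2]  # toy stand-in for get_scale_perms()

s = torch.arange(8.0)  # pretend: 8 group scales
s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
print(s)
# tensor([[1., 0., 3., 2.],
#         [5., 4., 7., 6.]])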

vllm/v1/attention/backends/flash_attn.py

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ def _get_sliding_window_configs(
 
 class FlashAttentionMetadataBuilder(
         AttentionMetadataBuilder[FlashAttentionMetadata]):
-    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3
+    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() >= 2
 
     def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
                  block_table: BlockTable):
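full_cudagraph_supported is a ClassVar, so the version check runs once when the class body is executed and can be read without building an instance; the commit widens the gate from FlashAttention 3 only (== 3) to versions 2 and up (>= 2). A minimal sketch of the pattern, with the version helper stubbed out and a hypothetical check site:

from typing import ClassVar

def get_flash_attn_version() -> int:
    return 2  # stub; the real helper probes the installed flash-attn

class FlashAttentionMetadataBuilder:
    # Evaluated once at class-definition time, as in the diff.
    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() >= 2

# Hypothetical check site: gate full CUDA graph capture without
# instantiating the builder.
if FlashAttentionMetadataBuilder.full_cudagraph_supported:
    print("full cudagraph capture enabled")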
