
Commit 6db9457

[Misc] Remove LoRA log (#15388)
Signed-off-by: Jee Jee Li <[email protected]>
1 parent 97cfa65

2 files changed: +5 -15 lines changed

vllm/config.py

Lines changed: 0 additions & 6 deletions

@@ -2373,12 +2373,6 @@ def verify_with_model_config(self, model_config: ModelConfig):
             self.lora_dtype = model_config.dtype
         elif isinstance(self.lora_dtype, str):
             self.lora_dtype = getattr(torch, self.lora_dtype)
-        if model_config.quantization and model_config.quantization not in [
-                "awq", "gptq"
-        ]:
-            # TODO support marlin
-            logger.warning("%s quantization is not tested with LoRA yet.",
-                           model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
         # Reminder: Please update docs/source/features/compatibility_matrix.md
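For context, the surviving lines resolve `lora_dtype`: `None`/`"auto"` fall back to the model's dtype, and a string such as `"float16"` is looked up as an attribute of the `torch` module. A minimal standalone sketch of that resolution (the helper name is hypothetical, and the `None`/`"auto"` condition is inferred from surrounding code not shown in the hunk):

import torch

def resolve_lora_dtype(lora_dtype, model_dtype: torch.dtype) -> torch.dtype:
    # Hypothetical helper mirroring the context lines above.
    if lora_dtype in (None, "auto"):
        return model_dtype                 # inherit the model's dtype
    if isinstance(lora_dtype, str):
        return getattr(torch, lora_dtype)  # e.g. "bfloat16" -> torch.bfloat16
    return lora_dtype                      # already a torch.dtype

assert resolve_lora_dtype("bfloat16", torch.float16) is torch.bfloat16
assert resolve_lora_dtype("auto", torch.float16) is torch.float16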

vllm/lora/punica_wrapper/punica_gpu.py

Lines changed: 5 additions & 9 deletions

@@ -78,10 +78,6 @@ def add_shrink(self, y: torch.Tensor, x: torch.Tensor,
                    ...], scale: float, **kwargs):
         """
         Performs GEMM for multiple slices of lora_a.
-        When `is_prefill is` true, it indicates that it is currently the
-        prefill stage, and the `_shrink_prefill` function should be called.
-        Otherwise, it is the decode stage, and the _shrink_decode function
-        should be called.
 
         Semantics:
           for i in range(len(lora_a_stacked)):
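The `Semantics:` block that remains in the docstring is the contract the fused kernel implements. A plain PyTorch sketch of that shrink step, with toy shapes that are assumptions rather than the kernel's actual layouts:

import torch

def add_shrink_reference(y: torch.Tensor, x: torch.Tensor,
                         lora_a_stacked: tuple,
                         scale: float) -> None:
    # Reference semantics from the docstring: one low-rank projection
    # per lora_a slice, accumulated in place into y[i].
    for i in range(len(lora_a_stacked)):
        y[i] += (x @ lora_a_stacked[i]) * scale

# Toy shapes (hypothetical): 2 slices, 4 tokens, hidden=8, rank=2.
x = torch.randn(4, 8)
lora_a_stacked = (torch.randn(8, 2), torch.randn(8, 2))
y = torch.zeros(2, 4, 2)
add_shrink_reference(y, x, lora_a_stacked, scale=0.5)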

@@ -129,7 +125,7 @@ def add_expand(self,
             lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]):
                 bias's weight
             output_slices (Tuple[int, ...]): Every slice's size
-            add_inputs (bool):  Defaults to True.
+            add_inputs (bool): Defaults to True.
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
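`add_expand` is the counterpart projection back up: each slice of the shrunken activations is multiplied by its `lora_b` weight and written into its own column range of `y`, with `add_inputs=True` accumulating into the existing values. A reference sketch under assumed shapes (the function and shapes are illustrative, not the wrapper's real signature):

import torch

def add_expand_reference(y, x, lora_b_stacked, output_slices, add_inputs=True):
    # x[i]: (num_tokens, rank); lora_b_stacked[i]: (rank, slice_size) -- assumed.
    offset = 0
    for i, size in enumerate(output_slices):
        out = x[i] @ lora_b_stacked[i]         # (num_tokens, size)
        if add_inputs:
            y[:, offset:offset + size] += out  # fold into the base output
        else:
            y[:, offset:offset + size] = out
        offset += size

y = torch.zeros(4, 6)
x = torch.randn(2, 4, 2)
add_expand_reference(y, x, (torch.randn(2, 4), torch.randn(2, 2)),
                     output_slices=(4, 2))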

@@ -226,7 +222,7 @@ def add_lora_linear(self,
 
         if buffer is None:
             r = lora_b_stacked[0].size(-1)
-            # We set the buffer to be float32 by default ,refer to:
+            # We set the buffer to be float32 by default, refer to:
             # https://github.com/triton-lang/triton/issues/1387
             buffer = torch.zeros(  # type: ignore
                 (len(output_slices), x.size(0), r),

@@ -268,16 +264,16 @@ def add_lora_logits(self,
             y (torch.Tensor): Output tensor.
             x (torch.Tensor): Input tensor.
             lora_a_stacked (torch.Tensor): lora_a's weights.
-            lora_b_stacked (torch.Tensor):lora_b's weights.
+            lora_b_stacked (torch.Tensor): lora_b's weights.
             scale (float): Scaling factor.
-            buffer (Optional[torch.Tensor]):Default to None.
+            buffer (Optional[torch.Tensor]): Default to None.
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
         x = x.view(-1, x.shape[-1])
         r = lora_b_stacked.size(-1)
         if buffer is None:
-            # We set the buffer to be float32 by default ,refer to:
+            # We set the buffer to be float32 by default, refer to:
             # https://github.com/triton-lang/triton/issues/1387
             buffer = torch.zeros((x.size(0), r),
                                  dtype=torch.float32,
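Both `add_lora_linear` and `add_lora_logits` follow the same two-stage pattern: shrink into an intermediate buffer, then expand back into `y`. The buffer defaults to float32 because of the Triton accumulation issue linked in the comments above. A hedged end-to-end sketch of that pattern (single slice, assumed shapes, not the wrapper's real API):

import torch

def lora_matmul_reference(y, x, lora_a, lora_b, scale=1.0):
    # Shrink: float32 intermediate buffer, echoing the diff's comment and
    # https://github.com/triton-lang/triton/issues/1387.
    buffer = torch.zeros((x.size(0), lora_a.size(-1)), dtype=torch.float32)
    buffer += (x.float() @ lora_a.float()) * scale
    # Expand: project back to the output width, cast to y's dtype, accumulate.
    y += (buffer @ lora_b.float()).to(y.dtype)
    return y

# Toy shapes (hypothetical): 4 tokens, hidden=8, rank=2, out=6.
y = torch.zeros(4, 6, dtype=torch.float16)
x = torch.randn(4, 8, dtype=torch.float16)
lora_matmul_reference(y, x, torch.randn(8, 2), torch.randn(2, 6))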
