Skip to content

Commit 1feed99

Browse files
committed
fix: Handle no max_batch_weight case for exllamav2 GPTQ
1 parent d723faa commit 1feed99

File tree

1 file changed

+4
-1
lines changed

1 file changed

+4
-1
lines changed

server/text_generation_server/server.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -284,7 +284,10 @@ async def serve_inner(
         elif EXLLAMA_VERSION == "2":
             # NOTE: We're assuming that in this case, max_batch_weight == max_batch_tokens
             # This will likely need to change soon when we rework the batching parameters
-            create_exllama_buffers(max_batch_weight)
+            max_batch_tokens = max_batch_weight if max_batch_weight is not None else (
+                max_batch_size * max_sequence_length
+            )
+            create_exllama_buffers(max_batch_tokens)
         for _, submodule in model.model.named_modules():
             if isinstance(submodule, Ex4bitLinearV2):
                 submodule.post_init()  # make q matrix and set scratch space
```

Commit comments: 0