1 parent d723faa commit 1feed99
server/text_generation_server/server.py
@@ -284,7 +284,10 @@ async def serve_inner(
     elif EXLLAMA_VERSION == "2":
         # NOTE: We're assuming that in this case, max_batch_weight == max_batch_tokens
         # This will likely need to change soon when we rework the batching parameters
-        create_exllama_buffers(max_batch_weight)
+        max_batch_tokens = max_batch_weight if max_batch_weight is not None else (
+            max_batch_size * max_sequence_length
+        )
+        create_exllama_buffers(max_batch_tokens)
     for _, submodule in model.model.named_modules():
         if isinstance(submodule, Ex4bitLinearV2):
             submodule.post_init()  # make q matrix and set scratch space
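In effect, the commit derives the token budget for the exllamav2 buffers from max_batch_weight when it is configured, and otherwise falls back to max_batch_size * max_sequence_length. Below is a minimal sketch of that fallback, assuming only the parameter names visible in the diff; the create_exllama_buffers stub is hypothetical and merely stands in for the server's real buffer allocation.

# Sketch of the fallback introduced by this commit; names taken from the diff,
# create_exllama_buffers is a hypothetical stand-in for the real allocation.
from typing import Optional


def create_exllama_buffers(max_batch_tokens: int) -> None:
    # Hypothetical stub: the real function sizes exllamav2 scratch buffers.
    print(f"allocating exllama buffers sized for {max_batch_tokens} tokens")


def resolve_max_batch_tokens(
    max_batch_weight: Optional[int],
    max_batch_size: int,
    max_sequence_length: int,
) -> int:
    # Use the configured batch weight when available; otherwise assume the
    # worst case of every sequence in the batch reaching the maximum length.
    if max_batch_weight is not None:
        return max_batch_weight
    return max_batch_size * max_sequence_length


# Example: no max_batch_weight configured, 4 sequences of up to 2048 tokens each.
create_exllama_buffers(resolve_max_batch_tokens(None, 4, 2048))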