Skip to content

Commit 4d0b358

Browse files
authored
[405B] Set max_tokens to 2k (#2088)
* [405B] Set max_tokens to 2k. The maximum reference output length is under 2k tokens, so we have to limit generation outputs as well.
1 parent 421d20b commit 4d0b358

File tree

3 files changed

+3
-5
lines changed

3 files changed

+3
-5
lines changed

.github/workflows/build_wheels.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name: Build loadgen wheels and release them into PYPI
33
on:
44
release:
55
types: [published]
6-
6+
77
push:
88
branches:
99
- master

language/llama3.1-405b/SUT_VLLM.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def __init__(
7272
"top_p": 1,
7373
"top_k": 1,
7474
"seed": 42,
75-
"max_tokens": 20000,
75+
"max_tokens": 2000,
7676
"min_tokens": 2
7777
}
7878
self.sampling_params = SamplingParams(**gen_kwargs)

language/mixtral-8x7b/SUT.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ def put(self, value):
119119
self.first_token.put((value, self.response_ids[0]))
120120

121121
self.is_first_token = False
122-
123122

124123
self.tokens_cache.append(value)
125124

@@ -413,7 +412,7 @@ def process_queries(self):
413412

414413
batch_texts = [self.data_object.input_texts[qitem.index]]
415414
batch_ids = self.tokenizer.batch_encode_plus(
416-
batch_texts, return_tensors="pt", padding=True)
415+
batch_texts, return_tensors="pt", padding=True)
417416
batch_ids = batch_ids.to(self.device)
418417
_, length = batch_ids.input_ids.shape
419418

@@ -427,7 +426,6 @@ def process_queries(self):
427426
response_ids=[qitem.id],
428427
)
429428

430-
431429
_ = self.model.generate(
432430
**batch_ids,
433431
num_return_sequences=1,

0 commit comments

Comments
 (0)