
Commit 7dbe456

njhill, tjohnson31415, and joerunde authored
Update transformers library (#81)
Signed-off-by: Nick Hill <[email protected]>
Signed-off-by: Travis Johnson <[email protected]>
Signed-off-by: Joe Runde <[email protected]>
Co-authored-by: Travis Johnson <[email protected]>
Co-authored-by: Joe Runde <[email protected]>
1 parent a7b41b6 commit 7dbe456

File tree

5 files changed (+58, -47 lines)

Dockerfile

Lines changed: 6 additions & 0 deletions
@@ -164,6 +164,9 @@ RUN cd server && \
     make gen-server && \
     pip install ".[accelerate]" --no-cache-dir
 
+# temp: install newer transformers lib that optimum clashes with
+RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
+
 # Patch codegen model changes into transformers
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py

@@ -287,6 +290,9 @@ COPY server server
 # Ref: https://onnxruntime.ai/docs/install/#install-onnx-runtime-gpu-cuda-12x
 RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir --extra-index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
 
+# temp: install newer transformers lib that optimum clashes with
+RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
+
 # Patch codegen model changes into transformers 4.35
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
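The same two-line override appears in both build stages: pip installs transformers 4.40.0 and tokenizers 0.19.1 after the project install so the newer versions win over whatever was resolved alongside optimum. A post-install sanity check along these lines (hypothetical, not part of this commit) would catch the pin being silently undone:

# Hypothetical check, not in this commit: confirm the pip override
# survived the earlier install steps in the final image.
import tokenizers
import transformers

assert transformers.__version__ == "4.40.0", transformers.__version__
assert tokenizers.__version__ == "0.19.1", tokenizers.__version__
print("transformers/tokenizers pins intact")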

server/poetry.lock

Lines changed: 40 additions & 40 deletions
Some generated files are not rendered by default.

server/pyproject.toml

Lines changed: 5 additions & 4 deletions
@@ -22,14 +22,15 @@ safetensors = "^0.4.3"
 sentencepiece = "^0.2.0"
 datasets = { version = "^2.15.0", optional = true }
 texttable = { version = "^1.7.0", optional = true }
-transformers = "4.38.2"
-optimum = { version = "^1.18.0", extras = ["onnxruntime-gpu"], optional = true }
-onnxruntime = { version = "^1.17.1", optional = true }
+# optimum 1.19 clashes with this atm
+# transformers = "4.40.0"
+optimum = { version = "^1.19.0", extras = ["onnxruntime-gpu"], optional = true }
+onnxruntime = { version = "^1.17.3", optional = true }
 onnxruntime-gpu = { version = "^1.17.1", optional = true }
 onnx = { version = "^1.16.0", optional = true }
 einops = "^0.7.0"
 ibm-fms = { version = "^0.0", optional = true }
-fms-extras = {git = "https://github.com/foundation-model-stack/fms-extras", rev = "fdb1636de4261fd4102da659ab45d3fcc33fe8ef", optional = true}
+fms-extras = { git = "https://github.com/foundation-model-stack/fms-extras", rev = "fdb1636de4261fd4102da659ab45d3fcc33fe8ef", optional = true }
 
 # Explicitly install some transitive dependencies to avoid CVEs
 jinja2 = ">=3.1.3"
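The commented-out transformers pin records why it moved: optimum 1.19 clashes with transformers 4.40.0 for now, so the version is forced at image build time rather than in the Poetry dependency set. To see the constraint optimum actually declares, a sketch like this (not from the commit) works in the built environment:

# Sketch, not from the commit: print optimum's declared transformers
# requirement to see the range the 4.40.0 pin runs up against.
from importlib.metadata import requires

for req in requires("optimum") or []:
    if req.lower().startswith("transformers"):
        print(req)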

server/tests/test_logit_processors.py

Lines changed: 2 additions & 1 deletion
@@ -55,7 +55,8 @@ def test_alignment_repetition_penalty_logits_processor():
         dtype=torch.float32,
         device=None,
     )
-    v_warped = vectorized_proc(input_ids=INPUT_IDS, scores=FULL_SCORES)
+    # Vectorized penalty happens in place; clone the score tensor!
+    v_warped = vectorized_proc(input_ids=INPUT_IDS, scores=FULL_SCORES.clone())
     # apply each penalty one at a time using the nonvectorized warper
     s_warped = []
     for penalty, logits, ids in zip(penalties, FULL_SCORES, INPUT_IDS):
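The clone matters because the vectorized processor mutates the scores tensor it is handed; without it, the nonvectorized comparison loop below would read already-penalized values back out of FULL_SCORES. A standalone illustration of the aliasing (toy tensor, not the test's fixtures):

import torch

scores = torch.tensor([[1.0, 2.0, 3.0]])

alias = scores              # same storage: in-place edits are shared
alias[0, 1] /= 2.0
assert scores[0, 1] == 1.0  # the original tensor was mutated too

safe = scores.clone()       # independent copy
safe[0, 2] /= 2.0
assert scores[0, 2] == 3.0  # original left intact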

server/text_generation_server/models/causal_lm.py

Lines changed: 5 additions & 2 deletions
@@ -182,7 +182,10 @@ def from_pb(
                 inputs_embeds[i, -input_length:-orig_length] = p
                 # Update attention mask with virtual prefix tokens
                 attention_mask[i, -input_length-padding_right_offset:-padding_right_offset] = 1
-            input_ids = None
+            # input_ids could be set to None here but this causes a problem in latest
+            # transformers prepare_inputs_for_generation impls.
+            # See https://github.com/huggingface/transformers/pull/29821
+            input_ids = all_input_ids
         else:
             input_ids = all_input_ids
             inputs_embeds = None
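Keeping input_ids populated sidesteps a failure in newer transformers: per the linked PR, prepare_inputs_for_generation implementations now index into input_ids (for example, to trim it against the KV cache length), which raises if it is None. A minimal sketch of the failure mode (the slicing pattern is an assumption, not the exact transformers code):

# Why input_ids = None breaks: recent implementations slice input_ids
# against the cache length before each forward pass.
input_ids = None
cache_length = 4

try:
    input_ids = input_ids[:, cache_length:]
except TypeError as err:
    print(err)  # 'NoneType' object is not subscriptable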
@@ -268,7 +271,7 @@ def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch":
             # Create empty tensor
             # input_ids is always of shape [batch_size, 1]
             # We do not need to pad it
-            if input_ids is None:
+            if input_ids is None or input_ids.shape[-1] != 1:
                 input_ids = batch.input_ids.new_empty((total_batch_size, 1))
             # Copy to correct indices
             input_ids[start_index:end_index] = batch.input_ids
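Since from_pb can now leave the full prompt in input_ids on the prompt-tuning path, the comment's invariant that input_ids is always [batch_size, 1] no longer holds, and the widened check re-creates the buffer whenever the carried tensor has a longer last dimension. A small sketch of just that guard (toy shapes, assumed semantics):

import torch

# input_ids may now be the full prompt, shape [batch_size, seq_len],
# rather than the usual single-column [batch_size, 1].
input_ids = torch.tensor([[11, 12, 13]])  # shape [1, 3]
total_batch_size = 2

if input_ids is None or input_ids.shape[-1] != 1:
    # Fall back to a fresh [total_batch_size, 1] buffer, as the
    # concatenate path above does.
    input_ids = input_ids.new_empty((total_batch_size, 1))

assert input_ids.shape == (2, 1)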
