
Commit 7dbe456

njhill, tjohnson31415, and joerunde authored
Update transformers library (#81)
Signed-off-by: Nick Hill <[email protected]>
Signed-off-by: Travis Johnson <[email protected]>
Signed-off-by: Joe Runde <[email protected]>
Co-authored-by: Travis Johnson <[email protected]>
Co-authored-by: Joe Runde <[email protected]>
1 parent a7b41b6 commit 7dbe456

File tree

5 files changed (+58, -47 lines)

Dockerfile

Lines changed: 6 additions & 0 deletions
@@ -164,6 +164,9 @@ RUN cd server && \
     make gen-server && \
     pip install ".[accelerate]" --no-cache-dir
 
+# temp: install newer transformers lib that optimum clashes with
+RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
+
 # Patch codegen model changes into transformers
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py

@@ -287,6 +290,9 @@ COPY server server
 # Ref: https://onnxruntime.ai/docs/install/#install-onnx-runtime-gpu-cuda-12x
 RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir --extra-index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
 
+# temp: install newer transformers lib that optimum clashes with
+RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
+
 # Patch codegen model changes into transformers 4.35
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
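The same two-line override appears in both build stages: pip installs transformers 4.40.0 and tokenizers 0.19.1 after the project install so the newer versions win over whatever was resolved alongside optimum. A post-install sanity check along these lines (hypothetical, not part of this commit) would catch the pin being silently undone:

# Hypothetical check, not in this commit: confirm the pip override
# survived the earlier install steps in the final image.
import tokenizers
import transformers

assert transformers.__version__ == "4.40.0", transformers.__version__
assert tokenizers.__version__ == "0.19.1", tokenizers.__version__
print("transformers/tokenizers pins intact")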

server/poetry.lock

Lines changed: 40 additions & 40 deletions
Some generated files are not rendered by default.

server/pyproject.toml

Lines changed: 5 additions & 4 deletions
@@ -22,14 +22,15 @@ safetensors = "^0.4.3"
 sentencepiece = "^0.2.0"
 datasets = { version = "^2.15.0", optional = true }
 texttable = { version = "^1.7.0", optional = true }
-transformers = "4.38.2"
-optimum = { version = "^1.18.0", extras = ["onnxruntime-gpu"], optional = true }
-onnxruntime = { version = "^1.17.1", optional = true }
+# optimum 1.19 clashes with this atm
+# transformers = "4.40.0"
+optimum = { version = "^1.19.0", extras = ["onnxruntime-gpu"], optional = true }
+onnxruntime = { version = "^1.17.3", optional = true }
 onnxruntime-gpu = { version = "^1.17.1", optional = true }
 onnx = { version = "^1.16.0", optional = true }
 einops = "^0.7.0"
 ibm-fms = { version = "^0.0", optional = true }
-fms-extras = {git = "https://github.com/foundation-model-stack/fms-extras", rev = "fdb1636de4261fd4102da659ab45d3fcc33fe8ef", optional = true}
+fms-extras = { git = "https://github.com/foundation-model-stack/fms-extras", rev = "fdb1636de4261fd4102da659ab45d3fcc33fe8ef", optional = true }
 
 # Explicitly install some transitive dependencies to avoid CVEs
 jinja2 = ">=3.1.3"
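The commented-out transformers pin records why it moved: optimum 1.19 clashes with transformers 4.40.0 for now, so the version is forced at image build time rather than in the Poetry dependency set. To see the constraint optimum actually declares, a sketch like this (not from the commit) works in the built environment:

# Sketch, not from the commit: print optimum's declared transformers
# requirement to see the range the 4.40.0 pin runs up against.
from importlib.metadata import requires

for req in requires("optimum") or []:
    if req.lower().startswith("transformers"):
        print(req)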

server/tests/test_logit_processors.py

Lines changed: 2 additions & 1 deletion
@@ -55,7 +55,8 @@ def test_alignment_repetition_penalty_logits_processor():
         dtype=torch.float32,
         device=None,
     )
-    v_warped = vectorized_proc(input_ids=INPUT_IDS, scores=FULL_SCORES)
+    # Vectorized penalty happens in place; clone the score tensor!
+    v_warped = vectorized_proc(input_ids=INPUT_IDS, scores=FULL_SCORES.clone())
     # apply each penalty one at a time using the nonvectorized warper
     s_warped = []
     for penalty, logits, ids in zip(penalties, FULL_SCORES, INPUT_IDS):
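The clone matters because the vectorized processor mutates the scores tensor it is handed; without it, the nonvectorized comparison loop below would read already-penalized values back out of FULL_SCORES. A standalone illustration of the aliasing (toy tensor, not the test's fixtures):

import torch

scores = torch.tensor([[1.0, 2.0, 3.0]])

alias = scores              # same storage: in-place edits are shared
alias[0, 1] /= 2.0
assert scores[0, 1] == 1.0  # the original tensor was mutated too

safe = scores.clone()       # independent copy
safe[0, 2] /= 2.0
assert scores[0, 2] == 3.0  # original left intact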

server/text_generation_server/models/causal_lm.py

Lines changed: 5 additions & 2 deletions
@@ -182,7 +182,10 @@ def from_pb(
                 inputs_embeds[i, -input_length:-orig_length] = p
                 # Update attention mask with virtual prefix tokens
                 attention_mask[i, -input_length-padding_right_offset:-padding_right_offset] = 1
-            input_ids = None
+            # input_ids could be set to None here but this causes a problem in latest
+            # transformers prepare_inputs_for_generation impls.
+            # See https://github.com/huggingface/transformers/pull/29821
+            input_ids = all_input_ids
         else:
             input_ids = all_input_ids
             inputs_embeds = None
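Keeping input_ids populated sidesteps a failure in newer transformers: per the linked PR, prepare_inputs_for_generation implementations now index into input_ids (for example, to trim it against the KV cache length), which raises if it is None. A minimal sketch of the failure mode (the slicing pattern is an assumption, not the exact transformers code):

# Why input_ids = None breaks: recent implementations slice input_ids
# against the cache length before each forward pass.
input_ids = None
cache_length = 4

try:
    input_ids = input_ids[:, cache_length:]
except TypeError as err:
    print(err)  # 'NoneType' object is not subscriptable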
@@ -268,7 +271,7 @@ def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch":
             # Create empty tensor
             # input_ids is always of shape [batch_size, 1]
             # We do not need to pad it
-            if input_ids is None:
+            if input_ids is None or input_ids.shape[-1] != 1:
                 input_ids = batch.input_ids.new_empty((total_batch_size, 1))
             # Copy to correct indices
             input_ids[start_index:end_index] = batch.input_ids
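Since from_pb can now leave the full prompt in input_ids on the prompt-tuning path, the comment's invariant that input_ids is always [batch_size, 1] no longer holds, and the widened check re-creates the buffer whenever the carried tensor has a longer last dimension. A small sketch of just that guard (toy shapes, assumed semantics):

import torch

# input_ids may now be the full prompt, shape [batch_size, seq_len],
# rather than the usual single-column [batch_size, 1].
input_ids = torch.tensor([[11, 12, 13]])  # shape [1, 3]
total_batch_size = 2

if input_ids is None or input_ids.shape[-1] != 1:
    # Fall back to a fresh [total_batch_size, 1] buffer, as the
    # concatenate path above does.
    input_ids = input_ids.new_empty((total_batch_size, 1))

assert input_ids.shape == (2, 1)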
