
Commit 9c7a5b4

fix AutoModel and bump transformers version to 4.45 (meta-llama#686)

1 parent c7c229d
File tree

3 files changed: 5 additions & 3 deletions

.watchman-cookie-devgpu003.cco3.facebook.com-3137776-2746

Whitespace-only changes.

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ black[jupyter]
 datasets
 fire
 peft
-transformers>=4.43.1
+transformers>=4.45.1
 sentencepiece
 py7zr
 scipy
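
The pin moves from 4.43.1 to 4.45.1 because the 4.45 release line of transformers is the one that ships the Mllama (Llama 3.2 vision) model classes imported in finetuning.py below. As a minimal sketch, not part of this commit and assuming the packaging library is installed, a script could fail fast on an older install:

# Minimal sketch (not from this commit): fail fast if the installed
# transformers is older than the new pin.
from importlib.metadata import version
from packaging.version import Version

if Version(version("transformers")) < Version("4.45.1"):
    raise RuntimeError("transformers>=4.45.1 is required; "
                       "upgrade with: pip install 'transformers>=4.45.1'")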

src/llama_recipes/finetuning.py

Lines changed: 4 additions & 2 deletions
@@ -21,8 +21,8 @@
     AutoTokenizer,
     BitsAndBytesConfig,
     AutoProcessor,
+    LlamaForCausalLM,
     MllamaForConditionalGeneration,
-    AutoModel,
 )
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
 from transformers.models.mllama.modeling_mllama import MllamaSelfAttentionDecoderLayer,MllamaCrossAttentionDecoderLayer,MllamaVisionEncoderLayer
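
The import swap matters because AutoModel maps a Llama checkpoint to the bare LlamaModel backbone, which has no language-modeling head and returns hidden states rather than next-token logits; LlamaForCausalLM adds the lm_head needed to compute the causal-LM loss during finetuning. A minimal sketch of the difference, not from the repo, with an illustrative checkpoint id:

# Minimal sketch (illustrative checkpoint id, not from this commit).
from transformers import AutoModel, LlamaForCausalLM

ckpt = "meta-llama/Llama-3.1-8B"  # assumption: any Llama-architecture checkpoint

backbone = AutoModel.from_pretrained(ckpt)         # LlamaModel: hidden states, no lm_head
lm_model = LlamaForCausalLM.from_pretrained(ckpt)  # adds lm_head, returns logits and loss

print(type(backbone).__name__)  # -> LlamaModel
print(type(lm_model).__name__)  # -> LlamaForCausalLM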
@@ -132,9 +132,11 @@ def main(**kwargs):
         )
         processor = AutoProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
         processor.tokenizer.padding_side='right'
+        model.supports_gradient_checkpointing = True
+        model.language_model.supports_gradient_checkpointing = True
     elif config.model_type == "llama":
         is_vision = False
-        model = AutoModel.from_pretrained(
+        model = LlamaForCausalLM.from_pretrained(
             train_config.model_name,
             quantization_config=bnb_config,
             use_cache=use_cache,
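
The two supports_gradient_checkpointing flags are forced to True presumably because PreTrainedModel.gradient_checkpointing_enable() raises a ValueError when a model reports no support, and the Mllama wrapper in transformers 4.45 does not advertise it even though its decoder layers can be checkpointed. A sketch of that pattern, assuming an illustrative mllama checkpoint and mirroring the diff above:

# Sketch of the workaround (illustrative checkpoint id; not a definitive
# reading of the commit's intent).
from transformers import MllamaForConditionalGeneration

model = MllamaForConditionalGeneration.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision"  # assumption: any mllama checkpoint
)

# gradient_checkpointing_enable() raises ValueError when a model reports
# supports_gradient_checkpointing = False, so force the flag on the wrapper
# and its inner language model before enabling.
model.supports_gradient_checkpointing = True
model.language_model.supports_gradient_checkpointing = True
model.gradient_checkpointing_enable()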

0 commit comments