Commit 57afa0b

use AutoModel
1 parent 2730bca commit 57afa0b

File tree

2 files changed: +4 −4 lines changed

recipes/quickstart/finetuning/finetune_vision_model.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 ## Fine-Tuning Meta Llama Multi Modal Models recipe
 This recipe steps you through how to finetune a Llama 3.2 vision model on the OCR VQA task using the [OCRVQA](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron/viewer/ocrvqa?row=0) dataset.
 
-**Disclaimer**: As our vision models already have a very good OCR ability, here we just use the OCRVQA dataset to demonstrate the steps needed for fine-tuning our vision models.
+**Disclaimer**: As our vision models already have a very good OCR ability, here we just use the OCRVQA dataset only for demonstration purposes of the required steps for fine-tuning our vision models with llama-recipes.
 
 ### Fine-tuning steps
 
src/llama_recipes/finetuning.py

Lines changed: 3 additions & 3 deletions
@@ -20,9 +20,9 @@
     AutoConfig,
     AutoTokenizer,
     BitsAndBytesConfig,
-    LlamaForCausalLM,
     AutoProcessor,
-    MllamaForConditionalGeneration
+    MllamaForConditionalGeneration,
+    AutoModel,
 )
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
 from transformers.models.mllama.modeling_mllama import MllamaSelfAttentionDecoderLayer,MllamaCrossAttentionDecoderLayer,MllamaVisionEncoderLayer
@@ -134,7 +134,7 @@ def main(**kwargs):
         processor.tokenizer.padding_side='right'
     elif config.model_type == "llama":
         is_vision = False
-        model = LlamaForCausalLM.from_pretrained(
+        model = AutoModel.from_pretrained(
             train_config.model_name,
             quantization_config=bnb_config,
             use_cache=use_cache,
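
For context, here is a minimal sketch of the text-only loading path after this change. The checkpoint id and the BitsAndBytesConfig values below are illustrative placeholders, not taken from the repo; in finetuning.py the real values come from train_config and the surrounding setup code. The point of the change is that AutoModel.from_pretrained resolves the model class from the checkpoint's config instead of importing LlamaForCausalLM directly.

```python
# A minimal sketch of the loading path this commit touches; the model name
# and quantization settings are illustrative, not taken from the repo.
import torch
from transformers import AutoConfig, AutoModel, BitsAndBytesConfig

model_name = "meta-llama/Llama-3.1-8B"  # placeholder checkpoint id
config = AutoConfig.from_pretrained(model_name)

# Optional 4-bit quantization, standing in for the bnb_config argument in the diff.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

if config.model_type == "llama":
    # AutoModel picks the concrete model class from the checkpoint's config,
    # so no architecture-specific import is needed here.
    model = AutoModel.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        use_cache=False,
        torch_dtype=torch.bfloat16,
    )
```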
