Not working yet; need to create a dataloader function

wukaixingxp · wukaixingxp · commit bb990be96712 · 2024-09-21T12:41:30.000-07:00
diff --git a/recipes/quickstart/finetuning/datasets/vqa_dataset.py b/recipes/quickstart/finetuning/datasets/vqa_dataset.py
@@ -5,6 +5,7 @@
 import copy
 from datasets import load_dataset
 import itertools
+import torch
 # check system prompt token seq or user prompt token seq is in the current token list
 def check_header(targets,seq):
     for i in range(len(seq)-3):
@@ -20,13 +21,8 @@ def tokenize_dialog(dialog, images, processor):
     # If vocab size is above 128000, use the chat template to generate the tokens as it is from Llama 3 family models
     text_prompt = processor.apply_chat_template(dialog)
     #print("text_prompt",text_prompt)
-    batch = processor(images=images, text=text_prompt)
-    dialog_tokens = batch["input_ids"].tolist()[0]
-    #print("dialog_tokens",dialog_tokens)
-    #print("dialog_tokens",dialog_tokens)
-    attention_mask = batch["attention_mask"].tolist()[0]
-    #print("attention_mask",attention_mask)
-    labels = copy.copy(dialog_tokens)
+    batch = processor(images=images, text=text_prompt,padding = True, return_tensors="pt")    
+    labels = copy.copy(batch["input_ids"].tolist()[0])
     eot_indices = [i for i,n in enumerate(labels) if n == 128009]
     last_idx = 0
     # system prompt header "<|start_header_id|>system<|end_header_id|>" has been tokenized to [128006, 9125, 128007]
@@ -43,25 +39,34 @@ def tokenize_dialog(dialog, images, processor):
     assistant_header_seq = [128006, 78191, 128007]
     labels = replace_target(assistant_header_seq,labels)
     #print("labels",labels)
+    # print("pixel_values .shape",batch["pixel_values"].shape)
+    # print("batch_size, num_concurrent_media, num_tiles, num_channels, height, width = pixel_values.shape")
 
-
-    combined_tokens = {
-        # "input_ids": list(itertools.chain(*(t for t in dialog_tokens))),
-        # "labels": list(itertools.chain(*(t for t in labels_tokens))),
-        "input_ids": dialog_tokens,
-        "labels": labels,
-        "attention_mask": [1]*len(dialog_tokens),
-        "pixel_values": batch["pixel_values"].tolist()[0],
-        "image_sizes": batch["image_sizes"].tolist()[0]
-    }
+    batch["labels"] = torch.tensor(labels)
+    #pixel_values .shape torch.Size([1, 1, 4, 3, 560, 560])
+    batch["pixel_values"] = torch.squeeze(batch["pixel_values"], 1)
+    # pixel_values .shape torch.Size([1, 4, 3, 560, 560])
+    print("pixel_values .shape",batch["pixel_values"].shape)
+    # exit()
+    # combined_tokens = {
+    #     # "input_ids": list(itertools.chain(*(t for t in dialog_tokens))),
+    #     # "labels": list(itertools.chain(*(t for t in labels_tokens))),
+    #     "input_ids": dialog_tokens,
+    #     "labels": labels,
+    #     "attention_mask": [1]*len(dialog_tokens),
+    #     "pixel_values": batch["pixel_values"],
+    #     "aspect_ratio_ids": batch["aspect_ratio_ids"],
+    #     "aspect_ratio_mask": batch["aspect_ratio_mask"],
+    #     "cross_attention_mask": batch["cross_attention_mask"]
+    # }
     # input_ids =  list(itertools.chain(*(t for t in dialog_tokens))),
     # labels = list(itertools.chain(*(t for t in labels_tokens))),
     # attention_mask = [1]*len(list(itertools.chain(*(t for t in dialog_tokens)))),
     # pixel_values =  batch["pixel_values"],
     # image_sizes = batch["image_sizes"]
 #    print("combined_tokens",combined_tokens[image_sizes])
     
-    return combined_tokens
+    return batch
 def image_tokenize(sample, processor):
     processor.tokenizer.padding_side = "right" # during training, one always uses padding on the right
     images,sample_text = sample["images"],sample["messages"]
diff --git a/src/llama_recipes/finetuning.py b/src/llama_recipes/finetuning.py
@@ -26,11 +26,8 @@
     BitsAndBytesConfig,
     LlamaForCausalLM,
     LlamaConfig,
-    AutoConfig, 
-    AutoModel,
-    LlavaNextForConditionalGeneration,
-    LlavaNextProcessor
-
+    AutoProcessor, 
+    MllamaForConditionalGeneration
 )
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
 from transformers.models.clip.modeling_clip import CLIPEncoder, CLIPEncoderLayer
@@ -126,20 +123,32 @@ def main(**kwargs):
 
     # Load the pre-trained model and setup its configuration
     use_cache = False if train_config.enable_fsdp else None
-    model = LlavaNextForConditionalGeneration.from_pretrained(
+    is_vision = "11B" in train_config.model_name or "90B" in train_config.model_name
+    if is_vision:  # flag is always bound, so later `if is_vision:` checks work for text-only models too
+        model = MllamaForConditionalGeneration.from_pretrained(
         train_config.model_name,
         quantization_config=bnb_config,
         #use_cache=use_cache,
         attn_implementation="sdpa" if train_config.use_fast_kernels else None,
         device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
         torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
     )
+        processor = AutoProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
+        processor.tokenizer.padding_side='right'
+    else:
+        model = LlamaForCausalLM.from_pretrained(
+            train_config.model_name,
+            quantization_config=bnb_config,
+            use_cache=use_cache,
+            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
+            device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
+            torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
+        )
 
     # Load the tokenizer and add special tokens
     tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
     tokenizer.pad_token_id = tokenizer.eos_token_id
-    processor = LlavaNextProcessor.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
-    processor.tokenizer.padding_side='right'
+        
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
@@ -183,18 +192,16 @@ def main(**kwargs):
             device_id = torch.xpu.current_device()
         elif torch.cuda.is_available():
             device_id = torch.cuda.current_device()
-        # print(dir(model))
-        # for layer in model.named_children():
-        #     print(f"Layer: {layer}")
-            
-        # layernorm = model.CLIPVisionTransformer.CLIPEncoder.LayerNorm
-        # for name, param in layernorm.named_parameters():
-        #     print(f"Parameter: {name}, Shape: {param.shape}, Dtype: {param.dtype}")
-        # exit()
+        if train_config.use_peft:
+            wrapping_policy = my_auto_wrapping_policy
+        else:
+            if is_vision:
+                wrapping_policy = ModuleWrapPolicy([CLIPEncoderLayer, LlamaDecoderLayer])
+            else:
+                wrapping_policy = ModuleWrapPolicy([LlamaDecoderLayer])
         model = FSDP(
             model,
-            auto_wrap_policy= ModuleWrapPolicy([CLIPEncoderLayer, LlamaDecoderLayer]),
-            #auto_wrap_policy= my_auto_wrapping_policy, #if train_config.use_peft else wrapping_policy,
+            auto_wrap_policy= wrapping_policy,
             cpu_offload=CPUOffload(offload_params=True) if fsdp_config.fsdp_cpu_offload else None,
             mixed_precision=mixed_precision_policy if not fsdp_config.pure_bf16 else None,
             sharding_strategy=fsdp_config.sharding_strategy,
@@ -205,10 +212,9 @@ def main(**kwargs):
             param_init_fn=(lambda module: module.to_empty(device=torch.device("cuda"), recurse=False))
             if train_config.low_cpu_fsdp and rank != 0 else None,
         )
-        #print(model)
         if fsdp_config.fsdp_activation_checkpointing:            
             model.enable_input_require_grads()
-            model.gradient_checkpointing_enable()
+            #model.gradient_checkpointing_enable()
             apply_fsdp_checkpointing(model)                      
     elif not train_config.quantization and not train_config.enable_fsdp:
         if is_xpu_available():
@@ -217,23 +223,23 @@ def main(**kwargs):
             model.to("cuda")
 
     dataset_config = generate_dataset_config(train_config, kwargs)
+    if is_vision:
+        dataset_processer = processor
+    else:
+        dataset_processer = tokenizer
+
+    # Load and preprocess the dataset for training and validation
 
-     # Load and preprocess the dataset for training and validation
-    # dataset_train = get_preprocessed_dataset(
-    #     processor,
-    #     dataset_config,
-    #     split="train",
-    # )
     dataset_train = get_preprocessed_dataset(
-        processor,
+        dataset_processer,
         dataset_config,
         split="train",
     )
     if not train_config.enable_fsdp or rank == 0:
         print(f"--> Training Set Length = {len(dataset_train)}")
 
     dataset_val = get_preprocessed_dataset(
-        processor,
+        dataset_processer,
         dataset_config,
         split="test",
     )
diff --git a/src/llama_recipes/utils/config_utils.py b/src/llama_recipes/utils/config_utils.py
@@ -75,7 +75,7 @@ def generate_dataset_config(train_config, kwargs):
     return  dataset_config
 
 
-def get_dataloader_kwargs(train_config, dataset, tokenizer, mode):
+def get_dataloader_kwargs(train_config, dataset, tokenizer, mode,collate_fn=None):
         kwargs = {}
         batch_size = train_config.batch_size_training if mode=="train" else train_config.val_batch_size
         if train_config.batching_strategy == "padding":
@@ -89,7 +89,10 @@ def get_dataloader_kwargs(train_config, dataset, tokenizer, mode):
                 )
             else:
                 kwargs["batch_sampler"] = LengthBasedBatchSampler(dataset, batch_size, drop_last=True, shuffle=mode=="train")
-            kwargs["collate_fn"] = DataCollatorForSeq2Seq(tokenizer)
+            if collate_fn is not None:  # prefer the caller-supplied collator
+                kwargs["collate_fn"] = collate_fn
+            else:  # no custom collator given: keep the default seq2seq padding collator
+                kwargs["collate_fn"] = DataCollatorForSeq2Seq(tokenizer)
         elif train_config.batching_strategy == "packing":
             if train_config.enable_fsdp:
                 kwargs["sampler"] = DistributedSampler(
diff --git a/src/llama_recipes/utils/train_utils.py b/src/llama_recipes/utils/train_utils.py
@@ -154,6 +154,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     with autocast():
                         assert(next(model.parameters()).device == batch['input_ids'].device)
                         #print("batch: ", batch)
+                        if 'pixel_values' in batch:  # debug aid; text-only batches have no image tensors
+                            print("pixel_values.shape input", batch['pixel_values'].shape)
                         loss = model(**batch).loss
                     loss = loss / gradient_accumulation_steps
                     #print("loss",loss)