fixed weight preservation + half of train loop

divnori · divnori · commit 0339bb51ff7a · 2024-01-05T18:38:50.000-05:00
diff --git a/xtra_labs/llm_finetune/draft.py b/xtra_labs/llm_finetune/draft.py
@@ -61,40 +61,41 @@ def generate(start_text, model, tokenizer, num_steps=20, temp=1.):
 # TEXT: some background on LLM benchmarking
 # Load benchmark dataset and evaluate model
 dataset = pd.read_csv("benchmark.csv")
-category_accs_1300m, avg_acc_1300m = run_benchmark(model, tokenizer, dataset)
+# category_accs_1300m, avg_acc_1300m = run_benchmark(model, tokenizer, dataset)
 
 # TEXT: ask them to make a prediction on how accuracy will be affected by different model sizes
 
 # Benchmark smaller model
-model_name_350m = "facebook/opt-350m" 
-model_350m = transformers.AutoModelForCausalLM.from_pretrained(model_name_350m, device_map="auto")
-tokenizer_350m = transformers.AutoTokenizer.from_pretrained(model_name_350m)
+# model_name_350m = "facebook/opt-350m" 
+# model_350m = transformers.AutoModelForCausalLM.from_pretrained(model_name_350m, device_map="auto")
+# tokenizer_350m = transformers.AutoTokenizer.from_pretrained(model_name_350m)
 
-category_accs_350m, avg_acc_350m = run_benchmark(model_350m, tokenizer_350m, dataset)
+# category_accs_350m, avg_acc_350m = run_benchmark(model_350m, tokenizer_350m, dataset)
 
 # Benchmark larger model
-model_name_2700m = "facebook/opt-2.7b" 
-model_2700m = transformers.AutoModelForCausalLM.from_pretrained(model_name_2700m, device_map="auto")
-tokenizer_2700m = transformers.AutoTokenizer.from_pretrained(model_name_2700m)
+# model_name_2700m = "facebook/opt-2.7b" 
+# model_2700m = transformers.AutoModelForCausalLM.from_pretrained(model_name_2700m, device_map="auto")
+# tokenizer_2700m = transformers.AutoTokenizer.from_pretrained(model_name_2700m)
 
-category_accs_2700m, avg_acc_2700m = run_benchmark(model_2700m, tokenizer_2700m, dataset)
+# category_accs_2700m, avg_acc_2700m = run_benchmark(model_2700m, tokenizer_2700m, dataset)
 
 # Spider plot
 
-benchmark_data = {"350M-Model": category_accs_350m, "1300M-Model": category_accs_1300m, "2700M-Model": category_accs_2700m}
-make_spider_plot(benchmark_data)
+# benchmark_data = {"350M-Model": category_accs_350m, "1300M-Model": category_accs_1300m, "2700M-Model": category_accs_2700m}
+# make_spider_plot(benchmark_data)
 
 # Part 2
 
 # inspect current model
-print(model)
+# print(model)
 
 # new LoRA linear layer class
 class LoRALinear(nn.Linear):
     def __init__(
             self, 
             in_features: int,
             out_features: int,
+            pretrained_weight: torch.Tensor,
             r: int = 8,
             lora_alpha: int = 1,
             **kwargs
@@ -105,6 +106,7 @@ def __init__(
         self.lora_alpha = lora_alpha
 
         nn.Linear.__init__(self, in_features, out_features, **kwargs)
+        self.weight.data = pretrained_weight
 
         # from https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
         if r > 0:
@@ -113,16 +115,6 @@ def __init__(
             self.scaling = self.lora_alpha / self.r
             # Freezing the pre-trained weight matrix
             self.weight.requires_grad = False
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        # from https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
-        nn.Linear.reset_parameters(self)
-        if hasattr(self, 'lora_A'):
-            # initialize B the same way as the default for nn.Linear and A to zero
-            # this is different than what is described in the paper but should not affect performance
-            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
-            nn.init.zeros_(self.lora_B)
 
     def forward(self, x: torch.Tensor):
         if self.r > 0:
@@ -136,48 +128,36 @@ def forward(self, x: torch.Tensor):
 def replace_linear_with_lora(module):
     for name, child in module.named_children():
         if isinstance(child, nn.Linear):
-            setattr(module, name, LoRALinear(child.in_features, child.out_features))
+            setattr(module, name, LoRALinear(child.in_features, child.out_features, child.weight))
+            break
         else:
             replace_linear_with_lora(child)
 
 replace_linear_with_lora(model)
 
 # inspect new model
-print(model)
+# print(model)
 
 # load chat dataset
 dataset_name = "timdettmers/openassistant-guanaco"
 ft_dataset = load_dataset(dataset_name, split="train")
 
-# train model
-log_dir = "/scratch/checkpoints/test-sft/opt1.3b_768/"
+# train model (barebones loop)
 batch_size = 4
 context_length = 768
-args = transformers.TrainingArguments(log_dir, 
-    per_device_train_batch_size=batch_size, 
-    logging_first_step=True,
-    logging_steps=20,
-    save_steps=100,
-)
-
-class PrinterCallback(transformers.TrainerCallback):
-    def on_log(self, args, state, control, model, logs=None, **kwargs):
-        start_text = "### Human: When the weather is sunny, what color is the sky?### Assistant:"
-        generate(start_text, model, tokenizer, num_steps=200, until="###")
-
-trainer = SFTTrainer(
-    model,
-    args=args,
-    train_dataset=ft_dataset,
-    dataset_text_field="text",
-    max_seq_length=context_length,
-    callbacks=[PrinterCallback()]
-)
-trainer.train()
+
+model = model.to("cuda")
+for batch in ft_dataset:
+    prompt = batch["text"]
+    encoding = tokenizer(prompt)
+    input_ids = torch.IntTensor(encoding["input_ids"]).to("cuda").unsqueeze(0)
+    attention_mask = torch.Tensor(encoding["attention_mask"]).to("cuda").unsqueeze(0)
+    outputs = model(input_ids, attention_mask)
+    
 
 # evaluate finetuned model on benchmark
 category_accs_1300m_ft, avg_acc_1300m_ft = run_benchmark(model, tokenizer, dataset)
 
 # add to spider plot 
-benchmark_data = {"350M-Model": category_accs_350m, "1300M-Model": category_accs_1300m, "1300M-Model-Finetuned": category_accs_1300m_ft, "2700M-Model": category_accs_2700m}
-make_spider_plot(benchmark_data)
+# benchmark_data = {"350M-Model": category_accs_350m, "1300M-Model": category_accs_1300m, "1300M-Model-Finetuned": category_accs_1300m_ft, "2700M-Model": category_accs_2700m}
+# make_spider_plot(benchmark_data)