23
23
# TEXT: overview of LLM lab
24
24
# Load pretrained LLM (medium size model)
25
25
26
- model_name = "facebook/opt-125m"
27
26
# model_name = "facebook/opt-1.3b"
27
+ model_name = "facebook/opt-125m"
28
28
# had to load the non-TF version of the model to run the benchmarking code
29
29
model = transformers .AutoModelForCausalLM .from_pretrained (model_name , device_map = "auto" )
30
30
tokenizer = transformers .AutoTokenizer .from_pretrained (model_name )
@@ -66,7 +66,7 @@ def generate(start_text, model, tokenizer, num_steps=20, temp=1.):
66
66
# TEXT: some background on LLM benchmarking
67
67
# Load benchmark dataset and evaluate model
68
68
benchmark_dataset = pd .read_csv ("benchmark.csv" )
69
- # category_accs_1300m, avg_acc_1300m = run_benchmark(model, tokenizer, benchmark_dataset)
69
+ category_accs_1300m , avg_acc_1300m = run_benchmark (model , tokenizer , benchmark_dataset )
70
70
71
71
# TEXT: ask them to make a prediction on how accuracy will be affected by different model sizes
72
72
@@ -87,17 +87,18 @@ def generate(start_text, model, tokenizer, num_steps=20, temp=1.):
87
87
# Spider plot
88
88
89
89
# benchmark_data = {"350M-Model": category_accs_350m, "1300M-Model": category_accs_1300m, "2700M-Model": category_accs_2700m}
90
+ # benchmark_data = {"350M-Model": category_accs_1300m}
90
91
# make_spider_plot(benchmark_data)
91
92
92
93
# Part 2
93
94
94
- def count_grad_parameters (model ):
95
- return sum (p .numel () for p in model .parameters () if p .requires_grad )
96
-
97
95
# inspect current model
98
96
# print(model)
99
- first_lin_layer = model .model .decoder .layers [0 ].self_attn .k_proj
100
- print (count_grad_parameters (model ))
97
+ print (sum (p .numel () for p in model .parameters () if p .requires_grad ))
98
+
99
+ # freeze all parameters by turning off requires_grad
100
+ for param in model .parameters ():
101
+ param .requires_grad = False
101
102
102
103
# new LoRA linear layer class
103
104
class LoRALinear (nn .Module ):
@@ -108,7 +109,8 @@ def __init__(
108
109
pretrained_weight : torch .Tensor ,
109
110
pretrained_bias : torch .Tensor ,
110
111
r : int = 8 ,
111
- lora_alpha : int = 1 ,
112
+ lora_alpha : int = 8 ,
113
+ lora_dropout : float = 0.1 ,
112
114
** kwargs
113
115
):
114
116
super (LoRALinear , self ).__init__ ()
@@ -121,17 +123,21 @@ def __init__(
121
123
self .weight = nn .Parameter (pretrained_weight )
122
124
self .weight .requires_grad = False
123
125
124
- self .bias = nn .Parameter (pretrained_bias )
125
- self .bias .requires_grad = False
126
+ if pretrained_bias is not None :
127
+ self .bias = nn .Parameter (pretrained_bias )
128
+ self .bias .requires_grad = False
129
+ else :
130
+ self .bias = None
126
131
127
132
# from https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
128
133
self .lora_A = nn .Parameter (self .weight .new_zeros ((r , in_features )))
129
134
self .lora_B = nn .Parameter (self .weight .new_zeros ((out_features , r )))
130
135
self .scaling = self .lora_alpha / self .r
136
+ self .lora_dropout = nn .Dropout (p = lora_dropout )
131
137
132
138
def forward (self , x : torch .Tensor ):
133
- result = F .linear (x , self .weight , bias = self .bias )
134
- result += self .lora_A .transpose (0 , 1 ) @ self .lora_B .transpose (0 , 1 ) * self .scaling
139
+ result = F .linear (x , self .weight , bias = self .bias )
140
+ result += ( self .lora_dropout ( x ) @ self . lora_A .transpose (0 , 1 ) @ self .lora_B .transpose (0 , 1 ) ) * self .scaling
135
141
return result
136
142
137
143
# replace linear layers in model recursively
@@ -144,10 +150,10 @@ def replace_linear_with_lora(module):
144
150
145
151
replace_linear_with_lora (model )
146
152
153
+ print (sum (p .numel () for p in model .parameters () if p .requires_grad ))
154
+
147
155
# inspect new model
148
- first_lin_layer = model .model .decoder .layers [0 ].self_attn .k_proj
149
- print (count_grad_parameters (model ))
150
- exit ()
156
+ # print(model)
151
157
152
158
# load chat dataset
153
159
dataset_name = "timdettmers/openassistant-guanaco"
@@ -206,4 +212,5 @@ def replace_linear_with_lora(module):
206
212
207
213
# add to spider plot
208
214
# benchmark_data = {"350M-Model": category_accs_350m, "1300M-Model": category_accs_1300m, "1300M-Model-Finetuned": category_accs_1300m_ft, "2700M-Model": category_accs_2700m}
209
- # make_spider_plot(benchmark_data)
215
+ benchmark_data = {"350M-Model" : category_accs_1300m , "350M-Model-Finetuned" : category_accs_1300m_ft }
216
+ make_spider_plot (benchmark_data )
0 commit comments