@@ -153,8 +153,7 @@ def __init__(
                 self.q_lora_rank,
                 bias=False,
                 quant_config=quant_config,
-                prefix=f"{prefix}.q_a_proj",
-                return_bias=False,
+                prefix=f"{prefix}.q_a_proj"
             )
             self.q_a_layernorm = RMSNorm(self.q_lora_rank,
                                          eps=config.rms_norm_eps)
@@ -163,26 +162,23 @@ def __init__(
                 self.num_heads * self.qk_head_dim,
                 bias=False,
                 quant_config=quant_config,
-                prefix=f"{prefix}.q_b_proj",
-                return_bias=False,
+                prefix=f"{prefix}.q_b_proj"
             )
         else:
             self.q_proj = ColumnParallelLinear(
                 self.hidden_size,
                 self.num_heads * self.qk_head_dim,
                 bias=False,
                 quant_config=quant_config,
-                prefix=f"{prefix}.q_proj",
-                return_bias=False,
+                prefix=f"{prefix}.q_proj"
             )

         self.kv_a_proj_with_mqa = ReplicatedLinear(
             self.hidden_size,
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.kv_a_proj_with_mqa",
-            return_bias=False,
+            prefix=f"{prefix}.kv_a_proj_with_mqa"
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
                                       eps=config.rms_norm_eps)
@@ -191,16 +187,14 @@ def __init__(
             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.kv_b_proj",
-            return_bias=False,
+            prefix=f"{prefix}.kv_b_proj"
         )
         self.o_proj = CustomDeepseekV2RowParallelLinear(
             self.num_heads * self.v_head_dim,
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.o_proj",
-            return_bias=False,
+            prefix=f"{prefix}.o_proj"
         )

         if rope_scaling:
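
Context note (not part of this commit): with return_bias=False no longer passed, vLLM's parallel/replicated linear layers fall back to their default behavior of returning an (output, output_bias) tuple from forward(), so the call sites are expected to unpack the tuple. A minimal hypothetical sketch, assuming the same attribute names as in the diff above:

        # Hypothetical call-site sketch, not from this commit: unpack the
        # (output, output_bias) tuple returned by the linear layers; the bias
        # slot is None here because the layers are built with bias=False.
        if self.q_lora_rank is not None:
            q_c, _ = self.q_a_proj(hidden_states)
            q, _ = self.q_b_proj(self.q_a_layernorm(q_c))
        else:
            q, _ = self.q_proj(hidden_states)
        kv_c_k_pe, _ = self.kv_a_proj_with_mqa(hidden_states)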