Skip to content

Commit 4661acb

Browse files
Update train_a_generative_llm.py
Restore again.
1 parent 255cf6c commit 4661acb

File tree

1 file changed

+9
-9
lines changed

1 file changed

+9
-9
lines changed

train_a_generative_llm.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@
126126
num_lateral_connection_tries_per_unit = 32
127127

128128
# The learning rate for Stage I-a
129-
learning_rate = 0.001 # 0.003025583248301791
129+
learning_rate = 0.003025583248301791
130130

131131
# Number of epochs for Training Stage I-a
132132
epochs = 41
@@ -157,7 +157,7 @@
157157
## Training Stage I-b parameters: ###
158158

159159
# LR Scheduler for training stage I-b
160-
INITIAL_LR_STAGE_I_B = 0.001 # 0.0039295722955565125
160+
INITIAL_LR_STAGE_I_B = 0.0039295722955565125
161161

162162
# A fixed number for the initial warmup
163163
WARMUP_EPOCHS_STAGE_I_B = 7
@@ -203,24 +203,24 @@
203203
##### Attention blocks' and attention mimetic blocks' constants: #######
204204

205205
# --- SingleHeadChunkedAttention Block Constants ---
206-
K_PROJ_CHUNKED = 8
206+
K_PROJ_CHUNKED = 5
207207
DFF_CHUNKED = EMBEDDING_DIM # Can be tuned independently, but likely to coincide.
208-
DROPOUT_RATE_CHUNKED = 0.05
208+
DROPOUT_RATE_CHUNKED = 0.1
209209

210210
# --- MAMBA Block Constants ---
211-
MAMBA_D_STATE = 8
212-
MAMBA_D_CONV = 3
211+
MAMBA_D_STATE = 12
212+
MAMBA_D_CONV = 4
213213
MAMBA_EXPAND = 2
214214
MAMBA_DROPOUT = 0.05
215215

216216
# --- VoxelAttentionLayer Constants ---
217-
VOXEL_MAX_GRID_SIZE = 8
217+
VOXEL_MAX_GRID_SIZE = 5
218218
VOXEL_CA_STEPS = 3
219219
VOXEL_DROPOUT = 0.1
220220

221221
# --- Linformer Block Constants (Adjusted for tiny model) ---
222-
LINFORMER_K_PROJ = 8
223-
LINFORMER_DFF = 32
222+
LINFORMER_K_PROJ = 16
223+
LINFORMER_DFF = 64
224224
LINFORMER_DROPOUT = 0.05
225225
LINFORMER_FFN_DROPOUT = 0.05
226226

0 commit comments

Comments (0)