File tree Expand file tree Collapse file tree 1 file changed +9
-9
lines changed
Expand file tree Collapse file tree 1 file changed +9
-9
lines changed Original file line number Diff line number Diff line change 126126num_lateral_connection_tries_per_unit = 32
127127
128128# The learning rate for Stage I-a
129- learning_rate = 0.003025583248301791
129+ learning_rate = 0.001 # 0.003025583248301791
130130
131131# Number of epochs for Training Stage I-a
132132epochs = 41
157157## Training Stage I-b parameters: ###
158158
159159# LR Scheduler for training stage I-b
160- INITIAL_LR_STAGE_I_B = 0.0039295722955565125
160+ INITIAL_LR_STAGE_I_B = 0.001 # 0.0039295722955565125
161161
162162# A fixed number for the initial warmup
163163WARMUP_EPOCHS_STAGE_I_B = 7
203203##### Attention blocks' and attention mimetic blocks' constants: #######
204204
205205# --- SingleHeadChunkedAttention Block Constants ---
206- K_PROJ_CHUNKED = 5
206+ K_PROJ_CHUNKED = 8
207207DFF_CHUNKED = EMBEDDING_DIM # Can be tuned independently, but likely to coincide.
208- DROPOUT_RATE_CHUNKED = 0.1
208+ DROPOUT_RATE_CHUNKED = 0.05
209209
210210# --- MAMBA Block Constants ---
211- MAMBA_D_STATE = 12
212- MAMBA_D_CONV = 4
211+ MAMBA_D_STATE = 8
212+ MAMBA_D_CONV = 3
213213MAMBA_EXPAND = 2
214214MAMBA_DROPOUT = 0.05
215215
216216# --- VoxelAttentionLayer Constants ---
217- VOXEL_MAX_GRID_SIZE = 5
217+ VOXEL_MAX_GRID_SIZE = 8
218218VOXEL_CA_STEPS = 3
219219VOXEL_DROPOUT = 0.1
220220
221221# --- Linformer Block Constants (Adjusted for tiny model) ---
222- LINFORMER_K_PROJ = 16
223- LINFORMER_DFF = 64
222+ LINFORMER_K_PROJ = 8
223+ LINFORMER_DFF = 32
224224LINFORMER_DROPOUT = 0.05
225225LINFORMER_FFN_DROPOUT = 0.05
226226
You can’t perform that action at this time.
0 commit comments