@@ -96,23 +96,23 @@ def objective(trial):
9696 MAX_NEW_TOKENS = MAX_SEQ_LENGTH - GENERATION_PROMPT_LEN
9797
9898 # General Model & Training Params
99- POSITIONAL_EMBEDDING_DROPOUT = trial .suggest_float ("POSITIONAL_EMBEDDING_DROPOUT" , low = 0.0 , high = 0.99 )
100- activation = trial .suggest_categorical ("activation" , ["softplus" , 'relu' ]) # ['softplus', 'relu', 'gelu', 'swish'])
99+ POSITIONAL_EMBEDDING_DROPOUT = trial .suggest_float ("POSITIONAL_EMBEDDING_DROPOUT" , low = 0.0 , high = 0.4 )
100+ activation = "softplus" # trial.suggest_categorical("activation", ["softplus", 'relu']) # ['softplus', 'relu', 'gelu', 'swish'])
101101 predecessor_level_connection_affinity_factor_first = trial .suggest_float (
102- "predecessor_level_connection_affinity_factor_first" , low = 7 , high = 30.0 )
103- predecessor_level_connection_affinity_factor_main = trial .suggest_float (
104- "predecessor_level_connection_affinity_factor_main" , low = 7 , high = 25 )
105- max_consecutive_lateral_connections = trial .suggest_int ("max_consecutive_lateral_connections" , low = 3 , high = 11 )
106- p_lateral_connection = trial .suggest_float ("p_lateral_connection" , low = 0.5089543226843299 , high = 0.6600677622410013 )
107- num_lateral_connection_tries_per_unit = trial .suggest_int ("num_lateral_connection_tries_per_unit" , low = 10 , high = 32 )
108- learning_rate = trial .suggest_float ("learning_rate" , low = 0.0004407017676344875 , high = 0.0006456391530117352 )
109- epochs = trial .suggest_int ("epochs" , low = 71 , high = 120 )
102+ "predecessor_level_connection_affinity_factor_first" , low = 11.75 , high = 30.0 )
103+ predecessor_level_connection_affinity_factor_main = 12.45 # trial.suggest_float(
104+ # "predecessor_level_connection_affinity_factor_main", low=7, high=25)
105+ max_consecutive_lateral_connections = trial .suggest_int ("max_consecutive_lateral_connections" , low = 6 , high = 9 )
106+ p_lateral_connection = 0.628396083507019 # trial.suggest_float("p_lateral_connection", low=0.5089543226843299, high=0.6600677622410013)
107+ num_lateral_connection_tries_per_unit = trial .suggest_int ("num_lateral_connection_tries_per_unit" , low = 18 , high = 30 )
108+ learning_rate = 0.000474 # trial.suggest_float("learning_rate", low=0.0004407017676344875, high=0.0006456391530117352)
109+ epochs = trial .suggest_int ("epochs" , low = 105 , high = 130 )
110110 batch_size = 20 # trial.suggest_categorical("batch_size", [10, 15, 20])
111111 gradient_accumulation_steps = trial .suggest_int ("gradient_accumulation_steps" , low = 3 , high = 5 )
112112 minimum_levels = 2
113- maximum_levels = trial .suggest_int ("maximum_levels" , low = minimum_levels , high = 3 )
113+ maximum_levels = 2 # trial.suggest_int("maximum_levels", low=minimum_levels, high=3)
114114 minimum_units_per_level = 2
115- maximum_units_per_level = trial .suggest_int ("maximum_units_per_level" , low = minimum_units_per_level , high = 2 )
115+ maximum_units_per_level = 2 # trial.suggest_int("maximum_units_per_level", low=minimum_units_per_level, high=2)
116116 minimum_neurons_per_unit = 2
117117 maximum_neurons_per_unit = trial .suggest_int ("maximum_neurons_per_unit" , low = minimum_neurons_per_unit , high = 4 )
118118
@@ -126,36 +126,36 @@ def objective(trial):
126126
127127 # Tokenization & Embedding Params
128128 tokenizer_checkpoint = "HuggingFaceTB/SmolLM3-3B" # Fixed value
129- EMBEDDING_N = trial .suggest_int ("EMBEDDING_N" , low = 6 , high = 8 )
129+ EMBEDDING_N = trial .suggest_int ("EMBEDDING_N" , low = 6 , high = 7 )
130130
131131 # --- Derived Parameters ---
132132 # These depend on other parameters and are calculated after suggestion.
133133 EMBEDDING_DIM = int (EMBEDDING_N * 2 )
134134
135135 # Attention Block Constants
136136 K_PROJ_CHUNKED = trial .suggest_categorical ("K_PROJ_CHUNKED" , [4 , 5 , 8 , 10 ])
137- DFF_CHUNKED = trial .suggest_int ("DFF_CHUNKED" , low = 5 , high = EMBEDDING_DIM ) # EMBEDDING_DIM # Derived from EMBEDDING_DIM
138- DROPOUT_RATE_CHUNKED = trial .suggest_float ("DROPOUT_RATE_CHUNKED" , low = 0.0 , high = 0.3000892345724095 )
137+ DFF_CHUNKED = trial .suggest_int ("DFF_CHUNKED" , low = 8 , high = 14 )
138+ DROPOUT_RATE_CHUNKED = 0.05258 # trial.suggest_float("DROPOUT_RATE_CHUNKED", low=0.0, high=0.3000892345724095)
139139
140140 # Mamba Block Constants
141- MAMBA_D_STATE = trial .suggest_int ("MAMBA_D_STATE" , low = 12 , high = 40 ) # Assuming range, not just 1 or 40
141+ MAMBA_D_STATE = trial .suggest_int ("MAMBA_D_STATE" , low = 13 , high = 25 )
142142 MAMBA_D_CONV = trial .suggest_int ("MAMBA_D_CONV" , low = 3 , high = 5 )
143143 MAMBA_EXPAND = trial .suggest_categorical ("MAMBA_EXPAND" , [2 , 4 ])
144- MAMBA_DROPOUT = trial .suggest_float ("MAMBA_DROPOUT" , low = 0.0 , high = 0.25 )
144+ MAMBA_DROPOUT = 0.0765 # trial.suggest_float("MAMBA_DROPOUT", low=0.0, high=0.25)
145145
146146 # VoxelAttentionLayer Constants
147- VOXEL_MAX_GRID_SIZE = trial .suggest_int ("VOXEL_MAX_GRID_SIZE" , low = 4 , high = 7 )
148- VOXEL_CA_STEPS = trial .suggest_int ("VOXEL_CA_STEPS" , low = 2 , high = 3 )
149- VOXEL_DROPOUT = trial .suggest_float ("VOXEL_DROPOUT" , low = 0.0 , high = 0.6 )
147+ VOXEL_MAX_GRID_SIZE = 7 # trial.suggest_int("VOXEL_MAX_GRID_SIZE", low=4, high=7)
148+ VOXEL_CA_STEPS = 2 # trial.suggest_int("VOXEL_CA_STEPS", low=2, high=3)
149+ VOXEL_DROPOUT = trial .suggest_float ("VOXEL_DROPOUT" , low = 0.0 , high = 0.4 )
150150
151151 # Linformer Block Constants
152- LINFORMER_K_PROJ = trial .suggest_int ("LINFORMER_K_PROJ" , low = 8 , high = 14 )
153- LINFORMER_DFF = trial .suggest_int ("LINFORMER_DFF" , low = 39 , high = 60 )
154- LINFORMER_DROPOUT = trial .suggest_float ("LINFORMER_DROPOUT" , low = 0.0 , high = 0.4115802273193311 )
155- LINFORMER_FFN_DROPOUT = trial .suggest_float ("LINFORMER_FFN_DROPOUT" , low = 0.0 , high = 0.542531687435192 )
152+ LINFORMER_K_PROJ = 11 # trial.suggest_int("LINFORMER_K_PROJ", low=8, high=14)
153+ LINFORMER_DFF = 42 # trial.suggest_int("LINFORMER_DFF", low=39, high=60)
154+ LINFORMER_DROPOUT = trial .suggest_float ("LINFORMER_DROPOUT" , low = 0.0 , high = 0.333511123637477 )
155+ LINFORMER_FFN_DROPOUT = trial .suggest_float ("LINFORMER_FFN_DROPOUT" , low = 0.0 , high = 0.54 )
156156
157157 # Adapter Block Constants
158- ADAPTER_DROPOUT = trial .suggest_float ("ADAPTER_DROPOUT" , low = 0.0 , high = 0.8856103097594802 )
158+ ADAPTER_DROPOUT = trial .suggest_float ("ADAPTER_DROPOUT" , low = 0.0 , high = 0.45 )
159159
160160 # Tokenization
161161
0 commit comments