Skip to content

Commit 5fc6ec6

Browse files
Refine hyperparameter suggestions in training script
Update values for new run.
1 parent 1953892 commit 5fc6ec6

File tree

1 file changed

+25
-25
lines changed

1 file changed

+25
-25
lines changed

llm_train_hpo_script.py

Lines changed: 25 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -96,23 +96,23 @@ def objective(trial):
9696
MAX_NEW_TOKENS = MAX_SEQ_LENGTH - GENERATION_PROMPT_LEN
9797

9898
# General Model & Training Params
99-
POSITIONAL_EMBEDDING_DROPOUT = trial.suggest_float("POSITIONAL_EMBEDDING_DROPOUT", low=0.0, high=0.99)
100-
activation = trial.suggest_categorical("activation", ["softplus", 'relu']) # ['softplus', 'relu', 'gelu', 'swish'])
99+
POSITIONAL_EMBEDDING_DROPOUT = trial.suggest_float("POSITIONAL_EMBEDDING_DROPOUT", low=0.0, high=0.4)
100+
activation = "softplus" # trial.suggest_categorical("activation", ["softplus", 'relu']) # ['softplus', 'relu', 'gelu', 'swish'])
101101
predecessor_level_connection_affinity_factor_first = trial.suggest_float(
102-
"predecessor_level_connection_affinity_factor_first", low=7, high=30.0)
103-
predecessor_level_connection_affinity_factor_main = trial.suggest_float(
104-
"predecessor_level_connection_affinity_factor_main", low=7, high=25)
105-
max_consecutive_lateral_connections = trial.suggest_int("max_consecutive_lateral_connections", low=3, high=11)
106-
p_lateral_connection = trial.suggest_float("p_lateral_connection", low=0.5089543226843299, high=0.6600677622410013)
107-
num_lateral_connection_tries_per_unit = trial.suggest_int("num_lateral_connection_tries_per_unit", low=10, high=32)
108-
learning_rate = trial.suggest_float("learning_rate", low=0.0004407017676344875, high=0.0006456391530117352)
109-
epochs = trial.suggest_int("epochs", low=71, high=120)
102+
"predecessor_level_connection_affinity_factor_first", low=11.75, high=30.0)
103+
predecessor_level_connection_affinity_factor_main = 12.45 # trial.suggest_float(
104+
# "predecessor_level_connection_affinity_factor_main", low=7, high=25)
105+
max_consecutive_lateral_connections = trial.suggest_int("max_consecutive_lateral_connections", low=6, high=9)
106+
p_lateral_connection = 0.628396083507019 # trial.suggest_float("p_lateral_connection", low=0.5089543226843299, high=0.6600677622410013)
107+
num_lateral_connection_tries_per_unit = trial.suggest_int("num_lateral_connection_tries_per_unit", low=18, high=30)
108+
learning_rate = 0.000474 # trial.suggest_float("learning_rate", low=0.0004407017676344875, high=0.0006456391530117352)
109+
epochs = trial.suggest_int("epochs", low=105, high=130)
110110
batch_size = 20 # trial.suggest_categorical("batch_size", [10, 15, 20])
111111
gradient_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", low=3, high=5)
112112
minimum_levels = 2
113-
maximum_levels = trial.suggest_int("maximum_levels", low=minimum_levels, high=3)
113+
maximum_levels = 2 # trial.suggest_int("maximum_levels", low=minimum_levels, high=3)
114114
minimum_units_per_level = 2
115-
maximum_units_per_level = trial.suggest_int("maximum_units_per_level", low=minimum_units_per_level, high=2)
115+
maximum_units_per_level = 2 # trial.suggest_int("maximum_units_per_level", low=minimum_units_per_level, high=2)
116116
minimum_neurons_per_unit = 2
117117
maximum_neurons_per_unit = trial.suggest_int("maximum_neurons_per_unit", low=minimum_neurons_per_unit, high=4)
118118

@@ -126,36 +126,36 @@ def objective(trial):
126126

127127
# Tokenization & Embedding Params
128128
tokenizer_checkpoint = "HuggingFaceTB/SmolLM3-3B" # Fixed value
129-
EMBEDDING_N = trial.suggest_int("EMBEDDING_N", low=6, high=8)
129+
EMBEDDING_N = trial.suggest_int("EMBEDDING_N", low=6, high=7)
130130

131131
# --- Derived Parameters ---
132132
# These depend on other parameters and are calculated after suggestion.
133133
EMBEDDING_DIM = int(EMBEDDING_N * 2)
134134

135135
# Attention Block Constants
136136
K_PROJ_CHUNKED = trial.suggest_categorical("K_PROJ_CHUNKED", [4, 5, 8, 10])
137-
DFF_CHUNKED = trial.suggest_int("DFF_CHUNKED", low=5, high=EMBEDDING_DIM) # EMBEDDING_DIM # Derived from EMBEDDING_DIM
138-
DROPOUT_RATE_CHUNKED = trial.suggest_float("DROPOUT_RATE_CHUNKED", low=0.0, high=0.3000892345724095)
137+
DFF_CHUNKED = trial.suggest_int("DFF_CHUNKED", low=8, high=14)
138+
DROPOUT_RATE_CHUNKED = 0.05258 # trial.suggest_float("DROPOUT_RATE_CHUNKED", low=0.0, high=0.3000892345724095)
139139

140140
# Mamba Block Constants
141-
MAMBA_D_STATE = trial.suggest_int("MAMBA_D_STATE", low=12, high=40) # Assuming range, not just 1 or 40
141+
MAMBA_D_STATE = trial.suggest_int("MAMBA_D_STATE", low=13, high=25)
142142
MAMBA_D_CONV = trial.suggest_int("MAMBA_D_CONV", low=3, high=5)
143143
MAMBA_EXPAND = trial.suggest_categorical("MAMBA_EXPAND", [2, 4])
144-
MAMBA_DROPOUT = trial.suggest_float("MAMBA_DROPOUT", low=0.0, high=0.25)
144+
MAMBA_DROPOUT = 0.0765 # trial.suggest_float("MAMBA_DROPOUT", low=0.0, high=0.25)
145145

146146
# VoxelAttentionLayer Constants
147-
VOXEL_MAX_GRID_SIZE = trial.suggest_int("VOXEL_MAX_GRID_SIZE", low=4, high=7)
148-
VOXEL_CA_STEPS = trial.suggest_int("VOXEL_CA_STEPS", low=2, high=3)
149-
VOXEL_DROPOUT = trial.suggest_float("VOXEL_DROPOUT", low=0.0, high=0.6)
147+
VOXEL_MAX_GRID_SIZE = 7 # trial.suggest_int("VOXEL_MAX_GRID_SIZE", low=4, high=7)
148+
VOXEL_CA_STEPS = 2 # trial.suggest_int("VOXEL_CA_STEPS", low=2, high=3)
149+
VOXEL_DROPOUT = trial.suggest_float("VOXEL_DROPOUT", low=0.0, high=0.4)
150150

151151
# Linformer Block Constants
152-
LINFORMER_K_PROJ = trial.suggest_int("LINFORMER_K_PROJ", low=8, high=14)
153-
LINFORMER_DFF = trial.suggest_int("LINFORMER_DFF", low=39, high=60)
154-
LINFORMER_DROPOUT = trial.suggest_float("LINFORMER_DROPOUT", low=0.0, high=0.4115802273193311)
155-
LINFORMER_FFN_DROPOUT = trial.suggest_float("LINFORMER_FFN_DROPOUT", low=0.0, high=0.542531687435192)
152+
LINFORMER_K_PROJ = 11 # trial.suggest_int("LINFORMER_K_PROJ", low=8, high=14)
153+
LINFORMER_DFF = 42 # trial.suggest_int("LINFORMER_DFF", low=39, high=60)
154+
LINFORMER_DROPOUT = trial.suggest_float("LINFORMER_DROPOUT", low=0.0, high=0.333511123637477)
155+
LINFORMER_FFN_DROPOUT = trial.suggest_float("LINFORMER_FFN_DROPOUT", low=0.0, high=0.54)
156156

157157
# Adapter Block Constants
158-
ADAPTER_DROPOUT = trial.suggest_float("ADAPTER_DROPOUT", low=0.0, high=0.8856103097594802)
158+
ADAPTER_DROPOUT = trial.suggest_float("ADAPTER_DROPOUT", low=0.0, high=0.45)
159159

160160
# Tokenization
161161

0 commit comments

Comments
 (0)