1111from lume_model .models .ensemble import NNEnsemble
1212from lume_model .models .gp_model import GPModel
1313from trame .widgets import vuetify3 as vuetify
14- from utils import verify_input_variables , timer
14+ from utils import verify_input_variables , timer , load_config_dict
1515from error_manager import add_error
1616from sfapi_manager import monitor_sfapi_job
1717from state_manager import state
@@ -190,8 +190,23 @@ async def training_kernel(self):
190190 client_id = state .sfapi_client_id , secret = state .sfapi_key
191191 ) as client :
192192 perlmutter = await client .compute (Machine .perlmutter )
193+ # Upload the config.yaml to NERSC
194+ config_dict = load_config_dict (state .experiment )
195+ config_dict ["simulation_calibration" ] = state .simulation_calibration
196+ target_path = "/global/cfs/cdirs/m558/superfacility/model_training"
197+ [target_path ] = await perlmutter .ls (target_path , directory = True )
198+ with tempfile .TemporaryDirectory () as temp_dir :
199+ temp_file_path = Path (temp_dir ) / "config.yaml"
200+ with open (temp_file_path , "w" ) as temp_file :
201+ yaml .dump (config_dict , temp_file )
202+ temp_file .flush ()
203+ with open (temp_file_path , "rb" ) as temp_file :
204+ print ("Uploading config file to NERSC" )
205+ temp_file .filename = "config.yaml"
206+ await target_path .upload (temp_file )
207+
193208 # set the path of the script used to submit the training job on NERSC
194- script_job = None
209+ training_script = None
195210 # multiple locations supported, to make development easier
196211 # container (production): script is in cwd
197212 # development, starting the gui app from dashboard/: script is in ../ml/
@@ -201,20 +216,20 @@ async def training_kernel(self):
201216 script_path = script_dir / "training_pm.sbatch"
202217 if os .path .exists (script_path ):
203218 with open (script_path , "r" ) as file :
204- script_job = file .read ()
219+ training_script = file .read ()
205220 break
206- if script_job is None :
221+ if training_script is None :
207222 raise RuntimeError ("Could not find training_pm.sbatch" )
208223
209224 # replace the --experiment command line argument in the batch script
210225 # with the current experiment in the state
211- script_job = re .sub (
226+ training_script = re .sub (
212227 pattern = r"--experiment (.*)" ,
213228 repl = rf"--experiment { state .experiment } --model { model_type_tag_dict [state .model_type ]} " ,
214- string = script_job ,
229+ string = training_script ,
215230 )
216231 # submit the training job through the Superfacility API
217- sfapi_job = await perlmutter .submit_job (script_job )
232+ sfapi_job = await perlmutter .submit_job (training_script )
218233 state .model_training_status = "Submitted"
219234 state .flush ()
220235 # print some logs
0 commit comments