
Commit 8dc7030

Upload config files to NERSC (#336)

Parent commit: 88152c8

2 files changed (+28, −8)

dashboard/model_manager.py

Lines changed: 22 additions & 7 deletions
@@ -11,7 +11,7 @@
 from lume_model.models.ensemble import NNEnsemble
 from lume_model.models.gp_model import GPModel
 from trame.widgets import vuetify3 as vuetify
-from utils import verify_input_variables, timer
+from utils import verify_input_variables, timer, load_config_dict
 from error_manager import add_error
 from sfapi_manager import monitor_sfapi_job
 from state_manager import state
@@ -190,8 +190,23 @@ async def training_kernel(self):
             client_id=state.sfapi_client_id, secret=state.sfapi_key
         ) as client:
             perlmutter = await client.compute(Machine.perlmutter)
+            # Upload the config.yaml to NERSC
+            config_dict = load_config_dict(state.experiment)
+            config_dict["simulation_calibration"] = state.simulation_calibration
+            target_path = "/global/cfs/cdirs/m558/superfacility/model_training"
+            [target_path] = await perlmutter.ls(target_path, directory=True)
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_file_path = Path(temp_dir) / "config.yaml"
+                with open(temp_file_path, "w") as temp_file:
+                    yaml.dump(config_dict, temp_file)
+                    temp_file.flush()
+                with open(temp_file_path, "rb") as temp_file:
+                    print("Uploading config file to NERSC")
+                    temp_file.filename = "config.yaml"
+                    await target_path.upload(temp_file)
+
             # set the path of the script used to submit the training job on NERSC
-            script_job = None
+            training_script = None
             # multiple locations supported, to make development easier
             # container (production): script is in cwd
             # development, starting the gui app from dashboard/: script is in ../ml/
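
For context, the upload follows the sfapi_client pattern of resolving a remote directory and pushing a file-like object into it. Below is a minimal standalone sketch of that pattern, not the project's code: upload_config and its arguments are illustrative (load_config_dict is project-internal, so the config dict is passed in directly), and valid Superfacility API credentials are assumed.

import asyncio
import tempfile
from pathlib import Path

import yaml
from sfapi_client import AsyncClient
from sfapi_client.compute import Machine

TARGET_DIR = "/global/cfs/cdirs/m558/superfacility/model_training"

async def upload_config(client_id: str, secret: str, config_dict: dict) -> None:
    async with AsyncClient(client_id=client_id, secret=secret) as client:
        perlmutter = await client.compute(Machine.perlmutter)
        # ls(path, directory=True) returns a one-element list holding the
        # directory entry itself; unpacking it yields the remote path object
        # whose upload() copies a local file-like object into that directory.
        [target_dir] = await perlmutter.ls(TARGET_DIR, directory=True)
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_file_path = Path(temp_dir) / "config.yaml"
            with open(temp_file_path, "w") as f:
                yaml.dump(config_dict, f)  # closing the file flushes it
            with open(temp_file_path, "rb") as f:
                # the diff sets a filename attribute, which sfapi_client
                # appears to use as the remote file name
                f.filename = "config.yaml"
                await target_dir.upload(f)

# asyncio.run(upload_config("my-client-id", "my-secret", {"simulation_calibration": 1.0}))

Unlike the diff, the sketch keeps the string path (TARGET_DIR) and the remote directory object (target_dir) under distinct names, avoiding the shadowing of target_path.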
@@ -201,20 +216,20 @@ async def training_kernel(self):
                 script_path = script_dir / "training_pm.sbatch"
                 if os.path.exists(script_path):
                     with open(script_path, "r") as file:
-                        script_job = file.read()
+                        training_script = file.read()
                     break
-            if script_job is None:
+            if training_script is None:
                 raise RuntimeError("Could not find training_pm.sbatch")

             # replace the --experiment command line argument in the batch script
             # with the current experiment in the state
-            script_job = re.sub(
+            training_script = re.sub(
                 pattern=r"--experiment (.*)",
                 repl=rf"--experiment {state.experiment} --model {model_type_tag_dict[state.model_type]}",
-                string=script_job,
+                string=training_script,
             )
             # submit the training job through the Superfacility API
-            sfapi_job = await perlmutter.submit_job(script_job)
+            sfapi_job = await perlmutter.submit_job(training_script)
             state.model_training_status = "Submitted"
             state.flush()
             # print some logs
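
The regex rewrite replaces everything after "--experiment " to the end of the line, which is also how the new --model flag gets appended. A self-contained illustration with made-up values for state.experiment and the model tag:

import re

# Hypothetical sbatch fragment and state values, for illustration only.
script = "python -u /app/ml/train_model.py --experiment old_exp --model nn"
experiment = "my_experiment"
model_tag = "gp"

rewritten = re.sub(
    pattern=r"--experiment (.*)",
    repl=rf"--experiment {experiment} --model {model_tag}",
    string=script,
)
print(rewritten)
# python -u /app/ml/train_model.py --experiment my_experiment --model gp

Because (.*) is greedy up to the end of the line, any arguments already following --experiment (including an existing --model) are overwritten by the replacement.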

ml/training_pm.sbatch

Lines changed: 6 additions & 1 deletion
@@ -49,4 +49,9 @@ echo "Container check/pull took ${SECONDS} seconds."

 # CUDA visible devices are ordered inverse to local task IDs
 # Reference: nvidia-smi topo -m
-srun podman-hpc run --gpu -v /etc/localtime:/etc/localtime -v $HOME/db.profile:/root/db.profile --rm -it ${REGISTRY_NAME}/${IMAGE_NAME}:${IMAGE_VERSION} python -u /app/ml/train_model.py --experiment ${experiment} --model ${model}
+srun podman-hpc run --gpu \
+    -v /etc/localtime:/etc/localtime \
+    -v $HOME/db.profile:/root/db.profile \
+    -v /global/cfs/cdirs/m558/superfacility/model_training/config.yaml:/app/ml/config.yaml \
+    --rm -it ${REGISTRY_NAME}/${IMAGE_NAME}:${IMAGE_VERSION} \
+    python -u /app/ml/train_model.py --experiment ${experiment} --model ${model}
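
The new -v flag bind-mounts the uploaded file into the container at /app/ml/config.yaml, where the training entry point can read it. The commit does not show how train_model.py consumes the file, so the following is only a hedged sketch of a plausible reader:

import yaml

# Hypothetical reader; the actual logic in /app/ml/train_model.py is not
# part of this commit. The path matches the container side of the mount.
with open("/app/ml/config.yaml") as f:
    config = yaml.safe_load(f)

simulation_calibration = config.get("simulation_calibration")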
