Skip to content

Commit 3f541bc

Browse files
Updates to support running zppy on Dane (#789)
* Fix E3SM Diags SLURM resources on Dane
* Update ncremap command for vertical remapping
* Increase open file descriptor limit
* Other updates needed to support MPAS-Analysis on Dane
* Use srun for ncremap on Dane
1 parent c9bd181 commit 3f541bc

File tree

3 files changed

+29
-3
lines changed

3 files changed

+29
-3
lines changed

zppy/templates/e3sm_diags.bash

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,16 @@
11
#!/bin/bash
22
{% include 'inclusions/slurm_header.bash' %}
3+
4+
{# For SLURM, match CPU allocation to E3SM Diags multiprocessing. #}
5+
{% set cpus_per_task = (num_workers if multiprocessing else 1) %}
6+
7+
{# Dane enforces step-level CPU/memory limits unless explicitly requested. #}
8+
{% if machine == 'dane' %}
9+
#SBATCH --ntasks=1
10+
#SBATCH --cpus-per-task={{ cpus_per_task }}
11+
#SBATCH --mem=0
12+
{% endif %}
13+
314
{% include 'inclusions/boilerplate.bash' %}
415
set -e
516
{{ environment_commands }}
@@ -456,10 +467,19 @@ EOF
456467
cat > e3sm_diags.cfg << EOF
457468
{% include cfg %}
458469
EOF
470+
{% if machine == 'dane' %}
471+
command="srun --ntasks=1 --cpus-per-task={{ cpus_per_task }} --cpu-bind=cores python -u e3sm.py -d e3sm_diags.cfg"
472+
{% else %}
459473
command="srun -n 1 python -u e3sm.py -d e3sm_diags.cfg"
474+
{% endif %}
475+
{% else %}
476+
{% if machine == 'dane' %}
477+
command="srun --ntasks=1 --cpus-per-task={{ cpus_per_task }} --cpu-bind=cores python -u e3sm.py"
460478
{% else %}
461479
command="srun -n 1 python -u e3sm.py"
462480
{% endif %}
481+
{% endif %}
482+
463483

464484
# Run diagnostics
465485
time ${command}

zppy/templates/e3sm_to_cmip.bash

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ EOF
3838
do
3939
if [ -f ${file} ]; then
4040
#ncks --rgr xtr_mth=mss_val --vrt_fl='{{cmip_plevdata}}' ${file} ${file}.plev
41-
ncremap -p mpi --vrt_ntp=log --vrt_xtr=mss_val --vrt_out='{{cmip_plevdata}}' ${file} ${file}.plev
41+
ncremap {% if machine == 'dane' %}--mpi_pfx='srun -n {{ nodes }}'{% else %}-p mpi{% endif %} --vrt_ntp=log --vrt_xtr=mss_val --vrt_out='{{cmip_plevdata}}' ${file} ${file}.plev
4242
if [ $? != 0 ]; then
4343
cd {{ scriptDir }}
4444
echo 'ERROR (1)' > {{ prefix }}.status

zppy/templates/mpas_analysis.bash

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,12 @@ set +e
99
export OMP_NUM_THREADS=1
1010
export HDF5_USE_FILE_LOCKING=FALSE
1111

12+
{% if machine == 'dane' %}
13+
# MPAS-Analysis workaround on dane: avoid "Too many open files" failures by
14+
# increasing the per-process open file descriptor limit (within the job's hard limit).
15+
ulimit -n 65536 2>/dev/null || true
16+
{% endif %}
17+
1218
# Basic definitions
1319
case="{{ case }}"
1420
www="{{ www }}"
@@ -132,7 +138,7 @@ mapMpiTasks = {{ mapMpiTasks }}
132138
# "None" if ESMF should perform remapping in serial without a command, or one of
133139
# "srun" or "mpirun" if it should be run in parallel (or in serial but with a
134140
# command)
135-
{% if machine in ['pm-cpu', 'pm-gpu', 'anvil', 'chrysalis'] %}
141+
{% if machine in ['pm-cpu', 'pm-gpu', 'anvil', 'chrysalis', 'dane'] %}
136142
mapParallelExec = srun
137143
{% elif machine in ['compy'] %}
138144
mapParallelExec = srun --mpi=pmi2
@@ -142,7 +148,7 @@ mapParallelExec = srun --mpi=pmi2
142148
# possibly with some flags if it should be run with that command
143149
{% if machine in ['pm-cpu', 'pm-gpu'] %}
144150
ncremapParallelExec = None
145-
{% elif machine in ['anvil', 'chrysalis'] %}
151+
{% elif machine in ['anvil', 'chrysalis', 'dane'] %}
146152
ncremapParallelExec = srun -n 1
147153
{% elif machine in ['compy'] %}
148154
ncremapParallelExec = srun --mpi=pmi2 -n 1

0 commit comments

Comments
 (0)