Skip to content

Commit c195c63

Browse files
authored
Merge pull request #3638 from CliMA/zs/ci_config
clean up gpu_configs
2 parents 61c6546 + 8debca3 commit c195c63

File tree

6 files changed

+71
-131
lines changed

6 files changed

+71
-131
lines changed

.buildkite/gpu_pipeline/pipeline.yml

Lines changed: 33 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -38,70 +38,70 @@ steps:
3838
steps:
3939

4040
- label: "dry baroclinic wave"
41-
key: "target_gpu_implicit_baroclinic_wave"
41+
key: "baroclinic_wave_helem30"
4242
command:
43-
- mkdir -p target_gpu_implicit_baroclinic_wave
43+
- mkdir -p baroclinic_wave_helem30
4444
- >
45-
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=target_gpu_implicit_baroclinic_wave/output_active/report
45+
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=baroclinic_wave_helem30/output_active/report
4646
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
47-
--config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml
48-
--job_id target_gpu_implicit_baroclinic_wave
49-
artifact_paths: "target_gpu_implicit_baroclinic_wave/output_active/*"
47+
--config_file ${GPU_CONFIG_PATH}baroclinic_wave_helem30.yml
48+
--job_id baroclinic_wave_helem30
49+
artifact_paths: "baroclinic_wave_helem30/output_active/*"
5050
env:
5151
CLIMACOMMS_DEVICE: "CUDA"
5252
agents:
5353
slurm_gpus: 1
5454
slurm_cpus_per_task: 4
5555
slurm_exclusive:
5656

57-
- label: "moist Held-Suarez"
58-
key: "gpu_hs_rhoe_equil_55km_nz63_0M"
57+
- label: "dry baroclinic wave - 4 gpus"
58+
key: "baroclinic_wave_helem30_4process"
5959
command:
60-
- mkdir -p gpu_hs_rhoe_equil_55km_nz63_0M
60+
- mkdir -p baroclinic_wave_helem30_4process
6161
- >
62-
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equil_55km_nz63_0M/output_active/report
62+
srun --cpu-bind=threads --cpus-per-task=4
63+
nsys profile --delay 100 --trace=osrt,nvtx,cuda,mpi,ucx --output=baroclinic_wave_helem30_4process/output_active/report-%q{PMI_RANK}
6364
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
64-
--config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equil_0M.yml
65-
--job_id gpu_hs_rhoe_equil_55km_nz63_0M
66-
artifact_paths: "gpu_hs_rhoe_equil_55km_nz63_0M/output_active/*"
65+
--config_file ${GPU_CONFIG_PATH}baroclinic_wave_helem30.yml
66+
--job_id baroclinic_wave_helem30_4process
67+
artifact_paths: "baroclinic_wave_helem30_4process/output_active/*"
6768
env:
6869
CLIMACOMMS_DEVICE: "CUDA"
70+
CLIMACOMMS_CONTEXT: "MPI"
6971
agents:
70-
slurm_gpus: 1
72+
slurm_gpus_per_task: 1
7173
slurm_cpus_per_task: 4
74+
slurm_ntasks: 4
7275
slurm_exclusive:
73-
74-
- label: "moist Held-Suarez - 4 gpus"
75-
key: "gpu_hs_rhoe_equil_55km_nz63_0M_4process"
76+
77+
- label: "moist Held-Suarez"
78+
key: "held_suarez_equil_helem30"
7679
command:
77-
- mkdir -p gpu_hs_rhoe_equil_55km_nz63_0M_4process
80+
- mkdir -p held_suarez_equil_helem30
7881
- >
79-
srun --cpu-bind=threads --cpus-per-task=4
80-
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equil_55km_nz63_0M_4process/output_active/report-%q{PMI_RANK}
82+
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=held_suarez_equil_helem30/output_active/report
8183
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
82-
--config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equil_0M.yml
83-
--job_id gpu_hs_rhoe_equil_55km_nz63_0M_4process
84-
artifact_paths: "gpu_hs_rhoe_equil_55km_nz63_0M_4process/output_active/*"
84+
--config_file ${GPU_CONFIG_PATH}held_suarez_equil_helem30.yml
85+
--job_id held_suarez_equil_helem30
86+
artifact_paths: "held_suarez_equil_helem30/output_active/*"
8587
env:
8688
CLIMACOMMS_DEVICE: "CUDA"
87-
CLIMACOMMS_CONTEXT: "MPI"
8889
agents:
89-
slurm_gpus_per_task: 1
90+
slurm_gpus: 1
9091
slurm_cpus_per_task: 4
91-
slurm_ntasks: 4
9292
slurm_exclusive:
9393

94-
- label: "dry baroclinic wave - 4 gpus"
95-
key: "target_gpu_implicit_baroclinic_wave_4process"
94+
- label: "moist Held-Suarez - 4 gpus"
95+
key: "held_suarez_equil_helem30_4process"
9696
command:
97-
- mkdir -p target_gpu_implicit_baroclinic_wave_4process
97+
- mkdir -p held_suarez_equil_helem30_4process
9898
- >
9999
srun --cpu-bind=threads --cpus-per-task=4
100-
nsys profile --delay 100 --trace=osrt,nvtx,cuda,mpi,ucx --output=target_gpu_implicit_baroclinic_wave_4process/output_active/report-%q{PMI_RANK}
100+
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=held_suarez_equil_helem30_4process/output_active/report-%q{PMI_RANK}
101101
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
102-
--config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml
103-
--job_id target_gpu_implicit_baroclinic_wave_4process
104-
artifact_paths: "target_gpu_implicit_baroclinic_wave_4process/output_active/*"
102+
--config_file ${GPU_CONFIG_PATH}held_suarez_equil_helem30.yml
103+
--job_id held_suarez_equil_helem30_4process
104+
artifact_paths: "held_suarez_equil_helem30_4process/output_active/*"
105105
env:
106106
CLIMACOMMS_DEVICE: "CUDA"
107107
CLIMACOMMS_CONTEXT: "MPI"

.buildkite/pipeline.yml

Lines changed: 24 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -800,7 +800,7 @@ steps:
800800
CLIMACOMMS_DEVICE: "CUDA"
801801
agents:
802802
slurm_gpus: 1
803-
slurm_mem: 16G
803+
slurm_mem: 16GB
804804

805805
- label: "GPU: baroclinic wave"
806806
key: "baroclinic_wave_gpu"
@@ -812,8 +812,8 @@ steps:
812812
env:
813813
CLIMACOMMS_DEVICE: "CUDA"
814814
agents:
815-
slurm_mem: 16G
816815
slurm_gpus: 1
816+
slurm_mem: 16GB
817817

818818
- label: "GPU: compare BW with CPU"
819819
command: >
@@ -830,73 +830,46 @@ steps:
830830
- "baroclinic_wave_gpu"
831831

832832
- label: "GPU: GPU dry baroclinic wave"
833-
key: "target_gpu_implicit_baroclinic_wave"
833+
key: "baroclinic_wave_helem30"
834834
command:
835-
- mkdir -p target_gpu_implicit_baroclinic_wave
835+
- mkdir -p baroclinic_wave_helem30
836836
- >
837-
nsys profile --delay 100 --trace=nvtx,cuda --output=target_gpu_implicit_baroclinic_wave/output_active/report
837+
nsys profile --delay 100 --trace=nvtx,cuda --output=baroclinic_wave_helem30/output_active/report
838838
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
839-
--config_file ${GPU_CONFIG_PATH}/target_gpu_implicit_baroclinic_wave.yml
840-
--job_id target_gpu_implicit_baroclinic_wave
841-
artifact_paths: "target_gpu_implicit_baroclinic_wave/output_active/*"
839+
--config_file ${GPU_CONFIG_PATH}/baroclinic_wave_helem30.yml
840+
--job_id baroclinic_wave_helem30
841+
artifact_paths: "baroclinic_wave_helem30/output_active/*"
842842
env:
843843
CLIMACOMMS_DEVICE: "CUDA"
844844
agents:
845+
slurm_mem: 32GB
845846
slurm_gpus: 1
846-
slurm_mem: 32G
847+
847848

848849
- label: "GPU: GPU dry baroclinic wave - 4 gpus"
849-
key: "target_gpu_implicit_baroclinic_wave_4process"
850+
key: "baroclinic_wave_helem30_4process"
850851
command:
851-
- mkdir -p target_gpu_implicit_baroclinic_wave_4process
852+
- mkdir -p baroclinic_wave_helem30_4process
852853
# - >
853854
# srun --cpu-bind=threads --cpus-per-task=4
854-
# nsys profile --delay 100 --trace=nvtx,cuda,mpi --output=target_gpu_implicit_baroclinic_wave_4process/output_active/report-%q{PMI_RANK}
855+
# nsys profile --delay 100 --trace=nvtx,cuda,mpi --output=baroclinic_wave_helem30_4process/output_active/report-%q{PMI_RANK}
855856
# julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
856-
# --config_file ${GPU_CONFIG_PATH}/target_gpu_implicit_baroclinic_wave.yml
857-
# --job_id target_gpu_implicit_baroclinic_wave_4process
857+
# --config_file ${GPU_CONFIG_PATH}/baroclinic_wave_helem30.yml
858+
# --job_id baroclinic_wave_helem30_4process
858859
- >
859860
srun --cpu-bind=threads --cpus-per-task=4
860861
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
861-
--config_file ${GPU_CONFIG_PATH}/target_gpu_implicit_baroclinic_wave.yml
862-
--job_id target_gpu_implicit_baroclinic_wave_4process
863-
artifact_paths: "target_gpu_implicit_baroclinic_wave_4process/output_active/*"
862+
--config_file ${GPU_CONFIG_PATH}/baroclinic_wave_helem30.yml
863+
--job_id baroclinic_wave_helem30_4process
864+
artifact_paths: "baroclinic_wave_helem30_4process/output_active/*"
864865
env:
865866
CLIMACOMMS_CONTEXT: "MPI"
866867
CLIMACOMMS_DEVICE: "CUDA"
867868
agents:
868869
slurm_gpus_per_task: 1
869870
slurm_cpus_per_task: 4
870871
slurm_ntasks: 4
871-
slurm_mem: 32G
872-
873-
- label: "GPU: GPU moist Held-Suarez"
874-
command:
875-
- >
876-
nsys profile --delay 100 --trace=nvtx,cuda --output=gpu_hs_equil_0M/output_active/report
877-
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
878-
--config_file $CONFIG_PATH/gpu_hs_equil_0M.yml
879-
--job_id gpu_hs_equil_0M
880-
artifact_paths: "gpu_hs_equil_0M/output_active/*"
881-
env:
882-
CLIMACOMMS_DEVICE: "CUDA"
883-
agents:
884-
slurm_gpus: 1
885-
slurm_mem: 16G
886-
887-
- label: "GPU: GPU moist Held-Suarez cloud diagnostics per stage"
888-
command:
889-
- >
890-
nsys profile --delay 100 --trace=nvtx,cuda --output=cloud_diag_gpu_hs_equil_0M/output_active/report
891-
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
892-
--config_file $CONFIG_PATH/cloud_diag_gpu_hs_equil_0M.yml
893-
--job_id cloud_diag_gpu_hs_equil_0M
894-
artifact_paths: "cloud_diag_gpu_hs_equil_0M/output_active/*"
895-
env:
896-
CLIMACOMMS_DEVICE: "CUDA"
897-
agents:
898-
slurm_gpus: 1
899-
slurm_mem: 16G
872+
slurm_mem: 32GB
900873

901874
- label: "GPU: test DYAMOND interpolated initial conditions"
902875
command: >
@@ -908,7 +881,7 @@ steps:
908881
CLIMACOMMS_DEVICE: "CUDA"
909882
agents:
910883
slurm_gpus: 1
911-
slurm_mem: 16G
884+
slurm_mem: 16GB
912885

913886
- label: "GPU: EDOnly EDMFX aquaplanet"
914887
command: >
@@ -919,9 +892,9 @@ steps:
919892
env:
920893
CLIMACOMMS_DEVICE: "CUDA"
921894
agents:
922-
slurm_mem: 20GB
923895
slurm_gpus: 1
924-
896+
slurm_mem: 20GB
897+
925898
- label: "GPU: Diagnostic EDMFX aquaplanet"
926899
key: "diagnostic_edmfx_aquaplanet_gpu"
927900
command: >
@@ -933,7 +906,7 @@ steps:
933906
CLIMACOMMS_DEVICE: "CUDA"
934907
agents:
935908
slurm_gpus: 1
936-
slurm_mem: 20G
909+
slurm_mem: 20GB
937910

938911
- label: "GPU: Prognostic EDMFX aquaplanet"
939912
command: >
@@ -945,7 +918,7 @@ steps:
945918
CLIMACOMMS_DEVICE: "CUDA"
946919
agents:
947920
slurm_gpus: 1
948-
slurm_mem: 20G
921+
slurm_mem: 20GB
949922

950923
- group: "Benchmarks"
951924
steps:
@@ -959,7 +932,6 @@ steps:
959932

960933
- label: ":computer: Benchmark: GPU baroclinic wave moist"
961934
command: >
962-
963935
julia --color=yes --project=.buildkite perf/benchmark_step.jl
964936
--config_file $PERF_CONFIG_PATH/bm_baroclinic_wave_moist.yml
965937
--job_id bm_baroclinic_wave_moist_gpu
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1-
dt_save_state_to_disk: "Inf"
2-
dt_save_to_sol: "Inf"
3-
output_default_diagnostics: false
41
h_elem: 30
5-
initial_condition: "DryBaroclinicWave"
6-
t_end: "1days"
72
z_elem: 63
83
dz_bottom: 30.0
4+
t_end: "1days"
95
dt: "90secs"
6+
dt_save_state_to_disk: "Inf"
7+
dt_save_to_sol: "Inf"
8+
output_default_diagnostics: false
9+
initial_condition: "DryBaroclinicWave"
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,16 @@
1-
dt_save_state_to_disk: "Inf"
2-
dt_save_to_sol: "Inf"
3-
output_default_diagnostics: false
4-
dt: "90secs"
5-
t_end: "1days"
61
h_elem: 30
72
z_elem: 63
8-
dz_bottom: 30.0
93
z_max: 60000.0
10-
vert_diff: true
11-
moist: "equil"
12-
precip_model: "0M"
4+
dz_bottom: 30.0
135
rayleigh_sponge: true
146
viscous_sponge: true
7+
dt: "90secs"
8+
t_end: "1days"
9+
dt_save_state_to_disk: "Inf"
10+
dt_save_to_sol: "Inf"
11+
output_default_diagnostics: false
12+
moist: "equil"
13+
vert_diff: true
14+
precip_model: "0M"
1515
forcing: "held_suarez"
1616
toml: [toml/longrun_held_suarez.toml]

config/model_configs/cloud_diag_gpu_hs_equil_0M.yml

Lines changed: 0 additions & 17 deletions
This file was deleted.

config/model_configs/gpu_hs_equil_0M.yml

Lines changed: 0 additions & 15 deletions
This file was deleted.

0 commit comments

Comments
 (0)