Skip to content

Commit 01d0346

Browse files
author
Thomas
committed
Added script for scaling tests
1 parent a5453d2 commit 01d0346

File tree

17 files changed

+531
-0
lines changed

17 files changed

+531
-0
lines changed
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
import matplotlib.pyplot as plt
2+
import numpy as np
3+
import pickle
4+
from pySDC.helpers.stats_helper import get_sorted
5+
from pySDC.projects.GPU.configs.base_config import get_config
6+
from pySDC.projects.GPU.etc.generate_jobscript import write_jobscript, PROJECT_PATH
7+
8+
9+
class ScalingConfig(object):
    """
    Base class for configuring space-time scaling experiments.

    Subclasses fill in the class attributes with cluster- and problem-specific
    values. Instances can then submit jobscripts for strong or weak scaling
    runs and plot the resulting timings.
    """

    cluster = None  # name of the cluster to submit jobs to
    config = ''  # name of the pySDC config to run
    base_resolution = -1  # space resolution used in all strong scaling runs
    base_resolution_weak = -1  # smallest space resolution in weak scaling runs
    useGPU = False  # whether to run on GPUs
    partition = None  # SLURM partition to submit to
    tasks_per_node = None  # number of MPI tasks that fit on one node
    ndim = 2  # number of spatial dimensions of the problem
    tasks_time = 1  # number of tasks in the time direction when parallel in time
    max_steps_space = None  # number of strong scaling steps
    max_steps_space_weak = None  # number of weak scaling steps

    def __init__(self, space_time_parallel):
        """
        Args:
            space_time_parallel: Whether to parallelize in time as well. The
                string 'False' counts as False because the value is typically
                passed straight from the command line.
        """
        if space_time_parallel in ['False', False]:
            self._tasks_time = 1
        else:
            self._tasks_time = self.tasks_time

    def get_resolution_and_tasks(self, strong, i):
        """
        Get the space resolution and the processor layout for one scaling step.

        Args:
            strong (bool): True for strong scaling, False for weak scaling
            i (int): Index of the scaling step

        Returns:
            tuple: resolution, tasks as a list [steps, time, space]
        """
        if strong:
            # fixed problem size, task count in space doubles every step
            return self.base_resolution, [1, self._tasks_time, 2**i]
        else:
            # resolution doubles per dimension, so the space task count
            # grows by 2 * ndim per step to keep the work per task constant
            return self.base_resolution_weak * (2**i), [1, self._tasks_time, (2 * self.ndim) ** i]

    def run_scaling_test(self, strong=True):
        """
        Write and submit jobscripts for every step of a scaling test.

        Args:
            strong (bool): True for strong scaling, False for weak scaling
        """
        max_steps = self.max_steps_space if strong else self.max_steps_space_weak
        for i in range(max_steps):
            res, procs = self.get_resolution_and_tasks(strong, i)

            sbatch_options = [f'-n {np.prod(procs)}', f'-p {self.partition}']
            if self.useGPU:
                srun_options = ['--cpus-per-task=4', '--gpus-per-task=1']
                sbatch_options += ['--cpus-per-task=4', '--gpus-per-task=1']
            else:
                srun_options = []

            # encode the processor layout as e.g. "1/3/4" for the CLI
            procs_str = '/'.join(str(me) for me in procs)
            command = f'run_experiment.py --mode=run --res={res} --config={self.config} --procs={procs_str}'

            if self.useGPU:
                command += ' --useGPU=True'

            write_jobscript(sbatch_options, srun_options, command, self.cluster)

    def plot_scaling_test(self, strong, ax, plot_ideal=False, **plotting_params):
        """
        Plot the mean time per step against the number of nodes.

        Requires the pickled run statistics of all scaling steps to be
        available in the ``data`` directory.

        Args:
            strong (bool): True for strong scaling, False for weak scaling
            ax: Matplotlib axes to plot on
            plot_ideal (bool): Add an ideal strong scaling line for reference
            plotting_params: Forwarded to ``ax.loglog``
        """
        timings = {}

        max_steps = self.max_steps_space if strong else self.max_steps_space_weak
        for i in range(max_steps):
            res, procs = self.get_resolution_and_tasks(strong, i)

            args = {'useGPU': self.useGPU, 'config': self.config, 'res': res, 'procs': procs, 'mode': None}

            config = get_config(args)

            path = f'data/{config.get_path(ranks=[me - 1 for me in procs])}-stats-whole-run.pickle'
            with open(path, 'rb') as file:
                stats = pickle.load(file)

            timing_step = get_sorted(stats, type='timing_step')

            # key: number of nodes, value: mean time per step
            timings[np.prod(procs) / self.tasks_per_node] = np.mean([me[1] for me in timing_step])

        nodes = list(timings.keys())
        times = list(timings.values())
        ax.loglog(nodes, times, **plotting_params)
        if plot_ideal:
            # ideal scaling: time halves whenever the node count doubles
            ax.loglog(
                nodes,
                times[0] * nodes[0] / np.array(nodes),
                ls='--',
                color='grey',
                label='ideal',
            )
        ax.set_xlabel(r'$N_\mathrm{nodes}$')
        ax.set_ylabel(r'$t_\mathrm{step}$')
85+
86+
class CPUConfig(ScalingConfig):
    """Cluster settings for CPU-only runs on the 'jusuf' cluster."""

    cluster = 'jusuf'
    partition = 'batch'
    tasks_per_node = 128  # presumably one MPI task per core — TODO confirm node size
90+
91+
92+
class GPUConfig(ScalingConfig):
    """Cluster settings for GPU runs on the 'booster' cluster."""

    cluster = 'booster'
    partition = 'booster'
    useGPU = True
    tasks_per_node = 4  # presumably one MPI task per GPU — TODO confirm
97+
98+
99+
class GrayScottSpaceScalingCPU(CPUConfig, ScalingConfig):
    """Space(-time) scaling of the Gray-Scott problem on CPUs."""

    config = 'GS_scaling'
    base_resolution = 2048
    base_resolution_weak = 256
    max_steps_space = 10
    max_steps_space_weak = 6
    tasks_time = 3
106+
107+
108+
class GrayScottSpaceScalingGPU(GPUConfig, ScalingConfig):
    """Space(-time) scaling of the Gray-Scott problem on GPUs."""

    config = 'GS_scaling'
    base_resolution = 2048
    base_resolution_weak = 256 * 32  # GPUs handle much larger per-task problems
    max_steps_space = 4
    max_steps_space_weak = 4
    tasks_time = 3
115+
116+
117+
def plot_scalings(strong, problem, kwargs):
    """
    Plot scaling results of all configurations of a problem in one figure.

    The figure is written to ``<PROJECT_PATH>/plots``.

    Args:
        strong (bool): True for strong scaling, False for weak scaling
        problem (str): Name of the problem; currently only 'GS' is supported
        kwargs (dict): Currently unused, accepted for symmetry with the run
            mode of the command line interface

    Raises:
        NotImplementedError: If the problem is unknown
    """
    if problem == 'GS':
        fig, ax = plt.subplots()

        plotting_params = [
            {'plot_ideal': strong, 'marker': 'x', 'label': 'CPU'},
            {'marker': '>', 'label': 'CPU space time parallel'},
            {'marker': '^', 'label': 'GPU'},
        ]
        configs = [
            GrayScottSpaceScalingCPU(space_time_parallel=False),
            GrayScottSpaceScalingCPU(space_time_parallel=True),
            GrayScottSpaceScalingGPU(space_time_parallel=False),
        ]

        for config, params in zip(configs, plotting_params):
            config.plot_scaling_test(strong=strong, ax=ax, **params)
        ax.legend(frameon=False)
        fig.savefig(f'{PROJECT_PATH}/plots/{"strong" if strong else "weak"}_scaling_{problem}.pdf', bbox_inches='tight')
    else:
        # raise with a message, consistent with the other raises in this file
        raise NotImplementedError(f'Don\'t know problem {problem!r}')
138+
139+
140+
if __name__ == '__main__':
    import argparse

    # Command line interface for submitting or plotting scaling tests.
    parser = argparse.ArgumentParser()
    parser.add_argument('--scaling', type=str, choices=['strong', 'weak'], default='strong')
    parser.add_argument('--mode', type=str, choices=['run', 'plot'], default='run')
    parser.add_argument('--problem', type=str, default='GS')
    parser.add_argument('--XPU', type=str, choices=['CPU', 'GPU'], default='CPU')
    parser.add_argument('--space_time', type=str, choices=['True', 'False'], default='False')

    args = parser.parse_args()

    strong = args.scaling == 'strong'

    if args.problem == 'GS':
        configClass = GrayScottSpaceScalingCPU if args.XPU == 'CPU' else GrayScottSpaceScalingGPU
    else:
        raise NotImplementedError(f'Don\'t know problem {args.problem!r}')

    kwargs = {'space_time_parallel': args.space_time}
    config = configClass(**kwargs)

    if args.mode == 'run':
        config.run_scaling_test(strong=strong)
    elif args.mode == 'plot':
        plot_scalings(strong=strong, problem=args.problem, kwargs=kwargs)
    else:
        raise NotImplementedError(f'Don\'t know mode {args.mode!r}')

pySDC/projects/GPU/configs/GS_configs.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ def get_config(args):
1212
return GrayScott_GoL(args)
1313
elif name == 'GS_USkate':
1414
return GrayScott_USkate(args)
15+
elif name == 'GS_scaling':
16+
return GrayScottScaling(args)
1517
else:
1618
return NotImplementedError(f'Don\'t know config {name}')
1719

@@ -183,3 +185,18 @@ def get_description(self, *args, **kwargs):
183185
desc['convergence_controllers'][Adaptivity] = {'e_tol': 1e-3}
184186
self.Tend = 100000
185187
return desc
188+
189+
190+
class GrayScottScaling(GrayScott):
    """Gray-Scott configuration tailored to timing / scaling benchmarks."""

    def get_description(self, *args, **kwargs):
        """Shrink the problem and skip residual computations for clean timings."""
        desc = super().get_description(*args, **kwargs)
        problem_params = desc['problem_params']
        problem_params['L'] = 2
        problem_params['num_blobs'] = 4
        desc['sweeper_params']['skip_residual_computation'] = ('IT_CHECK', 'IT_DOWN', 'IT_UP', 'IT_FINE', 'IT_COARSE')
        # run for a fixed number of 100 steps
        self.Tend = 100 * desc['level_params']['dt']
        return desc

    def get_controller_params(self, *args, **kwargs):
        """Disable all hooks to avoid I/O overhead during timing runs."""
        params = super().get_controller_params(*args, **kwargs)
        params['hook_class'] = []
        return params
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Absolute path to the GPU project inside the pySDC checkout.
# NOTE(review): hard-coded to a specific user directory — adjust when
# deploying elsewhere.
PROJECT_PATH = '/p/project1/ccstma/baumann7/pySDC/pySDC/projects/GPU'

# sbatch options included in every generated jobscript
DEFAULT_SBATCH_OPTIONS = ['-A cstma', '--threads-per-core=1', f'--output={PROJECT_PATH}/etc/slurm-out/%j.txt']

# srun options included in every generated jobscript
DEFAULT_SRUN_OPTIONS = ['--cpu-bind=sockets']
4+
5+
6+
def generate_directories():
    """Create the directories for jobscripts and SLURM output, if absent."""
    import os

    for subdir in ('jobscripts', 'slurm-out'):
        os.makedirs(f'{PROJECT_PATH}/etc/{subdir}', exist_ok=True)
12+
13+
14+
def get_jobscript_text(sbatch_options, srun_options, command, cluster):
    """
    Assemble the text of a SLURM jobscript.

    Args:
        sbatch_options (list): Options for sbatch, appended to the defaults
        srun_options (list): Options for srun, appended to the defaults
        command (str): Python script (plus arguments) to run, relative to PROJECT_PATH
        cluster (str): Name of the cluster, selects which venv to activate

    Returns:
        str: Complete jobscript text
    """
    header = ['#!/usr/bin/bash', '']
    header += [f'#SBATCH {opt}' for opt in DEFAULT_SBATCH_OPTIONS + sbatch_options]
    text = '\n'.join(header) + '\n'

    # activate the cluster-specific virtual environment before running
    text += f'\nsource {PROJECT_PATH}/etc/venv_{cluster.lower()}/activate.sh\n'

    srun_cmd = ' '.join(['srun'] + DEFAULT_SRUN_OPTIONS + srun_options)
    text += f'\n{srun_cmd} python {PROJECT_PATH}/{command}'
    return text
27+
28+
29+
def write_jobscript(sbatch_options, srun_options, command, cluster, submit=True):
    """
    Write a jobscript to disk and optionally submit it via ``sbatch``.

    Args:
        sbatch_options (list): Options for sbatch, appended to the defaults
        srun_options (list): Options for srun, appended to the defaults
        command (str): Python script (plus arguments) to run, relative to PROJECT_PATH
        cluster (str): Name of the cluster
        submit (bool): Submit the jobscript right away
    """
    generate_directories()

    text = get_jobscript_text(sbatch_options, srun_options, command, cluster)

    # the command doubles as a sanitized file name for the jobscript
    name = command.replace(" ", "").replace("/", "_")
    path = f'{PROJECT_PATH}/etc/jobscripts/{name}-{cluster}.sh'
    with open(path, 'w') as file:
        file.write(text)

    if submit:
        import os

        os.system(f'sbatch {path}')
42+
43+
44+
if __name__ == '__main__':
    # Example usage: generate and submit a single-node jobscript running
    # run_problems.py on the 'jusuf' cluster.
    sbatch_options = ['--nodes=1']
    srun_options = []
    command = 'run_problems.py'
    write_jobscript(sbatch_options, srun_options, command, 'jusuf')
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash

# Activate the virtual environment: load the site modules, put the venv's
# site-packages on PYTHONPATH and source the venv's activate script.
# This file must be sourced, not executed.

# See https://stackoverflow.com/a/28336473
SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}"

RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"

# When executed instead of sourced, the exports would only land in a
# throwaway subprocess. The previous check printed its warning in the
# sourced (good) branch and "exited" inside a subshell, which cannot
# abort the script — warn and really abort in the executed case instead.
if [[ "$0" == "${SOURCE_PATH}" ]]; then
    echo "The activation script must be sourced, otherwise the virtual environment will not work."
    exit 1
fi

source "${ABSOLUTE_PATH}"/config.sh
source "${ABSOLUTE_PATH}"/modules.sh

# Prepend the venv's site-packages so they win over module-provided ones.
export PYTHONPATH="$(echo "${ENV_DIR}"/lib/python*/site-packages):${PYTHONPATH}"

source "${ENV_DIR}"/bin/activate
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}"

## Check if this script is sourced
# NOTE: the check previously ran "exit 1" inside a subshell "( ... )",
# which cannot terminate the script, so direct execution continued
# anyway. Use a plain if so the abort is effective.
if [[ "$0" != "${SOURCE_PATH}" ]]; then
    echo "Setting vars"
else
    echo "Vars script must be sourced."
    exit 1
fi
## Determine location of this file
RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"
####################################

### User Configuration
export ENV_NAME="GPU" # Default Name of the venv is the directory that contains this file
export ENV_DIR="${ABSOLUTE_PATH}"/venv # Default location of this VENV is "./venv"
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash

# Set up a Jupyter kernel that runs inside this virtual environment:
# writes a kernel launcher script into the venv and registers it with
# Jupyter under the environment's name.

SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}"

# Directory containing this script; config.sh provides ENV_NAME/ENV_DIR.
RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"
source "${ABSOLUTE_PATH}"/config.sh

KERNELFILE="${ENV_DIR}"/kernel.sh

echo the name is "$ENV_NAME"

echo "Setting up the kernel script in the following dir: " "${KERNELFILE}"
echo "If you use multiprocessing, edit this file to remove the srun call from the kernel and run it again."

# Kernel launcher: run ipykernel directly on login nodes, through srun on
# compute nodes. NOTE(review): the hostname patterns "login"/"jsfl" are
# site-specific — confirm they match the target clusters.
echo '#!/bin/bash

source "'"${ABSOLUTE_PATH}"'"/activate.sh

hostname=$(hostname)

if [[ $hostname == *"login"* || $hostname == *"jsfl"* ]]; then
exec python -m ipykernel "$@"
else
srun python -m ipykernel "$@"
fi
' > "${KERNELFILE}"

chmod a+x "${KERNELFILE}"

# Register the kernel spec with Jupyter for the current user.
mkdir -p ~/.local/share/jupyter/kernels/"${ENV_NAME}"
echo '{
"argv": [
"'"${KERNELFILE}"'",
"-f",
"{connection_file}"
],
"display_name": "'"${ENV_NAME}"'",
"language": "python"
}' > ~/.local/share/jupyter/kernels/"${ENV_NAME}"/kernel.json
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash

# Generate a "python" wrapper script next to this file that always runs
# python inside this virtual environment, regardless of the caller's
# currently loaded modules or active venv.

SOURCE_PATH="${BASH_SOURCE[0]:-${(%):-%x}}"

RELATIVE_PATH="$(dirname "$SOURCE_PATH")"
ABSOLUTE_PATH="$(realpath "${RELATIVE_PATH}")"
source "${ABSOLUTE_PATH}"/config.sh
PYTHONWRAPPER="${ABSOLUTE_PATH}"/python

# The wrapper resets modules and any active venv, activates this
# environment and delegates all arguments to python.
echo '#!/bin/bash
module purge 2> /dev/null
deactivate 2> /dev/null
source '"'${ABSOLUTE_PATH}'"'/activate.sh
python "$@"
' > "${PYTHONWRAPPER}"

chmod a+x "${PYTHONWRAPPER}"
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Environment modules for the GPU runs (software stage 2024):
# CUDA-aware ParaStationMPI plus the Python/CuPy stack.
module --force purge
module load Stages/2024
module load GCC
module load ParaStationMPI
module load NCCL
module load MPI-settings/CUDA
module load UCX-settings/RC-CUDA
module load Python
module load CuPy
module load FFTW
module load mpi4py
module load FFmpeg/.6.0
module load SciPy-Stack

0 commit comments

Comments
 (0)