# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
blueprint_name: hcls-cluster-v6

validators:
  - validator: test_apis_enabled
    skip: true  # skipping this validator, since "service-enablement" will take care of it.

# Deployment-wide variables referenced by modules via $(vars.*).
vars:
  project_id: ## Set GCP Project ID Here ##
  deployment_name: hcls-cluster-v6
  region: asia-southeast1
  zone: asia-southeast1-b
  # When true, the Cloud Storage buckets created below are deleted even if
  # they still contain objects when the deployment is destroyed.
  bucket_force_destroy: false
deployment_groups:
  # First group: enable all required GCP service APIs before any other
  # infrastructure is created.
  - group: enable_apis
    modules:
      ### Enable APIs ###
      - id: services-api
        source: community/modules/project/service-enablement
        settings:
          gcp_service_list:
            - file.googleapis.com
            - iam.googleapis.com
            - pubsub.googleapis.com
            - secretmanager.googleapis.com
            - serviceusage.googleapis.com
            - compute.googleapis.com
            - stackdriver.googleapis.com
- group: setup
modules:
### Network ###
- id: network
source: modules/network/vpc
# Private Service Access (PSA) requires the compute.networkAdmin role which is
# included in the Owner role, but not Editor.
# PSA is a best practice for Filestore instances, but can be optionally
# removed by deleting the private_service_access module and any references to
# the module by Filestore modules.
# https://cloud.google.com/vpc/docs/configure-private-services-access#permissions
- id: private_service_access
source: modules/network/private-service-access
use: [network]
### Resource Monitoring ###
- id: hpc-dash
source: modules/monitoring/dashboard
### Storage ###
- id: homefs
source: modules/file-system/filestore
use: [network, private_service_access]
settings:
filestore_share_name: homeshare
local_mount: /home
- id: appsfs
source: modules/file-system/filestore
use: [network, private_service_access]
settings:
filestore_share_name: appsshare
local_mount: /apps
- id: bucket-software
source: modules/file-system/cloud-storage-bucket
settings:
name_prefix: hcls-user-provided-software
random_suffix: true
local_mount: /user_provided_software
force_destroy: $(vars.bucket_force_destroy)
outputs: [gcs_bucket_path]
- id: bucket-input
source: modules/file-system/cloud-storage-bucket
settings:
name_prefix: hcls-inputs
random_suffix: true
local_mount: /data_input
mount_options: defaults,_netdev,implicit_dirs,allow_other,dir_mode=0777,file_mode=766
force_destroy: $(vars.bucket_force_destroy)
- id: bucket-output
source: modules/file-system/cloud-storage-bucket
settings:
name_prefix: hcls-outputs
random_suffix: true
local_mount: /data_output
mount_options: defaults,_netdev,implicit_dirs,allow_other,dir_mode=0777,file_mode=766
force_destroy: $(vars.bucket_force_destroy)
- group: software_installation
modules:
### Software ###
- id: spack-setup
source: community/modules/scripts/spack-setup
settings:
install_dir: /apps/spack
- id: spack-execute
source: community/modules/scripts/spack-execute
use: [spack-setup]
settings:
data_files:
- destination: /tmp/projections-config.yaml
content: |
modules:
default:
tcl:
hash_length: 0
all:
conflict:
- '{name}'
projections:
all: '{name}/{version}-{compiler.name}-{compiler.version}'
- destination: /tmp/slurm-external-config.yaml
content: |
packages:
slurm:
externals:
- spec: slurm@21-08-8-2
prefix: /usr/local
buildable: False
- destination: /share/spack/gromacs_env.yaml
content: |
spack:
definitions:
- compilers:
- gcc@11.3.0
- cudas:
- cuda@11.8.0
- cuda_mpis:
- openmpi@4.1.4+cuda
- mpi_cuda_packages:
- gromacs@2022.3+cuda+mpi
specs:
- $compilers
- matrix:
- [$cudas]
- [$%compilers]
- matrix:
- [$cuda_mpis]
- [$%compilers]
- [$^cudas]
- [target=skylake]
- matrix:
- [$mpi_cuda_packages]
- [$^cudas]
- [$^cuda_mpis]
- [$%compilers]
- [target=skylake]
commands: |
spack config --scope defaults add config:build_stage:/apps/spack/spack-stage
spack config --scope defaults add -f /tmp/projections-config.yaml
spack config --scope site add -f /tmp/slurm-external-config.yaml
NVCC_PREPEND_FLAGS='-arch=all'
spack install gcc@11.3.0 target=x86_64
spack load gcc@11.3.0 target=x86_64
spack compiler find --scope site
if ! spack env list | grep -q gromacs; then
spack env create gromacs /share/spack/gromacs_env.yaml
spack env activate gromacs
spack concretize
spack install
fi
- id: spack-builder-startup
source: modules/scripts/startup-script
settings:
runners:
- type: data
destination: /apps/gromacs/submit_gromacs_water_cpu.sh
content: |
#!/bin/bash
#SBATCH -N 1
#SBATCH --ntasks-per-node 30
#SBATCH -p compute
# Size can be 0000.65 0000.96 0001.5 0003 0006 0012 0024 0048 0096 0192 0384 0768 1536 3072
# Type can be 'pme' or 'rf'
if [ -f /apps/spack/share/spack/setup-env.sh ]; then
source /apps/spack/share/spack/setup-env.sh
else
echo "Spack is not yet installed. Please wait approx. 10 minutes and try again."
exit 1
fi
if ! spack env activate gromacs 2>/dev/null; then
echo "Gromacs is not yet installed. Build may take several hours. Please wait and try again."
exit 1
fi
# Check that gmx_mpi exists
which gmx_mpi
cd $SLURM_SUBMIT_DIR
cp /data_input/gromacs_inputs/water-cut1.0_GMX50_bare/1536/* .
mpirun -n 1 gmx_mpi grompp -f pme.mdp -c conf.gro -p topol.top -o input.tpr
mpirun -n 30 gmx_mpi mdrun -notunepme -dlb yes -v -resethway -noconfout -nsteps 4000 -s input.tpr
- type: data
destination: /apps/gromacs/submit_gromacs_water_gpu.sh
content: |
#!/bin/bash
#SBATCH -N 1
#SBATCH --ntasks-per-node 1
#SBATCH -p gpu
#SBATCH --gpus 1
# Size can be 0000.65 0000.96 0001.5 0003 0006 0012 0024 0048 0096 0192 0384 0768 1536 3072
# Type can be 'pme' or 'rf'
if [ -f /apps/spack/share/spack/setup-env.sh ]; then
source /apps/spack/share/spack/setup-env.sh
else
echo "Spack is not yet installed. Please wait approx. 10 minutes and try again."
exit 1
fi
if ! spack env activate gromacs 2>/dev/null; then
echo "Gromacs is not yet installed. Build may take several hours. Please wait and try again."
exit 1
fi
# Check that gmx_mpi exists
which gmx_mpi
cd $SLURM_SUBMIT_DIR
cp /data_input/gromacs_inputs/water-cut1.0_GMX50_bare/1536/* .
# Significant GPU Optimizations only support constraints=h-bonds
# so we change this here for the water benchmark.
for a in *.mdp; do
sed -i 's/constraints[[:blank:]].*=.*all-bonds.*/constraints = h-bonds/' $a
done
mpirun -n 1 gmx_mpi grompp -f pme.mdp -c conf.gro -p topol.top -o input.tpr
mpirun -n 1 -H localhost \
env GMX_ENABLE_DIRECT_GPU_COMM=1 \
gmx_mpi mdrun -v -nsteps 100000 -resetstep 90000 -noconfout \
-pme gpu -update gpu -nb gpu -gputasks 00 -s input.tpr
- $(spack-execute.spack_runner)
- type: shell
destination: data_staging.sh
content: |
#!/bin/bash
wget --no-verbose -P /data_input/protein_data_bank/ https://files.rcsb.org/download/1AKI.pdb
wget --no-verbose -P /tmp/ https://ftp.gromacs.org/pub/benchmarks/water_GMX50_bare.tar.gz && \
mkdir -p /data_input/gromacs_inputs/ && \
tar xzf /tmp/water_GMX50_bare.tar.gz -C /data_input/gromacs_inputs/ && \
rm /tmp/water_GMX50_bare.tar.gz
# Set permissions for Spack environment
chmod -R a+rwX /apps/spack/var/spack/environments/gromacs
- type: shell
destination: shutdown.sh
content: |
#!/bin/bash
if [ ! -f /etc/block_auto_shutdown ]; then
touch /etc/block_auto_shutdown
shutdown -h +1
fi
- id: spack-builder
source: modules/compute/vm-instance
use: [network, appsfs, bucket-input, spack-builder-startup]
settings:
name_prefix: spack-builder
add_deployment_name_before_prefix: true
threads_per_core: 2
machine_type: c2-standard-16
allow_automatic_updates: false
- group: cluster
modules:
### Remote Desktop ###
- id: desktop
source: community/modules/remote-desktop/chrome-remote-desktop
use:
- network
- homefs
- appsfs
- bucket-input
- bucket-output
- bucket-software
settings:
add_deployment_name_before_prefix: true
name_prefix: chrome-remote-desktop
install_nvidia_driver: true
startup_script: |
find /user_provided_software -name vmd-1.9.*.bin.LINUXAMD64*.tar.gz -exec tar xvzf '{}' -C . \;
cd vmd-1.9.*/
./configure
cd src/
sudo make install
### Slurm Cluster ###
- id: compute_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
name: ns
node_count_dynamic_max: 20
machine_type: c2-standard-60
allow_automatic_updates: false
- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- compute_nodeset
settings:
partition_name: compute
- id: gpu_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
name: gpu
advanced_machine_features:
threads_per_core: null # Use platform default value
node_count_dynamic_max: 20
machine_type: g2-standard-4
allow_automatic_updates: false
- id: gpu_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- gpu_nodeset
settings:
partition_name: gpu
- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
use: [network]
- id: slurm_controller
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
use:
- network
- compute_partition
- gpu_partition
- homefs
- appsfs
- bucket-input
- bucket-output
- slurm_login