-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathazure_mc.py
More file actions
102 lines (94 loc) · 4.42 KB
/
azure_mc.py
File metadata and controls
102 lines (94 loc) · 4.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository
# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job
# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914
import os
from eessi.testsuite.common_config import (common_eessi_init, common_general_config, common_logging_config,
set_common_required_config)
from eessi.testsuite.constants import EXTRAS, FEATURES, SCALES
# All staging, output and logging produced with this config ends up in subdirectories of this prefix.
# Set the RFM_PREFIX environment variable to override it.
reframe_prefix = os.path.join(
    os.environ['HOME'],
    'reframe_runs',
)
# Magic Castle on Azure site configuration
site_configuration = {
    'systems': [
        {
            'name': 'Magic_Castle_Azure',
            'descr': 'Magic Castle build and test environment on Azure',
            'modules_system': 'lmod',
            'hostnames': ['login.*', '.*-node'],
            'prefix': reframe_prefix,
            'partitions': [
                {
                    'name': 'x86_64-amd-zen4-node',
                    'access': ['--partition=x86-64-amd-zen4-node', '--export=NONE'],
                    'descr': 'Zen4, 16 cores, 30 GB',
                    'prepare_cmds': [
                        # Every command must be its own list element: without the separating commas
                        # Python concatenates adjacent string literals into one (invalid) shell line.
                        #
                        # Work around the Open MPI issue described at
                        # https://www.eessi.io/docs/known_issues/eessi-2023.06/#eessi-production-repository-v202306
                        'export OMPI_MCA_btl=^uct,ofi',
                        'export OMPI_MCA_pml=ucx',
                        'export OMPI_MCA_mtl=^ofi',
                        # Use override to avoid fallback to zen3
                        'export EESSI_SOFTWARE_SUBDIR_OVERRIDE=x86_64/amd/zen4',
                        common_eessi_init(),
                        # Required when using srun as launcher with --export=NONE in partition access,
                        # in order to ensure job steps inherit environment. It doesn't hurt to define
                        # this even if srun is not used
                        'export SLURM_EXPORT_ENV=ALL',
                    ],
                    'extras': {
                        # For some reason, we cannot ask for the full amount configured as RealMemory in
                        # /etc/slurm/nodes.conf, so we ask slightly less
                        # NOTE(review): this value looks inconsistent with the '30 GB' stated in 'descr'
                        # above - confirm which of the two is correct for this node type
                        EXTRAS.MEM_PER_NODE: 767480,
                    },
                },
                {
                    'name': 'aarch64-neoverse-N1-16c-62gb',
                    'access': ['--partition=aarch64-neoverse-n1-node', '--export=NONE'],
                    'descr': 'Neoverse N1, 16 cores, 62 GiB',
                    'prepare_cmds': [
                        common_eessi_init(),
                        # Required when using srun as launcher with --export=NONE in partition access,
                        # in order to ensure job steps inherit environment. It doesn't hurt to define
                        # this even if srun is not used
                        'export SLURM_EXPORT_ENV=ALL',
                    ],
                    'extras': {
                        # For some reason, we cannot ask for the full amount configured as RealMemory in
                        # /etc/slurm/nodes.conf, so we ask slightly less
                        EXTRAS.MEM_PER_NODE: 63480,
                    },
                },
            ],
        },
    ],
    'logging': common_logging_config(reframe_prefix),
    'general': [
        {
            # Enable automatic detection of CPU architecture for each partition
            # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
            'remote_detect': True,
            **common_general_config(reframe_prefix),
        },
    ],
}
# Settings shared by all partitions; they are merged into each partition further down.
partition_defaults = dict(
    scheduler='slurm',
    launcher='mpirun',
    # Every partition offers the CPU feature plus one feature per known scale
    features=[FEATURES.CPU, *SCALES.keys()],
    max_jobs=1,
)
# Merge the shared defaults into every partition of every system.
# NOTE: dict.update() overwrites any key a partition already defines with the same name,
# and all partitions end up referencing the same 'features' list object.
for system in site_configuration['systems']:
    partitions = system['partitions']
    for partition in partitions:
        partition.update(partition_defaults)

# Set common Slurm config options (done last, once the partition defaults are in place)
set_common_required_config(site_configuration)