Skip to content

Commit c9bec84

Browse files
committed
merge main
2 parents cd02270 + 781c2d4 commit c9bec84

File tree

8 files changed

+203
-10
lines changed

8 files changed

+203
-10
lines changed

ansible/roles/dnf_repos/defaults/main.yml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ dnf_repos_filenames:
1717
dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}"
1818

1919
# epel installed separately
20-
dnf_repos_repolist:
20+
dnf_repos_default_repolist:
2121
- file: "{{ dnf_repos_version_filenames.baseos }}"
2222
name: baseos
2323
base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}"
@@ -30,6 +30,19 @@ dnf_repos_repolist:
3030
- file: "{{ dnf_repos_version_filenames.extras }}"
3131
name: extras
3232
base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}"
33+
- file: ceph
34+
name: Ceph
35+
base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}"
36+
37+
dnf_repos_openhpc_repolist:
38+
- name: OpenHPC
39+
file: OpenHPC
40+
base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_base[ansible_distribution_major_version] | appliances_repo_to_subpath }}"
41+
- name: OpenHPC-updates
42+
file: OpenHPC
43+
base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_updates[ansible_distribution_major_version] | appliances_repo_to_subpath }}"
44+
45+
dnf_repos_repolist: "{{ dnf_repos_default_repolist + (dnf_repos_openhpc_repolist if (openhpc_install_type | default('ohpc')) == 'ohpc' else []) }}"
3346

3447
dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}"
3548
dnf_repos_epel_description: "epel"

ansible/roles/pulp_site/defaults/main.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ pulp_site_rpm_info:
2222
subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}"
2323
- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}"
2424
subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}"
25+
- name: "ohpc-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major].timestamp }}"
26+
subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}"
27+
- name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}"
28+
subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}"
29+
- name: "ceph-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major].timestamp }}"
30+
subpath: "{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}"
2531

2632
pulp_site_rpm_repo_defaults:
2733
remote_username: "{{ pulp_site_upstream_username }}"

docs/production.md

Lines changed: 146 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,151 @@
11
# Production Deployments
22

3-
This page contains some brief notes about differences between the default/demo configuration, as described in the main [README.md](../README.md) and production-ready deployments.
3+
This page contains some brief notes about differences between the default/demo
4+
configuration (as described in the main [README.md](../README.md)) and
5+
production-ready deployments.
6+
7+
- Get it agreed up front what the cluster names will be. Changing this later
8+
requires instance deletion/recreation.
9+
10+
- At least three environments should be created:
11+
- `site`: site-specific base environment
12+
- `production`: production environment
13+
- `staging`: staging environment
14+
15+
A `dev` environment should also be created if considered required, or this
16+
can be left until later.
17+
18+
These can all be produced using the cookiecutter instructions, but the
19+
`production` and `staging` environments will need their
20+
`environments/$ENV/ansible.cfg` file modifying so that they point to the
21+
`site` environment:
22+
23+
```ini
24+
inventory = ../common/inventory,../site/inventory,inventory
25+
```
26+
27+
- To avoid divergence of configuration all possible overrides for group/role
28+
vars should be placed in `environments/site/inventory/group_vars/all/*.yml`
29+
unless the value really is environment-specific (e.g. DNS names for
30+
`openondemand_servername`).
31+
32+
- Where possible hooks should also be placed in `environments/site/hooks/`
33+
and referenced from the `staging` and `production` environments, e.g.:
34+
35+
```yaml
36+
# environments/production/hooks/pre.yml:
37+
- name: Import parent hook
38+
import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml"
39+
```
40+
41+
- OpenTofu configurations should be defined in the `site` environment and used
42+
as a module from the other environments. This can be done with the
43+
cookie-cutter generated configurations:
44+
- Delete the *contents* of the cookie-cutter generated `terraform/` directories
45+
from the `production` and `staging` environments.
46+
- Create a `main.tf` in those directories which uses `site/terraform/` as a
47+
[module](https://opentofu.org/docs/language/modules/), e.g. :
48+
49+
```
50+
...
51+
module "cluster" {
52+
source = "../../site/terraform/"
53+
54+
cluster_name = "foo"
55+
...
56+
}
57+
```
58+
59+
Note that:
60+
- Environment-specific variables (`cluster_name`) should be hardcoded
61+
into the module block.
62+
- Environment-independent variables (e.g. maybe `cluster_net` if the
63+
same is used for staging and production) should be set as *defaults*
64+
in `environments/site/terraform/variables.tf`, and then don't need to
65+
be passed in to the module.
66+
67+
- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates
68+
a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`.
69+
To ensure staging environments are a good model for production this should
70+
generally be moved into the `site` environment. It should be encrypted
71+
using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html)
72+
and then committed to the repository.
73+
74+
- Ensure created instances have accurate/synchronised time. For VM instances
75+
this is usually provided by the hypervisor, but if not (or for bare metal
76+
instances) it may be necessary to configure or proxy `chronyd` via an
77+
environment hook.
78+
79+
- The cookiecutter provided OpenTofu configurations define resources for home and
80+
state volumes. The former may not be required if the cluster's `/home` is
81+
provided from an external filesystem (or Manila). In any case, in at least
82+
the production environment, and probably also in the staging environment,
83+
the volumes should be manually created and the resources changed to [data
84+
resources](https://opentofu.org/docs/language/data-sources/). This ensures that even if the cluster is deleted via tofu, the
85+
volumes will persist.
86+
87+
For a development environment, having volumes under tofu control via volume
88+
resources is usually appropriate as there may be many instantiations
89+
of this environment.
90+
91+
- Enable `etc_hosts` templating:
92+
93+
```ini
94+
# environments/site/inventory/groups:
95+
[etc_hosts:children]
96+
cluster
97+
```
498

5-
- Create a site environment. Usually at least production, staging and possibly development environments are required. To avoid divergence of configuration these should all have an `inventory` path referencing a shared, site-specific base environment. Where possible hooks should also be placed in this site-specific environment.
6-
- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. To ensure staging environments are a good model for production this should generally be moved into the site-specific environment. It can be be encrypted using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) and then committed to the repository.
7-
- Ensure created instances have accurate/synchronised time. For VM instances this is usually provided by the hypervisor, but if not (or for bare metal instances) it may be necessary to configure or proxy `chronyd` via an environment hook.
8-
- Remove production volumes from OpenTofu control. In the default OpenTofu configuration, deleting the resources also deletes the volumes used for persistent state and home directories. This is usually undesirable for production, so these resources should be removed from the OpenTofu configurations and manually deployed once. However note that for development environments leaving them under OpenTofu control is usually best.
999
- Configure Open OnDemand - see [specific documentation](openondemand.README.md).
100+
10101
- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml`
102+
103+
- Modify `environments/site/terraform/nodes.tf` to provide fixed IPs for at least
104+
the control node, and (if not using FIPs) the login node(s):
105+
106+
```
107+
resource "openstack_networking_port_v2" "control" {
108+
...
109+
fixed_ip {
110+
subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id
111+
ip_address = var.control_ip_address
112+
}
113+
}
114+
```
115+
116+
Note the variable `control_ip_address` is new.
117+
118+
Using fixed IPs will require either using admin credentials or policy changes.
119+
120+
- If floating IPs are required for login nodes, modify the OpenTofu configurations
121+
appropriately.
122+
123+
- Enable persisting login node hostkeys so users do not get annoying ssh warning
124+
messages on reimage:
125+
126+
```ini
127+
# environments/site/inventory/groups:
128+
[persist_hostkeys:children]
129+
login
130+
```
131+
And configure NFS to include exporting the state directory to these hosts:
132+
133+
```yaml
134+
# environments/common/inventory/group_vars/all/nfs.yml:
135+
nfs_configurations:
136+
# ... potentially, /home definition from common environment
137+
- comment: Export state directory to login nodes
138+
nfs_enable:
139+
server: "{{ inventory_hostname in groups['control'] }}"
140+
clients: "{{ inventory_hostname in groups['login'] }}"
141+
nfs_server: "{{ nfs_server_default }}"
142+
nfs_export: "/var/lib/state"
143+
nfs_client_mnt_point: "/var/lib/state"
144+
```
145+
See [issue 506](https://github.com/stackhpc/ansible-slurm-appliance/issues/506).
146+
147+
- Consider whether mapping of baremetal nodes to ironic nodes is required. See
148+
[PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485).
149+
150+
- Note [PR 473](https://github.com/stackhpc/ansible-slurm-appliance/pull/473)
151+
may help identify any site-specific configuration.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-250106-1030-1f3298d9",
4-
"RL9": "openhpc-RL9-250106-1112-1f3298d9"
3+
"RL8": "openhpc-RL8-250107-1534-b03caaf3",
4+
"RL9": "openhpc-RL9-250107-1535-b03caaf3"
55
}
66
}

environments/common/inventory/group_vars/all/defaults.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,24 @@ appliances_pulp_repos:
151151
'8':
152152
timestamp: 20241216T235733
153153
path: epel/8/Everything/x86_64
154+
openhpc_base:
155+
'8':
156+
path: OpenHPC/2/EL_8
157+
timestamp: 20241218T154614
158+
'9':
159+
path: OpenHPC/3/EL_9
160+
timestamp: 20241218T154614
161+
openhpc_updates:
162+
'8':
163+
path: OpenHPC/2/updates/EL_8
164+
timestamp: 20241218T154614
165+
'9':
166+
path: OpenHPC/3/updates/EL_9
167+
timestamp: 20241218T154614
168+
ceph:
169+
'8':
170+
timestamp: 20231104T015751
171+
path: centos/8-stream/storage/x86_64/ceph-quincy
172+
'9':
173+
timestamp: 20240923T233036
174+
path: centos/9-stream/storage/x86_64/ceph-reef

environments/common/inventory/group_vars/all/openhpc.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@ openhpc_config_extra: {}
3939
openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_extra, list_merge='append') }}"
4040
openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"
4141

42+
openhpc_install_type: ohpc # 'ohpc' or 'generic', see https://github.com/stackhpc/ansible-slurm-appliance/pull/326
43+
44+
# Empty repo lists from stackhpc.openhpc role defaults, as these repofiles are
45+
# now generated by dnf_repos to allow injecting Ark creds:
46+
ohpc_openhpc_repos:
47+
"9": []
48+
"8": []
49+
50+
# overriding to ensure doesn't overwrite Ark epel repo
4251
ohpc_default_extra_repos:
43-
"9": [] #overriding to ensure doesn't overwrite ark epel repo
52+
"9": []
4453
"8": []
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are
2+
# now generated by dnf_repos to allow injecting Ark creds:
3+
os_manila_mount_ceph_rpm_repos: []

requirements.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ roles:
2121
version: v3.1.5
2222
- src: https://github.com/stackhpc/ansible-role-os-manila-mount.git
2323
name: stackhpc.os-manila-mount
24-
version: v24.11.0 # Support ceph quincy for RL9
24+
version: v25.1.1
2525

2626
collections:
2727
- name: containers.podman

0 commit comments

Comments
 (0)