Skip to content

Commit 68de8da

Browse files
committed
Merge branch 'main' into feat/update-osc-ood
2 parents 3807119 + 5bedf73 commit 68de8da

File tree

12 files changed

+336
-16
lines changed

12 files changed

+336
-16
lines changed

.markdownlint.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"no-duplicate-heading": {
3+
"siblings_only": true
4+
}
5+
}

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,14 @@ resource "openstack_networking_port_v2" "login" {
219219
binding {
220220
vnic_type = "{{ cluster_vnic_type | default('normal') }}"
221221
}
222+
223+
lifecycle {
224+
ignore_changes = [
225+
binding, # fixes running as admin
226+
extra_dhcp_option # required for networking-mlnx neutron plugin
227+
]
228+
}
229+
222230
}
223231

224232
# Storage network
@@ -235,6 +243,14 @@ resource "openstack_networking_port_v2" "login_storage" {
235243
binding {
236244
vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}"
237245
}
246+
247+
lifecycle {
248+
ignore_changes = [
249+
binding, # fixes running as admin
250+
extra_dhcp_option # required for networking-mlnx neutron plugin
251+
]
252+
}
253+
238254
}
239255
{% endif %}
240256

@@ -258,8 +274,15 @@ resource "openstack_networking_port_v2" "control" {
258274

259275
binding {
260276
vnic_type = "{{ cluster_vnic_type | default('normal') }}"
277+
}
261278

279+
lifecycle {
280+
ignore_changes = [
281+
binding, # fixes running as admin
282+
extra_dhcp_option # required for networking-mlnx neutron plugin
283+
]
262284
}
285+
263286
}
264287

265288
# Storage network
@@ -276,6 +299,14 @@ resource "openstack_networking_port_v2" "control_storage" {
276299
binding {
277300
vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}"
278301
}
302+
303+
lifecycle {
304+
ignore_changes = [
305+
binding, # fixes running as admin
306+
extra_dhcp_option # required for networking-mlnx neutron plugin
307+
]
308+
}
309+
279310
}
280311
{% endif %}
281312

@@ -301,6 +332,14 @@ resource "openstack_networking_port_v2" "{{ nodegroup.name }}" {
301332
binding {
302333
vnic_type = "{{ cluster_vnic_type | default('normal') }}"
303334
}
335+
336+
lifecycle {
337+
ignore_changes = [
338+
binding, # fixes running as admin
339+
extra_dhcp_option # required for networking-mlnx neutron plugin
340+
]
341+
}
342+
304343
}
305344

306345
# Storage network
@@ -318,6 +357,14 @@ resource "openstack_networking_port_v2" "{{ nodegroup.name }}_storage" {
318357
binding {
319358
vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}"
320359
}
360+
361+
lifecycle {
362+
ignore_changes = [
363+
binding, # fixes running as admin
364+
extra_dhcp_option # required for networking-mlnx neutron plugin
365+
]
366+
}
367+
321368
}
322369
{% endif %}
323370

docs/opentofu-remote-state.md

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
# OpenTofu remote state
2+
3+
OpenTofu supports a number of [remote state backends](https://opentofu.org/docs/language/state/remote/)
4+
which can be used to persist state independently of where a deployment is run.
5+
This allows deployments to be made from anywhere that can access the state
6+
without corrupting or conflicting with any existing resources from previous
7+
deployments.
8+
9+
Using remote state is therefore strongly recommended for environments which
10+
should only be instantiated once, e.g. `production` and `staging`.
11+
12+
This page provides guidance for configuring remote states using backends
13+
commonly available on OpenStack deployments.
14+
15+
> [!IMPORTANT]
16+
> In the instructions below, replace `$ENV` with the relevant environment name.
17+
18+
## GitLab
19+
20+
GitLab can be used with the [http backend](https://opentofu.org/docs/language/settings/backends/http/)
21+
to store separate states for each environment within the GitLab project.
22+
Access is protected by GitLab access tokens, which in the approach below are
23+
persisted to local files. Therefore each repository checkout will need to
24+
authenticate separately, using either a separate token or a shared token from
25+
some external secret store.
26+
27+
The below is based on the [official docs](https://docs.gitlab.com/user/infrastructure/iac/terraform_state/)
28+
but includes some missing details and is modified for common appliance workflows.
29+
30+
### Initial setup
31+
32+
1. Create the backend file:
33+
34+
```shell
35+
cp environments/site/tofu/example-backends/gitlab.tf environments/$ENV/tofu
36+
```
37+
38+
2. Modify `environments/$ENV/tofu/gitlab.tf` to set the default for the
39+
project ID. This can be found by clicking the 3-dot menu at the top right of
40+
the GitLab project page.
41+
42+
```terraform
43+
# environments/$ENV/tofu/gitlab.tf:
44+
terraform {
45+
backend "http" {}
46+
}
47+
```
48+
49+
3. Commit it.
50+
51+
4. Follow the per-checkout steps below.
52+
53+
### Per-checkout configuration
54+
55+
1. Create an access token in the GitLab UI, using either:
56+
57+
a. If project access tokens are available, create one via
58+
Project > Settings > Access tokens.
59+
The token must have `Maintainer` role and `api` scope.
60+
61+
b. Otherwise create a personal access token via
62+
User profile > Preferences > Access tokens.
63+
The token must have `api` scope.
64+
65+
Copy the generated secret and set an environment variable:
66+
67+
```shell
68+
export TF_VAR_gitlab_access_token=$secret
69+
```
70+
71+
2. If using a personal access token, set the GitLab username as an environment variable:
72+
73+
```shell
74+
export TF_VAR_gitlab_username=$your_username
75+
```
76+
77+
3. With the environment activated, initialise OpenTofu.
78+
79+
If no local state exists run:
80+
81+
```shell
82+
cd environments/$ENV/tofu/
83+
tofu init
84+
```
85+
86+
otherwise append `-migrate-state` to the `init` command to attempt to copy
87+
local state to the new backend.
88+
89+
OpenTofu is now configured to use GitLab to store state for this environment.
90+
91+
Repeat for each environment needing remote state.
92+
93+
> [!CAUTION]
94+
> The GitLab credentials are [persisted](https://opentofu.org/docs/language/settings/backends/configuration/#credentials-and-sensitive-data)
95+
> into a file `environments/$ENV/tofu/.terraform/terraform.tfstate` and any
96+
> plan files. These should therefore not be committed.
97+
98+
### Token expiry
99+
100+
If the project token expires repeat the per-checkout configuration, but using
101+
`tofu init -reconfigure` instead.
102+
103+
## S3
104+
105+
For clouds with S3-compatible object storage (e.g. Ceph with [radosgw](https://docs.ceph.com/en/latest/radosgw/))
106+
the S3 backend can be used. This approach uses a bucket per environment and
107+
derives credentials from OpenStack credentials, meaning no backend-specific
108+
per-checkout configuration is required.
109+
110+
### Initial setup
111+
112+
1. Create an S3 bucket with a name `${cluster_name}-${environment_name}-tfstate`
113+
where:
114+
115+
- `${cluster_name}` is defined in `environments/$ENV/tofu/main.tf`
116+
- `${environment_name}` is the name of the environment directory
117+
118+
e.g.
119+
120+
```shell
121+
openstack container create research-staging-tfstate
122+
```
123+
124+
2. Create `ec2` credentials:
125+
126+
```shell
127+
openstack ec2 credentials create
128+
```
129+
130+
and make a note of the `access` field returned.
131+
132+
3. Create the backend file:
133+
134+
```shell
135+
cp environments/site/tofu/example-backends/s3.tf environments/$ENV/tofu
136+
```
137+
138+
4. Modify `environments/$ENV/tofu/s3.tf` to set the default for `s3_backend_endpoint`.
139+
This is the radosgw address. If not known it can be determined by creating a
140+
public bucket, and then getting the URL using
141+
Project > Containers > (your public bucket) > Link
142+
which provides a URL of the form `https://$ENDPOINT/swift/...`.
143+
144+
5. Add the following to `environments/$ENV/activate`:
145+
146+
```bash
147+
export AWS_ACCESS_KEY_ID=$EC2_CREDENTIALS_ACCESS
148+
export AWS_SECRET_ACCESS_KEY=$(openstack ec2 credentials show $AWS_ACCESS_KEY_ID -f value -c secret)
149+
```
150+
151+
replacing `$EC2_CREDENTIALS_ACCESS` with the `access` field of the created
152+
credentials.
153+
154+
This avoids these credentials being persisted in local files.
155+
156+
6. Run the two `export` lines above in your current shell so the credentials are also set for this session.
157+
158+
7. With the environment activated, initialise OpenTofu.
159+
160+
If no local state exists run:
161+
162+
```shell
163+
cd environments/$ENV/tofu/
164+
tofu init
165+
```
166+
167+
otherwise append `-migrate-state` to the `init` command to attempt to copy
168+
local state to the new backend.
169+
170+
8. If this fails, try setting `use_path_style = true` in `environments/$ENV/tofu/s3.tf`.
171+
172+
9. Once it works, commit `environments/$ENV/tofu/s3.tf` and `environments/$ENV/activate`.
173+
174+
OpenTofu is now configured to use the cloud's S3-compatible storage to store
175+
state for this environment.
176+
177+
Repeat for each environment needing remote state.
178+
179+
For more configuration options, see the OpenTofu [s3 backend docs](https://opentofu.org/docs/language/settings/backends/s3/).
180+
181+
### Per-checkout configuration
182+
183+
The ec2 credentials will automatically be loaded when activating the environment.
184+
For a new checkout simply initialise OpenTofu as normal as described in step 7 above.

docs/production.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,9 @@ The value chosen should be the highest value demonstrated during testing. Note
316316
that any time spent blocked due to this parallelism limit does not count
317317
against the (un-overridable) internal OpenTofu timeout of 30 minutes.
318318

319+
Consider configuring [OpenTofu remote state](./opentofu-remote-state.md) for any
320+
environments which should be unique, e.g. production and staging.
321+
319322
## Configure appliance
320323

321324
### Production configuration to consider

environments/common/inventory/group_vars/all/openondemand.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ openondemand_apps_jupyter_default:
161161
- node
162162
attributes: # TODO
163163
num_cores:
164-
label: Number of cores FOO
164+
label: Number of cores
165165
value: 1
166166
modules: ""
167167
extra_jupyter_args: ""

environments/common/inventory/groups

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -91,22 +91,22 @@ fail2ban
9191
# Add `openhpc` group to add slurm users via creation of users on each node.
9292

9393
[openondemand]
94-
# Host to run Open Ondemand server on - subset of login
94+
# Host to run Open OnDemand server on - subset of login
9595

9696
[openondemand_desktop]
97-
# Subset of compute to run a interactive desktops on via Open Ondemand
97+
# Subset of compute to run interactive desktops on via Open OnDemand
9898

9999
[openondemand_jupyter]
100-
# Subset of compute to run a Jupyter Notebook servers on via Open Ondemand
100+
# Subset of compute to run Jupyter Notebook servers on via Open OnDemand
101101

102102
[openondemand_rstudio]
103-
# Subset of compute to run RStudio servers on via Open Ondemand
103+
# Subset of compute to run RStudio servers on via Open OnDemand
104104

105105
[openondemand_matlab]
106-
# Subset of compute to run RStudio servers on via Open Ondemand
106+
# Subset of compute to run a MATLAB interactive desktop on via Open OnDemand
107107

108108
[openondemand_codeserver]
109-
# Subset of compute to run a Codeserver VSCode instance on via Open Ondemand
109+
# Subset of compute to run a Codeserver VSCode instance on via Open OnDemand
110110

111111
[etc_hosts]
112112
# Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md
@@ -217,4 +217,3 @@ extra_packages
217217
# separately from the appliance, e.g.
218218
# pulp_host ansible_host=<VM-ip-address>
219219
# Note the host name can't conflict with group names, i.e. it can't be called `pulp` or `pulp_server`
220-

environments/site/inventory/groups

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,27 +44,27 @@ login
4444
openhpc
4545

4646
[openondemand:children]
47-
# Host to run Open Ondemand server on - subset of login
47+
# Host to run Open OnDemand server on - subset of login
4848
login
4949

5050
[openondemand_desktop:children]
51-
# Subset of compute to run a interactive desktops on via Open Ondemand
51+
# Subset of compute to run interactive desktops on via Open OnDemand
5252
compute
5353

5454
[openondemand_jupyter:children]
55-
# Subset of compute to run a Jupyter Notebook servers on via Open Ondemand
55+
# Subset of compute to run Jupyter Notebook servers on via Open OnDemand
5656
compute
5757

5858
[openondemand_rstudio:children]
59-
# Subset of compute to run RStudio servers on via Open Ondemand
59+
# Subset of compute to run RStudio servers on via Open OnDemand
6060
compute
6161

6262
[openondemand_matlab:children]
63-
# Subset of compute to run a MATLAB interactive desktop on via Open Ondemand
63+
# Subset of compute to run a MATLAB interactive desktop on via Open OnDemand
6464
compute
6565

6666
[openondemand_codeserver:children]
67-
# Subset of compute to run a Codeserver VSCode instance on via Open Ondemand
67+
# Subset of compute to run a Codeserver VSCode instance on via Open OnDemand
6868
compute
6969

7070
[etc_hosts:children]
@@ -81,6 +81,9 @@ cluster
8181
# Hosts to recompile Slurm for - allows supporting Slurm autodetection method 'nvml'
8282
cuda
8383

84+
[vgpu]
85+
# Hosts where vGPU/MIG should be configured - see docs/mig.md
86+
8487
[eessi:children]
8588
# Hosts on which EESSI stack should be configured
8689
openhpc

environments/site/tofu/control.tf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@ resource "openstack_networking_port_v2" "control" {
3434
binding {
3535
vnic_type = lookup(var.vnic_types, each.key, "normal")
3636
}
37+
38+
lifecycle {
39+
ignore_changes = [
40+
binding, # fixes running as admin
41+
extra_dhcp_option # required for networking-mlnx neutron plugin
42+
]
43+
}
3744
}
3845

3946
resource "openstack_compute_instance_v2" "control" {

0 commit comments

Comments
 (0)