Skip to content

Commit 009d956

Browse files
committed
Default to retrying platform create once
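Adds a new jobCreateRetries property to cluster types (default 1) so that tooling which is not fully idempotent can opt out of retrying create by setting it to 0. Before this change, only delete jobs were retried (backoffLimit 1); create jobs were never retried (backoffLimit 0).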
1 parent 7188b9e commit 009d956

File tree

4 files changed

+23
-10
lines changed

4 files changed

+23
-10
lines changed

azimuth_caas_operator/models/v1alpha1/cluster_type.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ class ClusterTypeSpec(schema.BaseModel):
2929
# The timeout (in seconds) to apply to the kubernetes job resource
3030
# which creates, updates and deletes the cluster instances
3131
jobTimeout: int = pydantic.Field(default=1200)
32+
# The number of retries (backoffLimit) for the kubernetes job that creates the cluster; remove jobs always use a single retry
33+
jobCreateRetries: int = 1
3234
# Option to add cloud specific details, like the image
3335
extraVars: schema.Dict[str, schema.Any] = pydantic.Field(default_factory=dict)
3436
# Option to define cluster-type specific details, like inventory

azimuth_caas_operator/tests/models/test_crds.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ def test_cluster_type_crd_json(self):
6161
"jobTimeout": {
6262
"type": "integer"
6363
},
64+
"jobCreateRetries": {
65+
"type": "integer"
66+
},
6467
"extraVars": {
6568
"additionalProperties": {
6669
"x-kubernetes-preserve-unknown-fields": true
@@ -301,6 +304,9 @@ def test_cluster_crd_json(self):
301304
"jobTimeout": {
302305
"type": "integer"
303306
},
307+
"jobCreateRetries": {
308+
"type": "integer"
309+
},
304310
"extraVars": {
305311
"additionalProperties": {
306312
"x-kubernetes-preserve-unknown-fields": true

azimuth_caas_operator/tests/utils/test_ansible_runner.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -174,27 +174,31 @@ def test_get_job_remove(self):
174174
},
175175
clear=True,
176176
)
177-
def test_get_job_remove_with_trust_bundle(self):
177+
def test_get_job_create_with_trust_bundle(self):
178178
cluster = cluster_crd.get_fake()
179179
cluster.spec.leaseName = None
180180
cluster_type = cluster_type_crd.get_fake()
181+
cluster_type.spec.jobCreateRetries = 3
181182

182183
job = ansible_runner.get_job(
183-
cluster, cluster_type.spec, "test1-tfstate", "trust-bundle", remove=True
184+
cluster,
185+
cluster_type.spec,
186+
"test1-tfstate",
187+
"trust-bundle",
184188
)
185189

186190
expected = """\
187191
apiVersion: batch/v1
188192
kind: Job
189193
metadata:
190-
generateName: test1-remove-
194+
generateName: test1-create-
191195
labels:
192-
azimuth-caas-action: remove
196+
azimuth-caas-action: create
193197
azimuth-caas-cluster: test1
194198
namespace: ns1
195199
spec:
196200
activeDeadlineSeconds: 1200
197-
backoffLimit: 1
201+
backoffLimit: 3
198202
template:
199203
spec:
200204
containers:
@@ -206,8 +210,7 @@ def test_get_job_remove_with_trust_bundle(self):
206210
\\nif [ -f /runner/project/requirements.yml ]; then\\n ansible-galaxy install\\
207211
\\ -r /runner/project/requirements.yml\\nelif [ -f /runner/project/roles/requirements.yml\\
208212
\\ ]; then\\n ansible-galaxy install -r /runner/project/roles/requirements.yml\\n\\
209-
fi\\nansible-runner run /runner -j\\nopenstack application credential delete\\
210-
\\ az-caas-test1 || true\\n"
213+
fi\\nansible-runner run /runner -j\\n"
211214
env:
212215
- name: RUNNER_PLAYBOOK
213216
value: sample.yaml
@@ -322,7 +325,7 @@ def test_get_job_remove_with_trust_bundle(self):
322325
- emptyDir: {}
323326
name: ansible-home
324327
- configMap:
325-
name: test1-remove
328+
name: test1-create
326329
name: env
327330
- name: cloudcreds
328331
secret:
@@ -339,7 +342,6 @@ def test_get_job_remove_with_trust_bundle(self):
339342
- configMap:
340343
name: trust-bundle
341344
name: trust-bundle
342-
ttlSecondsAfterFinished: 36000
343345
""" # noqa
344346
self.assertEqual(expected, yaml.safe_dump(job))
345347

azimuth_caas_operator/utils/ansible_runner.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,9 @@ def get_job(
259259

260260
remove_app_cred = remove and (cluster.spec.leaseName is None)
261261

262+
# remove jobs always get 1 retry; otherwise use the cluster type's jobCreateRetries (which defaults to 1)
263+
back_off_limit = 1 if remove else cluster_type_spec.jobCreateRetries
264+
262265
# TODO(johngarbutt): need get secret keyname from somewhere
263266
job_yaml = f"""apiVersion: batch/v1
264267
kind: Job
@@ -477,7 +480,7 @@ def get_job(
477480
if trust_bundle_configmap_name
478481
else ""
479482
}
480-
backoffLimit: {1 if remove else 0}
483+
backoffLimit: {back_off_limit}
481484
# Set timeout so that jobs don't get stuck in configuring state if something goes wrong
482485
activeDeadlineSeconds: {cluster_type_spec.jobTimeout}""" # noqa
483486
return yaml.safe_load(job_yaml)

0 commit comments

Comments
 (0)