Skip to content

Commit 6b3f6b7

Browse files
author
Alan Christie
committed
fix: Much more robust creation (token handling), backoff and retries
1 parent c1d1005 commit 6b3f6b7

File tree

1 file changed

+89
-65
lines changed

1 file changed

+89
-65
lines changed

operator/handlers.py

Lines changed: 89 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""A kopf handler for the Jupyter CRD."""
22

3+
import json
34
import logging
45
import random
56
import os
@@ -132,7 +133,11 @@ def create_v1alpha3(
132133
raise kopf.PermanentError("No longer supported")
133134

134135

135-
@kopf.on.create("squonk.it", "v2", "jupyternotebooks", id="jupyter")
136+
# For TEMPORARY errors (i.e. those that are not kopf.PermanentError)
137+
# we retry after 20 seconds and only retry 4 times
138+
@kopf.on.create(
139+
"squonk.it", "v2", "jupyternotebooks", id="jupyter", backoff=20, retries=4
140+
)
136141
def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[str, Any]:
137142
"""Handler for CRD create events.
138143
Here we construct the required Kubernetes objects,
@@ -143,9 +148,6 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
143148
Kubernetes constantly calling back for a given create.
144149
"""
145150

146-
characters = string.ascii_letters + string.digits
147-
token = "".join(random.sample(characters, 16))
148-
149151
logging.info("Starting create (name=%s namespace=%s)...", name, namespace)
150152
logging.info("spec=%s (name=%s)", spec, name)
151153

@@ -201,57 +203,70 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
201203
# ConfigMaps
202204
# ----------
203205

204-
logging.info("Creating ConfigMaps %s...", name)
206+
core_api = kubernetes.client.CoreV1Api()
205207

206-
bp_cm_body = {
207-
"apiVersion": "v1",
208-
"kind": "ConfigMap",
209-
"metadata": {"name": f"bp-{name}", "labels": {"app": name}},
210-
"data": {".bash_profile": _BASH_PROFILE},
211-
}
208+
# We might be here as another attempt to create the same application
209+
# (an exception may have caused a prior execution to fail).
210+
# The operator is configured to re-try on such occasions
211+
# (with a period based on our 'backoff' value set in our decorator).
212+
# Here, we need to check for the existence of the 'config' ConfigMap.
213+
# If it exists, we read it and get the token we had previously set.
214+
# If there is no ConfigMap (404) we are free to set a new token.
215+
cm_name = f"config-{name}"
216+
json_data_key = "jupyter_notebook_config.json"
217+
token = ""
218+
config_cm = None
219+
try:
220+
config_cm = core_api.read_namespaced_secret(cm_name, namespace)
221+
except kubernetes.client.exceptions.ApiException as ex:
222+
# We 'expect' 404, anything else is an error
223+
if ex.status != 404:
224+
logging.warning(
225+
"Got ApiException [%s/%s] getting CONFIG ConfigMap",
226+
ex.status,
227+
ex.reason,
228+
)
229+
raise ex
230+
if config_cm:
231+
# We retrieved an existing CONFIG - extract the token from it
232+
json_data = json.loads(config_cm.data[json_data_key])
233+
token = json_data["ServerApp"]["token"]
234+
logging.info("Retrieved prior token from CONFIG ConfigMap (%s)", token)
235+
else:
236+
# No prior config - we're free to allocate a new token
237+
characters = string.ascii_letters + string.digits
238+
token = "".join(random.sample(characters, 16))
239+
assert token
212240

213-
startup_cm_body = {
214-
"apiVersion": "v1",
215-
"kind": "ConfigMap",
216-
"metadata": {"name": f"startup-{name}", "labels": {"app": name}},
217-
"data": {"start.sh": _NOTEBOOK_STARTUP},
218-
}
241+
logging.info("Creating ConfigMaps %s...", name)
242+
243+
# We must handle (and ignore) 409 exceptions with the objects we create
244+
# (with the reason 'Conflict'). This is interpreted as 'the object already exists'.
245+
# At this point we do know whether the 'config' ConfigMap exists,
246+
# so we don't need to create that again...
219247

220248
config_vars = {"token": token, "base_url": name}
221249
config_cm_body = {
222250
"apiVersion": "v1",
223251
"kind": "ConfigMap",
224-
"metadata": {"name": f"config-{name}", "labels": {"app": name}},
225-
"data": {"jupyter_notebook_config.json": _NOTEBOOK_CONFIG % config_vars},
252+
"metadata": {"name": cm_name, "labels": {"app": name}},
253+
"data": {json_data_key: _NOTEBOOK_CONFIG % config_vars},
226254
}
255+
kopf.adopt(config_cm_body)
256+
if not config_cm:
257+
# We create it because we know it does not exist.
258+
# No exception handling - any exceptions just get passed up to kopf...
259+
core_api.create_namespaced_config_map(
260+
namespace, config_cm_body, _request_timeout=_REQUEST_TIMEOUT
261+
)
227262

228-
create_response = {
229-
"notebook": {
230-
"url": f"http://{ingress_domain}{ingress_path}?token={token}",
231-
"token": token,
232-
"interface": notebook_interface,
233-
},
234-
"image": image,
235-
"serviceAccountName": service_account,
236-
"resources": {
237-
"requests": {"memory": memory_request},
238-
"limits": {"memory": memory_limit},
239-
},
240-
"project": {"claimName": project_claim_name, "id": project_id},
263+
bp_cm_body = {
264+
"apiVersion": "v1",
265+
"kind": "ConfigMap",
266+
"metadata": {"name": f"bp-{name}", "labels": {"app": name}},
267+
"data": {".bash_profile": _BASH_PROFILE},
241268
}
242-
243269
kopf.adopt(bp_cm_body)
244-
kopf.adopt(startup_cm_body)
245-
kopf.adopt(config_cm_body)
246-
core_api = kubernetes.client.CoreV1Api()
247-
248-
# We currently see a number of 409 exceptions with the objects we create
249-
# (with the reason 'Conflict'). As each one has a unique name
250-
# we have to assume there is a serious underlying problem
251-
# in kopf of kubernetes. For now, if the first object we create
252-
# already exists let us assume they all do?
253-
#
254-
# Added as a work-around for sc-
255270
try:
256271
core_api.create_namespaced_config_map(
257272
namespace, bp_cm_body, _request_timeout=_REQUEST_TIMEOUT
@@ -261,9 +276,16 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
261276
raise ex
262277
# Warn, but ignore and return a valid 'create' response now.
263278
logging.warning(
264-
"Got ApiException [409/Conflict] creating BP ConfigMap. Ignoring [#10]"
279+
"Got ApiException [409/Conflict] creating BP ConfigMap. Ignoring"
265280
)
266281

282+
startup_cm_body = {
283+
"apiVersion": "v1",
284+
"kind": "ConfigMap",
285+
"metadata": {"name": f"startup-{name}", "labels": {"app": name}},
286+
"data": {"start.sh": _NOTEBOOK_STARTUP},
287+
}
288+
kopf.adopt(startup_cm_body)
267289
try:
268290
core_api.create_namespaced_config_map(
269291
namespace, startup_cm_body, _request_timeout=_REQUEST_TIMEOUT
@@ -273,19 +295,7 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
273295
raise ex
274296
# Warn, but ignore and return a valid 'create' response now.
275297
logging.warning(
276-
"Got ApiException [409/Conflict] creating STARTUP ConfigMap. Ignoring [#10]"
277-
)
278-
279-
try:
280-
core_api.create_namespaced_config_map(
281-
namespace, config_cm_body, _request_timeout=_REQUEST_TIMEOUT
282-
)
283-
except kubernetes.client.exceptions.ApiException as ex:
284-
if ex.status != 409 or ex.reason != "Conflict":
285-
raise ex
286-
# Warn, but ignore and return a valid 'create' response now.
287-
logging.warning(
288-
"Got ApiException [409/Conflict] creating CONFIG ConfigMap. Ignoring [#10]"
298+
"Got ApiException [409/Conflict] creating STARTUP ConfigMap. Ignoring"
289299
)
290300

291301
logging.info("Created ConfigMaps")
@@ -400,8 +410,9 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
400410
# Add the instance owner (expected to have been extracted from a label)
401411
c_env.append({"name": "DM_INSTANCE_OWNER", "value": str(instance_owner)})
402412

403-
kopf.adopt(deployment_body)
404413
apps_api = kubernetes.client.AppsV1Api()
414+
415+
kopf.adopt(deployment_body)
405416
try:
406417
apps_api.create_namespaced_deployment(
407418
namespace, deployment_body, _request_timeout=_REQUEST_TIMEOUT
@@ -411,7 +422,7 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
411422
raise ex
412423
# Warn, but ignore and return a valid 'create' response now.
413424
logging.warning(
414-
"Got ApiException [409/Conflict] creating CONFIG ConfigMap. Ignoring [#10]"
425+
"Got ApiException [409/Conflict] creating CONFIG ConfigMap. Ignoring"
415426
)
416427

417428
logging.info("Created deployment")
@@ -449,7 +460,7 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
449460
raise ex
450461
# Warn, but ignore and return a valid 'create' response now.
451462
logging.warning(
452-
"Got ApiException [409/Conflict] creating CONFIG ConfigMap. Ignoring [#10]"
463+
"Got ApiException [409/Conflict] creating CONFIG ConfigMap. Ignoring"
453464
)
454465

455466
logging.info("Created service")
@@ -497,8 +508,9 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
497508
annotations = ingress_body["metadata"]["annotations"]
498509
annotations["cert-manager.io/cluster-issuer"] = ingress_cert_issuer
499510

500-
kopf.adopt(ingress_body)
501511
ext_api = kubernetes.client.NetworkingV1Api()
512+
513+
kopf.adopt(ingress_body)
502514
try:
503515
ext_api.create_namespaced_ingress(
504516
namespace, ingress_body, _request_timeout=_REQUEST_TIMEOUT
@@ -508,12 +520,24 @@ def create(spec: Dict[str, Any], name: str, namespace: str, **_: Any) -> Dict[st
508520
raise ex
509521
# Warn, but ignore and return a valid 'create' response now.
510522
logging.warning(
511-
"Got ApiException [409/Conflict] creating CONFIG ConfigMap. Ignoring [#10]"
523+
"Got ApiException [409/Conflict] creating CONFIG ConfigMap. Ignoring"
512524
)
513525

514526
logging.info("Created ingress")
515527

516528
# Done
517529
# ----
518-
519-
return create_response
530+
return {
531+
"notebook": {
532+
"url": f"http://{ingress_domain}{ingress_path}?token={token}",
533+
"token": token,
534+
"interface": notebook_interface,
535+
},
536+
"image": image,
537+
"serviceAccountName": service_account,
538+
"resources": {
539+
"requests": {"memory": memory_request},
540+
"limits": {"memory": memory_limit},
541+
},
542+
"project": {"claimName": project_claim_name, "id": project_id},
543+
}

0 commit comments

Comments
 (0)