Skip to content

Commit c3dca48

Browse files
authored
Hotfix/SS-1807 patch inconsistent deployment (#478)
1 parent 82686b7 commit c3dca48

File tree

4 files changed

+242
-18
lines changed

4 files changed

+242
-18
lines changed

apps/helpers.py

Lines changed: 84 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -312,8 +312,15 @@ def create_instance_from_form(form, project, app_slug, app_id=None, force_redepl
312312
assert project is not None, "This function requires a project object"
313313

314314
new_app = app_id is None
315+
requested_app_slug = app_slug
315316

316-
logger.debug(f"Creating or updating a user app via UI form for app_id={app_id}, new_app={new_app}")
317+
logger.info(
318+
"create_instance_from_form.start app_id=%s new_app=%s app_slug=%s project_id=%s",
319+
app_id,
320+
new_app,
321+
app_slug,
322+
project.pk,
323+
)
317324

318325
if new_app:
319326
do_deploy = True
@@ -348,6 +355,12 @@ def create_instance_from_form(form, project, app_slug, app_id=None, force_redepl
348355
break
349356

350357
subdomain_name, is_created_by_user = get_subdomain_name(form)
358+
logger.info(
359+
"create_instance_from_form.subdomain_selected app_id=%s subdomain=%s is_created_by_user=%s",
360+
app_id,
361+
subdomain_name,
362+
is_created_by_user,
363+
)
351364

352365
instance = form.save(commit=False)
353366

@@ -361,11 +374,23 @@ def create_instance_from_form(form, project, app_slug, app_id=None, force_redepl
361374
subdomain = Subdomain.objects.get(subdomain=subdomain_name, project=project, is_created_by_user=is_created_by_user)
362375
assert subdomain is not None
363376
assert subdomain.subdomain == subdomain_name
377+
logger.info(
378+
"create_instance_from_form.subdomain_ready app_id=%s subdomain=%s created=%s",
379+
app_id,
380+
subdomain_name,
381+
created,
382+
)
364383

365384
if not new_app:
366385
handle_subdomain_change(instance, subdomain, subdomain_name)
367386

368387
app_slug = handle_shiny_proxy_case(instance, app_slug, app_id)
388+
logger.info(
389+
"create_instance_from_form.app_slug_resolved app_id=%s requested_slug=%s resolved_slug=%s",
390+
app_id,
391+
requested_app_slug,
392+
app_slug,
393+
)
369394

370395
app = get_app(app_slug)
371396

@@ -378,12 +403,37 @@ def create_instance_from_form(form, project, app_slug, app_id=None, force_redepl
378403

379404
setup_instance(instance, subdomain, app, project, user_action)
380405
instance_id = save_instance_and_related_data(instance, form)
406+
logger.info(
407+
"create_instance_from_form.instance_saved app_id=%s instance_id=%s user_action=%s do_deploy=%s",
408+
app_id,
409+
instance_id,
410+
user_action,
411+
do_deploy,
412+
)
381413

382414
if do_deploy:
383-
logger.debug(f"Now deploying resource app with app_id = {app_id}")
384-
deploy_resource.delay(instance.serialize())
415+
serialized_instance = instance.serialize()
416+
logger.info(
417+
"create_instance_from_form.enqueue_on_commit app_id=%s instance_id=%s model=%s pk=%s",
418+
app_id,
419+
instance_id,
420+
serialized_instance.get("model"),
421+
serialized_instance.get("pk"),
422+
)
423+
424+
def enqueue_deploy_task():
425+
logger.info(
426+
"create_instance_from_form.enqueue_dispatch app_id=%s instance_id=%s model=%s pk=%s",
427+
app_id,
428+
instance_id,
429+
serialized_instance.get("model"),
430+
serialized_instance.get("pk"),
431+
)
432+
deploy_resource.delay(serialized_instance)
433+
434+
transaction.on_commit(enqueue_deploy_task)
385435
else:
386-
logger.debug(f"Not re-deploying this app with app_id = {app_id}")
436+
logger.info("create_instance_from_form.deploy_skipped app_id=%s instance_id=%s", app_id, instance_id)
387437

388438
if waffle.switch_is_active("doi_minting_using_invenio"):
389439
image_value_changed = False
@@ -485,10 +535,16 @@ def handle_subdomain_change(instance: Any, subdomain: str, subdomain_name: str)
485535
from .tasks import delete_resource
486536

487537
assert instance is not None, "instance is required"
538+
logger.info(
539+
"handle_subdomain_change.start instance_id=%s current_subdomain=%s requested_subdomain=%s",
540+
instance.pk,
541+
instance.subdomain.subdomain if instance.subdomain else None,
542+
subdomain_name,
543+
)
488544

489545
if instance.subdomain is None:
490546
# The subdomain is not yet created, nothing to do
491-
logger.debug("The subdomain is not yet created, nothing to do")
547+
logger.info("handle_subdomain_change.skip_no_existing_subdomain instance_id=%s", instance.pk)
492548
return
493549

494550
if instance.subdomain.subdomain != subdomain_name:
@@ -498,8 +554,21 @@ def handle_subdomain_change(instance: Any, subdomain: str, subdomain_name: str)
498554
old_subdomain = instance.subdomain
499555
instance.subdomain = subdomain
500556
instance.save(update_fields=["subdomain"])
557+
logger.info(
558+
"handle_subdomain_change.updated instance_id=%s old_subdomain=%s new_subdomain=%s",
559+
instance.pk,
560+
old_subdomain.subdomain if old_subdomain else None,
561+
subdomain_name,
562+
)
501563
if old_subdomain and not old_subdomain.is_created_by_user:
502564
old_subdomain.delete()
565+
logger.info(
566+
"handle_subdomain_change.deleted_old_subdomain instance_id=%s old_subdomain=%s",
567+
instance.pk,
568+
old_subdomain.subdomain,
569+
)
570+
else:
571+
logger.info("handle_subdomain_change.no_change instance_id=%s subdomain=%s", instance.pk, subdomain_name)
503572

504573

505574
def handle_shiny_proxy_case(instance, app_slug, app_id):
@@ -526,6 +595,14 @@ def setup_instance(instance, subdomain, app, project, user_action=None, is_creat
526595
instance.project = project
527596
instance.owner = project.owner
528597
instance.latest_user_action = user_action
598+
logger.info(
599+
"setup_instance.assigned instance_id=%s subdomain=%s app_slug=%s project_id=%s user_action=%s",
600+
instance.pk,
601+
subdomain.subdomain if subdomain else None,
602+
app.slug if app else None,
603+
project.pk if project else None,
604+
user_action,
605+
)
529606

530607

531608
def save_instance_and_related_data(instance: Any, form: Any) -> int:
@@ -535,13 +612,15 @@ def save_instance_and_related_data(instance: Any, form: Any) -> int:
535612
Returns:
536613
- int: The Id of the new or updated app instance.
537614
"""
615+
logger.info("save_instance_and_related_data.start instance_id=%s", instance.pk)
538616
instance.save()
539617
form.save_m2m()
540618
instance.set_k8s_values()
541619
instance.url = get_URI(instance)
542620
# For MLFLOW, we need to set the k8s_values again to update the URL
543621
instance.set_k8s_values()
544622
instance.save(update_fields=["k8s_values", "url"])
623+
logger.info("save_instance_and_related_data.finish instance_id=%s url=%s", instance.id, instance.url)
545624
return instance.id
546625

547626

apps/tasks.py

Lines changed: 143 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,25 @@
2424
logger = get_logger(__name__)
2525

2626
CHART_REGEX = re.compile(r"^(?P<chart>.+):(?P<version>.+)$")
27+
DEPLOY_RESOURCE_MAX_RETRIES = 3
28+
DEPLOY_RESOURCE_RETRY_BASE_SECONDS = 10
29+
DEPLOY_RESOURCE_RETRY_MAX_SECONDS = 30
30+
31+
32+
class MissingSerializedInstanceError(ValueError):
33+
"""Raised when a serialized model/pk cannot be resolved from the database."""
34+
35+
def __init__(self, model: str, pk: int, base_instance_exists: bool):
36+
self.model = model
37+
self.pk = pk
38+
self.base_instance_exists = base_instance_exists
39+
super().__init__(
40+
f"No instance found for model {model} with pk {pk} (base_instance_exists={base_instance_exists})"
41+
)
42+
43+
44+
def _retry_countdown(current_retries: int) -> int:
45+
return min(DEPLOY_RESOURCE_RETRY_BASE_SECONDS * (2**current_retries), DEPLOY_RESOURCE_RETRY_MAX_SECONDS)
2746

2847

2948
@app.task
@@ -198,11 +217,71 @@ def get_manifest_yaml(release_name: str, namespace: str = "default") -> tuple[st
198217
return e.stdout, e.stderr
199218

200219

201-
@shared_task
220+
@shared_task(bind=True, max_retries=DEPLOY_RESOURCE_MAX_RETRIES)
202221
@transaction.atomic
203-
def deploy_resource(serialized_instance):
204-
instance: BaseAppInstance = deserialize(serialized_instance)
205-
logger.info("Deploying resource for instance %s", instance)
222+
def deploy_resource(self, serialized_instance):
223+
model = serialized_instance.get("model") if isinstance(serialized_instance, dict) else None
224+
pk = serialized_instance.get("pk") if isinstance(serialized_instance, dict) else None
225+
task_id = getattr(self.request, "id", None)
226+
227+
logger.info(
228+
"deploy_resource.start task_id=%s model=%s pk=%s retry=%s",
229+
task_id,
230+
model,
231+
pk,
232+
self.request.retries,
233+
)
234+
235+
try:
236+
instance: BaseAppInstance = deserialize(serialized_instance)
237+
except MissingSerializedInstanceError as exc:
238+
retries = self.request.retries
239+
if retries < self.max_retries:
240+
countdown = _retry_countdown(retries)
241+
logger.warning(
242+
"deploy_resource.missing_instance_retry task_id=%s model=%s pk=%s retry=%s/%s "
243+
"countdown=%ss base_instance_exists=%s",
244+
task_id,
245+
exc.model,
246+
exc.pk,
247+
retries + 1,
248+
self.max_retries,
249+
countdown,
250+
exc.base_instance_exists,
251+
)
252+
raise self.retry(exc=exc, countdown=countdown)
253+
254+
logger.error(
255+
"deploy_resource.missing_instance_exhausted task_id=%s model=%s pk=%s retries=%s "
256+
"base_instance_exists=%s",
257+
task_id,
258+
exc.model,
259+
exc.pk,
260+
retries,
261+
exc.base_instance_exists,
262+
)
263+
raise
264+
265+
logger.info(
266+
"deploy_resource.instance_resolved task_id=%s model=%s pk=%s instance_id=%s app_slug=%s",
267+
task_id,
268+
model,
269+
pk,
270+
instance.pk,
271+
instance.app.slug,
272+
)
273+
274+
deleted_on = getattr(instance, "deleted_on", None)
275+
if instance.latest_user_action in {"Deleting", "SystemDeleting"} or deleted_on is not None:
276+
logger.info(
277+
"deploy_resource.skip_deleting task_id=%s instance_id=%s latest_user_action=%s deleted_on=%s",
278+
task_id,
279+
instance.pk,
280+
instance.latest_user_action,
281+
deleted_on,
282+
)
283+
return
284+
206285
values = get_merged_k8s_values(instance, ensure_up_to_date=True)
207286
release = values["subdomain"]
208287
chart: str = instance.chart
@@ -263,17 +342,59 @@ def deploy_resource(serialized_instance):
263342
if not valid_deployment:
264343
logger.warning(f"The deployment manifest file is INVALID for release {release}. {validation_output}")
265344

345+
logger.info(
346+
"deploy_resource.helm_install_start task_id=%s instance_id=%s release=%s namespace=%s chart=%s version=%s",
347+
task_id,
348+
instance.pk,
349+
release,
350+
values["namespace"],
351+
chart,
352+
version,
353+
)
354+
266355
# Install the app using Helm install
267356
output, error = helm_install(release, chart, values["namespace"], values_file, version)
268357
success = not error
269358

359+
if not success:
360+
retries = self.request.retries
361+
logger.warning(
362+
"deploy_resource.helm_install_failed task_id=%s instance_id=%s retry=%s/%s release=%s stderr=%s",
363+
task_id,
364+
instance.pk,
365+
retries,
366+
self.max_retries,
367+
release,
368+
error,
369+
)
370+
if retries < self.max_retries:
371+
countdown = _retry_countdown(retries)
372+
logger.info(
373+
"deploy_resource.helm_install_retry task_id=%s instance_id=%s retry=%s/%s countdown=%ss",
374+
task_id,
375+
instance.pk,
376+
retries + 1,
377+
self.max_retries,
378+
countdown,
379+
)
380+
raise self.retry(exc=RuntimeError(error or "Helm install failed"), countdown=countdown)
381+
382+
logger.info(
383+
"deploy_resource.helm_install_done task_id=%s instance_id=%s success=%s release=%s",
384+
task_id,
385+
instance.pk,
386+
success,
387+
release,
388+
)
389+
270390
helm_info = {"success": success, "info": {"stdout": output, "stderr": error}}
271391

272392
instance.info = dict(helm=helm_info)
273393
# instance.app_status.status = "Created" if success else "Failed"
274394

275395
# Only update the info field to avoid overriding other modified fields elsewhere
276396
instance.save(update_fields=["info"])
397+
logger.info("deploy_resource.info_saved task_id=%s instance_id=%s success=%s", task_id, instance.pk, success)
277398

278399
# In development, also generate and validate the k8s deployment manifest
279400
if settings.DEBUG:
@@ -287,6 +408,14 @@ def deploy_resource(serialized_instance):
287408
if deployment_file:
288409
subprocess.run(["rm", "-f", deployment_file])
289410

411+
logger.info(
412+
"deploy_resource.finish task_id=%s instance_id=%s success=%s valid_deployment=%s",
413+
task_id,
414+
instance.pk,
415+
success,
416+
valid_deployment,
417+
)
418+
290419

291420
@shared_task
292421
@transaction.atomic
@@ -303,7 +432,13 @@ def delete_resource(serialized_instance, initiated_by_str: str):
303432
- serialized_instance: A serialized version of the app to be deleted.
304433
- initiated_by_str: A string of enum AppActionOrigin indicating the source of the deletion (user|system).
305434
"""
306-
logger.debug(f"Type of serialized_instance is {type(serialized_instance)}")
435+
logger.info(
436+
"delete_resource.start model=%s pk=%s initiated_by=%s payload_type=%s",
437+
serialized_instance.get("model") if isinstance(serialized_instance, dict) else None,
438+
serialized_instance.get("pk") if isinstance(serialized_instance, dict) else None,
439+
initiated_by_str,
440+
type(serialized_instance),
441+
)
307442

308443
initiated_by = AppActionOrigin(initiated_by_str)
309444
assert initiated_by == AppActionOrigin.USER or initiated_by == AppActionOrigin.SYSTEM
@@ -362,12 +497,14 @@ def deserialize(serialized_instance):
362497

363498
model_class = apps.get_model(app_label, model_name)
364499
instance = model_class.objects.get(pk=pk)
500+
logger.info("deserialize.resolved model=%s pk=%s concrete_model=%s", model, pk, model_class.__name__)
365501

366502
return instance
367503
except (KeyError, ValueError) as e:
368504
raise ValueError(f"Invalid serialized data format: {e}")
369505
except ObjectDoesNotExist:
370-
raise ValueError(f"No instance found for model {model} with pk {pk}")
506+
base_instance_exists = BaseAppInstance.objects.filter(pk=pk).exists()
507+
raise MissingSerializedInstanceError(model=model, pk=pk, base_instance_exists=base_instance_exists)
371508

372509

373510
@app.task

0 commit comments

Comments
 (0)