Skip to content

Commit ab1d7c7

Browse files
authored
Builds: run a healthcheck command in the background (#12332)
Inside the Docker container that runs the whole build process, we execute a simple `curl` command that hits our API at `/build/<id>/healthcheck/` every `RTD_BUILD_HEALTHCHECK_DELAY` seconds to tell the backend that the build is healthy. The backend runs a periodic task every minute that checks for builds that haven't reported any activity in the last `RTD_BUILD_HEALTHCHECK_TIMEOUT` seconds and cancels them. Closes #11870
1 parent 52338ed commit ab1d7c7

File tree

9 files changed

+160
-1
lines changed

9 files changed

+160
-1
lines changed

readthedocs/api/v2/views/model_views.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from django.db.models import When
1414
from django.http import Http404
1515
from django.template.loader import render_to_string
16+
from django.utils import timezone
1617
from rest_framework import decorators
1718
from rest_framework import status
1819
from rest_framework import viewsets
@@ -297,6 +298,31 @@ def concurrent(self, request, **kwargs):
297298
}
298299
return Response(data)
299300

301+
@decorators.action(
    detail=True,
    permission_classes=[HasBuildAPIKey],
    methods=["post"],
)
def healthcheck(self, request, **kwargs):
    """
    Record a healthcheck ping for the build addressed by this request.

    The build's ``healthcheck`` timestamp is set to the current time.

    :param request: incoming request; ``request.build_api_key`` is read to
        validate the caller against the build's project.
    :returns: an empty ``204 No Content`` response.
    :raises Http404: when the project attached to the API key is not the
        project the build belongs to.
    """
    build = self.get_object()
    project_slug = build.version.project.slug
    log.debug(
        "Healthcheck received.",
        build_id=build.pk,
        project_slug=project_slug,
    )

    build_api_key = request.build_api_key
    if project_slug != build_api_key.project.slug:
        log.warning(
            "Project slug doesn't match the one attached to the API key.",
            api_key_id=build_api_key.id,
            project_slug=project_slug,
        )
        raise Http404()

    build.healthcheck = timezone.now()
    # Write only the ``healthcheck`` column. A full ``save()`` would write back
    # every field as it was when this request loaded the row, overwriting any
    # change made concurrently to the same build (pings arrive periodically
    # while the build is running).
    build.save(update_fields=["healthcheck"])
    return Response(status=status.HTTP_204_NO_CONTENT)
325+
300326
def retrieve(self, *args, **kwargs):
301327
"""
302328
Retrieves command data from storage.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Generated by Django 5.2.4 on 2025-07-17 11:39
2+
3+
from django.db import migrations
4+
from django.db import models
5+
from django_safemigrate import Safe
6+
7+
8+
class Migration(migrations.Migration):
    """Add the nullable ``healthcheck`` timestamp field to the ``build`` model."""

    # django-safemigrate marker for this migration.
    safe = Safe.before_deploy()

    dependencies = [("builds", "0063_alter_buildcommandresult")]

    operations = [
        migrations.AddField(
            model_name="build",
            name="healthcheck",
            field=models.DateTimeField(
                verbose_name="Healthcheck",
                null=True,
                blank=True,
            ),
        ),
    ]

readthedocs/builds/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,7 @@ class Build(models.Model):
644644
blank=True,
645645
)
646646
date = models.DateTimeField(_("Date"), auto_now_add=True, db_index=True)
647+
healthcheck = models.DateTimeField(_("Healthcheck"), null=True, blank=True)
647648
success = models.BooleanField(_("Success"), default=True)
648649

649650
# TODO: remove these fields (setup, setup_error, output, error, exit_code)

readthedocs/doc_builder/director.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def create_vcs_environment(self):
120120
environment=self.get_vcs_env_vars(),
121121
container_image=settings.RTD_DOCKER_CLONE_IMAGE,
122122
api_client=self.data.api_client,
123+
build_api_key=self.data.build_api_key,
123124
)
124125

125126
def create_build_environment(self):
@@ -130,6 +131,7 @@ def create_build_environment(self):
130131
build=self.data.build,
131132
environment=self.get_build_env_vars(),
132133
api_client=self.data.api_client,
134+
build_api_key=self.data.build_api_key,
133135
)
134136

135137
def setup_environment(self):

readthedocs/doc_builder/environments.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import structlog
1111
from django.conf import settings
12+
from django.urls import reverse
1213
from django.utils.translation import gettext_lazy as _
1314
from docker import APIClient
1415
from docker.errors import APIError as DockerAPIError
@@ -20,6 +21,7 @@
2021

2122
from readthedocs.builds.models import BuildCommandResultMixin
2223
from readthedocs.core.utils import slugify
24+
from readthedocs.projects.models import Feature
2325

2426
from .constants import DOCKER_HOSTNAME_MAX_LEN
2527
from .constants import DOCKER_IMAGE
@@ -583,6 +585,7 @@ class DockerBuildEnvironment(BaseBuildEnvironment):
583585
container_time_limit = DOCKER_LIMITS.get("time")
584586

585587
def __init__(self, *args, **kwargs):
588+
self.build_api_key = kwargs.pop("build_api_key", None)
586589
container_image = kwargs.pop("container_image", None)
587590
super().__init__(*args, **kwargs)
588591
self.client = None
@@ -839,7 +842,48 @@ def create_container(self):
839842
runtime="runsc", # gVisor runtime
840843
)
841844
client.start(container=self.container_id)
845+
846+
if self.project.has_feature(Feature.BUILD_HEALTHCHECK):
847+
self._run_background_healthcheck()
848+
842849
except (DockerAPIError, ConnectionError) as exc:
843850
raise BuildAppError(
844851
BuildAppError.GENERIC_WITH_BUILD_ID, exception_messag=exc.explanation
845852
) from exc
853+
854+
def _run_background_healthcheck(self):
    """
    Run a cURL loop in the background to ping the healthcheck API.

    The API saves the last ping timestamp on each call. A periodic Celery task
    then checks this value for all the running builds and decides whether each
    build is stalled. Stalled builds are terminated and marked as failed.

    The loop is started inside the build container with a detached Docker
    ``exec`` so it does not block the build itself.
    """
    log.debug("Running build with healthcheck.")

    if not self.build_api_key:
        # ``build_api_key`` defaults to ``None`` in ``__init__``. Without it the
        # loop would send ``Authorization: Token None`` on every iteration.
        log.warning("Build API key not available. Skipping background healthcheck.")
        return

    build_id = self.build.get("id")
    healthcheck_url = reverse("build-healthcheck", kwargs={"pk": build_id})
    if settings.RTD_DOCKER_COMPOSE and "ngrok" in settings.PRODUCTION_DOMAIN:
        # NOTE: NGROK is required here to go over the internet because there
        # was no known way to reach the `web` container from inside the
        # container that the `build` container created for this particular
        # build (three containers are involved locally: web, build, and the
        # user's build).
        #
        # This shouldn't happen in production, because we are not doing Docker in Docker.
        url = f"http://readthedocs.ngrok.io{healthcheck_url}"
    else:
        url = f"{settings.SLUMBER_API_HOST}{healthcheck_url}"

    delay = settings.RTD_BUILD_HEALTHCHECK_DELAY
    cmd = (
        "/bin/bash -c 'while true; do "
        f'curl --max-time 2 -H "Authorization: Token {self.build_api_key}" -X POST {url}; '
        f"sleep {delay}; done;'"
    )
    # Never log ``cmd`` itself: it embeds the build API key. Log only the
    # non-secret parts needed to debug the healthcheck.
    log.debug("Healthcheck command to run.", url=url, delay=delay)

    client = self.get_client()
    exec_cmd = client.exec_create(
        container=self.container_id,
        cmd=cmd,
        user=settings.RTD_DOCKER_USER,
        stdout=True,
        stderr=True,
    )
    # `detach=True` allows us to run this command in the background
    client.exec_start(exec_id=exec_cmd["Id"], stream=False, detach=True)

readthedocs/projects/models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1976,6 +1976,7 @@ def add_features(sender, **kwargs):
19761976
# Build related features
19771977
SCALE_IN_PROTECTION = "scale_in_prtection"
19781978
USE_S3_SCOPED_CREDENTIALS_ON_BUILDERS = "use_s3_scoped_credentials_on_builders"
1979+
BUILD_HEALTHCHECK = "build_healthcheck"
19791980

19801981
FEATURES = (
19811982
(
@@ -2050,6 +2051,10 @@ def add_features(sender, **kwargs):
20502051
USE_S3_SCOPED_CREDENTIALS_ON_BUILDERS,
20512052
_("Build: Use S3 scoped credentials for uploading build artifacts."),
20522053
),
2054+
(
2055+
BUILD_HEALTHCHECK,
2056+
_("Build: Use background cURL healthcheck."),
2057+
),
20532058
)
20542059

20552060
FEATURES = sorted(FEATURES, key=lambda x: x[1])

readthedocs/projects/tasks/builds.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ class TaskData:
108108

109109
# Slumber client to interact with the API v2.
110110
api_client: API = None
111+
build_api_key: str = None
111112

112113
start_time: timezone.datetime = None
113114
environment_class: type[DockerBuildEnvironment] | type[LocalBuildEnvironment] = None
@@ -381,7 +382,8 @@ def before_start(self, task_id, args, kwargs):
381382
# anymore and we are not using it
382383
self.data.environment_class = LocalBuildEnvironment
383384

384-
self.data.api_client = setup_api(kwargs["build_api_key"])
385+
self.data.build_api_key = kwargs["build_api_key"]
386+
self.data.api_client = setup_api(self.data.build_api_key)
385387

386388
self.data.build = self.get_build(self.data.build_pk)
387389
self.data.version = self.get_version(self.data.version_pk)

readthedocs/projects/tasks/utils.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from readthedocs.core.utils.filesystem import safe_rmtree
1818
from readthedocs.doc_builder.exceptions import BuildAppError
1919
from readthedocs.notifications.models import Notification
20+
from readthedocs.projects.models import Feature
2021
from readthedocs.storage import build_media_storage
2122
from readthedocs.worker import app
2223

@@ -95,6 +96,53 @@ def clean_project_resources(project, version=None, version_slug=None):
9596
project.imported_files.all().delete()
9697

9798

99+
@app.task()
def finish_unhealthy_builds():
    """
    Finish unhealthy builds.

    A build is considered unhealthy if its last reported healthcheck is older
    than ``RTD_BUILD_HEALTHCHECK_TIMEOUT`` seconds, it is not in a final state,
    and its project has the ``BUILD_HEALTHCHECK`` feature enabled.

    These builds are marked as ``success=False`` and ``state=CANCELLED``, their
    Celery task is revoked, and a notification is attached to the build to
    communicate the reason to the user. At most 50 builds are handled per run.
    """
    log.debug("Running task to finish inactive builds (no healtcheck received).")
    delta = datetime.timedelta(seconds=settings.RTD_BUILD_HEALTHCHECK_TIMEOUT)
    query = (
        ~Q(state__in=BUILD_FINAL_STATES)
        & Q(healthcheck__lt=timezone.now() - delta)
        & Q(project__feature__feature_id=Feature.BUILD_HEALTHCHECK)
    )

    projects_finished = set()
    builds_finished = []
    # ``select_related`` fetches the project in the same query, so reading
    # ``build.project.slug`` below does not cost one extra query per build.
    builds = Build.objects.filter(query).select_related("project")[:50]
    for build in builds:
        build.success = False
        build.state = BUILD_STATE_CANCELLED
        # Write only the two fields changed here. A full ``save()`` would write
        # back every column as loaded by the query above, overwriting anything
        # updated on the row in the meantime (e.g. a late healthcheck ping).
        build.save(update_fields=["success", "state"])

        # Tell Celery to cancel this task in case it's in a zombie state.
        app.control.revoke(build.task_id, signal="SIGINT", terminate=True)

        Notification.objects.add(
            message_id=BuildAppError.BUILD_TERMINATED_DUE_INACTIVITY,
            attached_to=build,
        )

        builds_finished.append(build.pk)
        projects_finished.add(build.project.slug)

    if builds_finished:
        log.info(
            'Builds marked as "Terminated due inactivity" (not healthcheck received).',
            count=len(builds_finished),
            project_slugs=projects_finished,
            build_pks=builds_finished,
        )
144+
145+
98146
@app.task()
99147
def finish_inactive_builds():
100148
"""
@@ -118,6 +166,7 @@ def finish_inactive_builds():
118166
~Q(state__in=BUILD_FINAL_STATES)
119167
& Q(date__lt=timezone.now() - delta)
120168
& Q(date__gt=timezone.now() - datetime.timedelta(days=1))
169+
& ~Q(project__feature__feature_id=Feature.BUILD_HEALTHCHECK)
121170
)
122171

123172
projects_finished = set()

readthedocs/settings/base.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ def SHOW_DEBUG_TOOLBAR(self):
135135
RTD_STABLE = "stable"
136136
RTD_STABLE_VERBOSE_NAME = "stable"
137137
RTD_CLEAN_AFTER_BUILD = False
138+
RTD_BUILD_HEALTHCHECK_TIMEOUT = 60 # seconds
139+
RTD_BUILD_HEALTHCHECK_DELAY = 15 # seconds
138140
RTD_MAX_CONCURRENT_BUILDS = 4
139141
RTD_BUILDS_MAX_RETRIES = 25
140142
RTD_BUILDS_RETRY_DELAY = 5 * 60 # seconds
@@ -603,6 +605,13 @@ def TEMPLATES(self):
603605
CELERY_DEFAULT_QUEUE = "celery"
604606
CELERYBEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
605607
CELERYBEAT_SCHEDULE = {
608+
"every-minute-finish-unhealthy-builds": {
609+
"task": "readthedocs.projects.tasks.utils.finish_unhealthy_builds",
610+
"schedule": crontab(minute="*"),
611+
"options": {"queue": "web"},
612+
},
613+
# TODO: delete `quarter-finish-inactive-builds` once we are fully
614+
# migrated into build healthcheck
606615
"quarter-finish-inactive-builds": {
607616
"task": "readthedocs.projects.tasks.utils.finish_inactive_builds",
608617
"schedule": crontab(minute="*/15"),

0 commit comments

Comments
 (0)