Skip to content

Commit 8fd1dd6

Browse files
committed
[server] Introduce special /ready handler that only returns "false" during the shutdown phase
Tool: gitpod/catfood.gitpod.cloud
1 parent e43a38d commit 8fd1dd6

File tree

5 files changed

+76
-1
lines changed

5 files changed

+76
-1
lines changed

components/server/src/container-module.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ import { InstallationAdminCleanup } from "./jobs/installation-admin-cleanup";
137137
import { AuditLogService } from "./audit/AuditLogService";
138138
import { AuditLogGarbageCollectorJob } from "./jobs/auditlog-gc";
139139
import { ProbesApp } from "./liveness/probes";
140+
import { ReadinessController } from "./liveness/readiness-controller";
140141

141142
export const productionContainerModule = new ContainerModule(
142143
(bind, unbind, isBound, rebind, unbindAsync, onActivation, onDeactivation) => {
@@ -246,6 +247,7 @@ export const productionContainerModule = new ContainerModule(
246247
bind(ProbesApp).toSelf().inSingletonScope();
247248
bind(LivenessController).toSelf().inSingletonScope();
248249
bind(StartupController).toSelf().inSingletonScope();
250+
bind(ReadinessController).toSelf().inSingletonScope();
249251

250252
bind(OneTimeSecretServer).toSelf().inSingletonScope();
251253

components/server/src/liveness/probes.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import { inject, injectable } from "inversify";
1010
import { LivenessController } from "./liveness-controller";
1111
import { StartupController } from "./startup-controller";
1212
import { AddressInfo } from "net";
13+
import { ReadinessController } from "./readiness-controller";
1314

1415
@injectable()
1516
export class ProbesApp {
@@ -19,10 +20,12 @@ export class ProbesApp {
1920
constructor(
2021
@inject(LivenessController) protected readonly livenessController: LivenessController,
2122
@inject(StartupController) protected readonly startupController: StartupController,
23+
@inject(ReadinessController) protected readonly readinessController: ReadinessController,
2224
) {
2325
const probesApp = express();
2426
probesApp.use("/live", this.livenessController.apiRouter);
2527
probesApp.use("/startup", this.startupController.apiRouter);
28+
probesApp.use("/ready", this.readinessController.apiRouter);
2629
this.app = probesApp;
2730
}
2831

@@ -35,6 +38,10 @@ export class ProbesApp {
3538
});
3639
}
3740

41+
public notifyShutdown(): void {
42+
this.readinessController.notifyShutdown();
43+
}
44+
3845
public async stop(): Promise<void> {
3946
this.httpServer?.close();
4047
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/**
2+
* Copyright (c) 2025 Gitpod GmbH. All rights reserved.
3+
* Licensed under the GNU Affero General Public License (AGPL).
4+
* See License.AGPL.txt in the project root for license information.
5+
*/
6+
7+
import { injectable } from "inversify";
8+
import express from "express";
9+
import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
10+
11+
/**
12+
* ReadinessController mimics the behavior the server had in the past: behave as if there were no readiness probe - except during shutdown.
13+
*
14+
* Why? In Gitpod, our error strategy has always been "keep it local and retry", instead of "fail loud and have someone else handle it".
15+
* As we don't want to change this now, we keep the same behavior for most of the service's lifetime.
16+
*
17+
* Only during shutdown, we want to signal that the service is not ready anymore, to reduce error peaks.
18+
*/
19+
@injectable()
export class ReadinessController {
    /** True once shutdown has been announced via notifyShutdown(); nothing in this class resets it. */
    private shutdown = false;

    /**
     * Router exposing the readiness endpoint at its root path (`GET /`).
     *
     * A new router instance is built on every access.
     */
    get apiRouter(): express.Router {
        const router = express.Router();
        this.addReadinessHandler(router);
        return router;
    }

    /** Marks the server as shutting down; subsequent readiness checks answer with 503. */
    public notifyShutdown(): void {
        this.shutdown = true;
    }

    /**
     * Registers the `GET /` readiness handler on the given router.
     *
     * The handler answers 503 ("Server is shutting down") after notifyShutdown() was called,
     * and 200 ("Ready") otherwise.
     *
     * @param router the router to attach the handler to
     */
    protected addReadinessHandler(router: express.Router): void {
        // Intentionally a synchronous handler: nothing here awaits, and an exception thrown from
        // a synchronous handler is caught by Express, whereas a rejected promise returned from an
        // `async` handler is not handled by Express 4 and would leave the probe request hanging.
        router.get("/", (_, res) => {
            if (this.shutdown) {
                log.warn("Readiness check failed: Server is shutting down");
                res.status(503).send("Server is shutting down");
                return;
            }

            res.status(200).send("Ready");
            log.debug("Readiness check successful");
        });
    }
}

components/server/src/server.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,9 @@ export class Server {
387387
}
388388

389389
public async stop() {
390+
// mark as not-ready
391+
this.probesApp.notifyShutdown();
392+
390393
// run each stop with a timeout of 30s
391394
async function race(workLoad: Promise<any>, task: string, ms: number = 30 * 1000): Promise<void> {
392395
const before = Date.now();
@@ -413,10 +416,13 @@ export class Server {
413416
race(this.stopServer(this.httpServer), "stop httpserver"),
414417
race(this.stopServer(this.privateApiServer), "stop private api server"),
415418
race(this.stopServer(this.publicApiServer), "stop public api server"),
416-
race(this.probesApp.stop(), "stop probe server"),
417419
race((async () => this.disposables.dispose())(), "dispose disposables"),
418420
]);
419421

422+
this.probesApp.stop().catch(() => {
423+
/* ignore */
424+
});
425+
420426
log.info("server stopped.");
421427
}
422428

install/installer/pkg/components/server/deployment.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,21 @@ func deployment(ctx *common.RenderContext) ([]runtime.Object, error) {
387387
PeriodSeconds: 10,
388388
FailureThreshold: 18, // try for 180 seconds, then the Pod is restarted
389389
},
390+
// /ready will only return false on shutdown (SIGTERM), always true otherwise
391+
ReadinessProbe: &corev1.Probe{
392+
ProbeHandler: corev1.ProbeHandler{
393+
HTTPGet: &corev1.HTTPGetAction{
394+
Path: "/ready",
395+
Port: intstr.IntOrString{
396+
Type: intstr.Int,
397+
IntVal: ProbesPort,
398+
},
399+
},
400+
},
401+
InitialDelaySeconds: 5,
402+
PeriodSeconds: 5,
403+
FailureThreshold: 1, // mark as "not ready" as quick as possible after receiving SIGTERM
404+
},
390405
SecurityContext: &corev1.SecurityContext{
391406
Privileged: pointer.Bool(false),
392407
AllowPrivilegeEscalation: pointer.Bool(false),

0 commit comments

Comments
 (0)