diff --git a/components/server/src/container-module.ts b/components/server/src/container-module.ts index 0a1af2352bc012..5164b779a1529c 100644 --- a/components/server/src/container-module.ts +++ b/components/server/src/container-module.ts @@ -71,7 +71,7 @@ import { WebhookEventGarbageCollector } from "./jobs/webhook-gc"; import { WorkspaceGarbageCollector } from "./jobs/workspace-gc"; import { LinkedInService } from "./linkedin-service"; import { LivenessController } from "./liveness/liveness-controller"; -import { ReadinessController } from "./liveness/readiness-controller"; +import { StartupController } from "./liveness/startup-controller"; import { RedisSubscriber } from "./messaging/redis-subscriber"; import { MonitoringEndpointsApp } from "./monitoring-endpoints"; import { OAuthController } from "./oauth-server/oauth-controller"; @@ -137,6 +137,7 @@ import { InstallationAdminCleanup } from "./jobs/installation-admin-cleanup"; import { AuditLogService } from "./audit/AuditLogService"; import { AuditLogGarbageCollectorJob } from "./jobs/auditlog-gc"; import { ProbesApp } from "./liveness/probes"; +import { ReadinessController } from "./liveness/readiness-controller"; export const productionContainerModule = new ContainerModule( (bind, unbind, isBound, rebind, unbindAsync, onActivation, onDeactivation) => { @@ -245,6 +246,7 @@ export const productionContainerModule = new ContainerModule( bind(ProbesApp).toSelf().inSingletonScope(); bind(LivenessController).toSelf().inSingletonScope(); + bind(StartupController).toSelf().inSingletonScope(); bind(ReadinessController).toSelf().inSingletonScope(); bind(OneTimeSecretServer).toSelf().inSingletonScope(); diff --git a/components/server/src/liveness/probes.ts b/components/server/src/liveness/probes.ts index 46cd4860f26cf9..0371c2e9ba9503 100644 --- a/components/server/src/liveness/probes.ts +++ b/components/server/src/liveness/probes.ts @@ -8,8 +8,9 @@ import * as http from "http"; import express from "express"; import { inject, injectable } from "inversify"; import { LivenessController } from "./liveness-controller"; -import { ReadinessController } from "./readiness-controller"; +import { StartupController } from "./startup-controller"; import { AddressInfo } from "net"; +import { ReadinessController } from "./readiness-controller"; @injectable() export class ProbesApp { @@ -18,17 +19,17 @@ export class ProbesApp { constructor( @inject(LivenessController) protected readonly livenessController: LivenessController, + @inject(StartupController) protected readonly startupController: StartupController, @inject(ReadinessController) protected readonly readinessController: ReadinessController, ) { const probesApp = express(); probesApp.use("/live", this.livenessController.apiRouter); + probesApp.use("/startup", this.startupController.apiRouter); probesApp.use("/ready", this.readinessController.apiRouter); this.app = probesApp; } public async start(port: number): Promise { - await this.readinessController.start(); - return new Promise((resolve, reject) => { const probeServer = this.app.listen(port, () => { resolve((probeServer.address()).port); @@ -37,8 +38,11 @@ export class ProbesApp { }); } + public notifyShutdown(): void { + this.readinessController.notifyShutdown(); + } + public async stop(): Promise { this.httpServer?.close(); - await this.readinessController.stop(); } } diff --git a/components/server/src/liveness/readiness-controller.ts b/components/server/src/liveness/readiness-controller.ts index e75982eef70c58..34e60daefd967e 100644 --- a/components/server/src/liveness/readiness-controller.ts +++ b/components/server/src/liveness/readiness-controller.ts @@ -4,25 +4,21 @@ * See License.AGPL.txt in the project root for license information. */ -import { injectable, inject } from "inversify"; +import { injectable } from "inversify"; import express from "express"; -import { TypeORM } from "@gitpod/gitpod-db/lib"; -import { SpiceDBClientProvider } from "../authorization/spicedb"; import { log } from "@gitpod/gitpod-protocol/lib/util/logging"; -import { v1 } from "@authzed/authzed-node"; -import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server"; -import { Redis } from "ioredis"; -import { repeat } from "@gitpod/gitpod-protocol/lib/util/repeat"; -import { Disposable, DisposableCollection } from "@gitpod/gitpod-protocol"; +/** + * ReadinessController is mimicking the behavior server had in the past: Behave as there is not ready probe - except during shutdown. + * + * Why? In Gitpod, our error strategy has always been "keep it local and retry", instead of "fail loud and have someone else handle it". + * As we don't want to change this now, we keep the same behavior for most of the services lifetime. + * + * Only during shutdown, we want to signal that the service is not ready anymore, to reduce error peaks. + */ @injectable() export class ReadinessController { - @inject(TypeORM) protected readonly typeOrm: TypeORM; - @inject(SpiceDBClientProvider) protected readonly spiceDBClientProvider: SpiceDBClientProvider; - @inject(Redis) protected readonly redis: Redis; - - private readinessProbeEnabled: boolean = true; - private disposables: DisposableCollection = new DisposableCollection(); + private shutdown: boolean = false; get apiRouter(): express.Router { const router = express.Router(); @@ -30,112 +26,20 @@ export class ReadinessController { return router; } - public async start() { - this.disposables.push(this.startPollingFeatureFlag()); - } - - public async stop() { - this.disposables.dispose(); + public notifyShutdown(): void { + this.shutdown = true; } protected addReadinessHandler(router: express.Router) { router.get("/", async (_, res) => { - try { - if (!this.readinessProbeEnabled) { - log.debug("Readiness check skipped due to feature flag"); - res.status(200); - return; - } - - // Check database connection - const dbConnection = await this.checkDatabaseConnection(); - if (!dbConnection) { - log.warn("Readiness check failed: Database connection failed"); - res.status(503).send("Database connection failed"); - return; - } - - // Check SpiceDB connection - const spiceDBConnection = await this.checkSpiceDBConnection(); - if (!spiceDBConnection) { - log.warn("Readiness check failed: SpiceDB connection failed"); - res.status(503).send("SpiceDB connection failed"); - return; - } - - // Check Redis connection - const redisConnection = await this.checkRedisConnection(); - if (!redisConnection) { - log.warn("Readiness check failed: Redis connection failed"); - res.status(503).send("Redis connection failed"); - return; - } - - // All connections are good - res.status(200).send("Ready"); - log.debug("Readiness check successful"); - } catch (error) { - log.error("Readiness check failed", error); - res.status(503).send("Readiness check failed"); + if (this.shutdown) { + log.warn("Readiness check failed: Server is shutting down"); + res.status(503).send("Server is shutting down"); + return; } - }); - } - private async checkDatabaseConnection(): Promise { - try { - const connection = await this.typeOrm.getConnection(); - // Simple query to verify connection is working - await connection.query("SELECT 1"); - log.debug("Database connection check successful"); - return true; - } catch (error) { - log.error("Database connection check failed", error); - return false; - } - } - - private async checkSpiceDBConnection(): Promise { - try { - const client = this.spiceDBClientProvider.getClient(); - - // Send a request, to verify that the connection works - const req = v1.ReadSchemaRequest.create({}); - const response = await client.readSchema(req); - log.debug("SpiceDB connection check successful", { schemaLength: response.schemaText.length }); - - return true; - } catch (error) { - log.error("SpiceDB connection check failed", error); - return false; - } - } - - private async checkRedisConnection(): Promise { - try { - // Simple PING command to verify connection is working - const result = await this.redis.ping(); - log.debug("Redis connection check successful", { result }); - return result === "PONG"; - } catch (error) { - log.error("Redis connection check failed", error); - return false; - } - } - - private startPollingFeatureFlag(): Disposable { - return repeat(async () => { - // Check feature flag first - const readinessProbeEnabled = await getExperimentsClientForBackend().getValueAsync( - "server_readiness_probe", - true, // Default to readiness probe, skip if false - {}, - ); - - log.debug("Feature flag server_readiness_probe updated", { - readinessProbeEnabled, - oldValue: this.readinessProbeEnabled, - }); - this.readinessProbeEnabled = readinessProbeEnabled; - }, 10_000); + res.status(200).send("Ready"); + log.debug("Readiness check successful"); + }); } } diff --git a/components/server/src/liveness/startup-controller.ts b/components/server/src/liveness/startup-controller.ts new file mode 100644 index 00000000000000..bef15454a34dfd --- /dev/null +++ b/components/server/src/liveness/startup-controller.ts @@ -0,0 +1,104 @@ +/** + * Copyright (c) 2025 Gitpod GmbH. All rights reserved. + * Licensed under the GNU Affero General Public License (AGPL). + * See License.AGPL.txt in the project root for license information. + */ + +import { injectable, inject } from "inversify"; +import express from "express"; +import { TypeORM } from "@gitpod/gitpod-db/lib"; +import { SpiceDBClientProvider } from "../authorization/spicedb"; +import { log } from "@gitpod/gitpod-protocol/lib/util/logging"; +import { v1 } from "@authzed/authzed-node"; +import { Redis } from "ioredis"; + +@injectable() +export class StartupController { + @inject(TypeORM) protected readonly typeOrm: TypeORM; + @inject(SpiceDBClientProvider) protected readonly spiceDBClientProvider: SpiceDBClientProvider; + @inject(Redis) protected readonly redis: Redis; + + get apiRouter(): express.Router { + const router = express.Router(); + this.addStartupHandler(router); + return router; + } + + protected addStartupHandler(router: express.Router) { + router.get("/", async (_, res) => { + try { + // Check database connection + const dbConnection = await this.checkDatabaseConnection(); + if (!dbConnection) { + log.warn("Startup check failed: Database connection failed"); + res.status(503).send("Database connection failed"); + return; + } + + // Check SpiceDB connection + const spiceDBConnection = await this.checkSpiceDBConnection(); + if (!spiceDBConnection) { + log.warn("Startup check failed: SpiceDB connection failed"); + res.status(503).send("SpiceDB connection failed"); + return; + } + + // Check Redis connection + const redisConnection = await this.checkRedisConnection(); + if (!redisConnection) { + log.warn("Startup check failed: Redis connection failed"); + res.status(503).send("Redis connection failed"); + return; + } + + // All connections are good + res.status(200).send("Ready"); + log.debug("Startup check successful"); + } catch (error) { + log.error("Startup check failed", error); + res.status(503).send("Startup check failed"); + } + }); + } + + private async checkDatabaseConnection(): Promise { + try { + const connection = await this.typeOrm.getConnection(); + // Simple query to verify connection is working + await connection.query("SELECT 1"); + log.debug("Database connection check successful"); + return true; + } catch (error) { + log.error("Database connection check failed", error); + return false; + } + } + + private async checkSpiceDBConnection(): Promise { + try { + const client = this.spiceDBClientProvider.getClient(); + + // Send a request, to verify that the connection works + const req = v1.ReadSchemaRequest.create({}); + const response = await client.readSchema(req); + log.debug("SpiceDB connection check successful", { schemaLength: response.schemaText.length }); + + return true; + } catch (error) { + log.error("SpiceDB connection check failed", error); + return false; + } + } + + private async checkRedisConnection(): Promise { + try { + // Simple PING command to verify connection is working + const result = await this.redis.ping(); + log.debug("Redis connection check successful", { result }); + return result === "PONG"; + } catch (error) { + log.error("Redis connection check failed", error); + return false; + } + } +} diff --git a/components/server/src/server.ts b/components/server/src/server.ts index 45d6c922097797..e33ba2872e02b6 100644 --- a/components/server/src/server.ts +++ b/components/server/src/server.ts @@ -387,6 +387,9 @@ export class Server { } public async stop() { + // mark as not-ready + this.probesApp.notifyShutdown(); + // run each stop with a timeout of 30s async function race(workLoad: Promise, task: string, ms: number = 30 * 1000): Promise { const before = Date.now(); @@ -413,10 +416,13 @@ export class Server { race(this.stopServer(this.httpServer), "stop httpserver"), race(this.stopServer(this.privateApiServer), "stop private api server"), race(this.stopServer(this.publicApiServer), "stop public api server"), - race(this.probesApp.stop(), "stop probe server"), race((async () => this.disposables.dispose())(), "dispose disposables"), ]); + this.probesApp.stop().catch(() => { + /* ignore */ + }); + log.info("server stopped."); } diff --git a/install/installer/pkg/components/server/deployment.go b/install/installer/pkg/components/server/deployment.go index 38683dca15b921..df03c0adfba396 100644 --- a/install/installer/pkg/components/server/deployment.go +++ b/install/installer/pkg/components/server/deployment.go @@ -376,7 +376,7 @@ func deployment(ctx *common.RenderContext) ([]runtime.Object, error) { StartupProbe: &corev1.Probe{ ProbeHandler: corev1.ProbeHandler{ HTTPGet: &corev1.HTTPGetAction{ - Path: "/ready", + Path: "/startup", Port: intstr.IntOrString{ Type: intstr.Int, IntVal: ProbesPort, @@ -387,6 +387,21 @@ func deployment(ctx *common.RenderContext) ([]runtime.Object, error) { PeriodSeconds: 10, FailureThreshold: 18, // try for 180 seconds, then the Pod is restarted }, + // /ready will only return false on shutdown (SIGTERM), always true otherwise + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/ready", + Port: intstr.IntOrString{ + Type: intstr.Int, + IntVal: ProbesPort, + }, + }, + }, + InitialDelaySeconds: 5, + PeriodSeconds: 5, + FailureThreshold: 1, // mark as "not ready" as quick as possible after receiving SIGTERM + }, SecurityContext: &corev1.SecurityContext{ Privileged: pointer.Bool(false), AllowPrivilegeEscalation: pointer.Bool(false),