From ff8e8bf6a59194d9ffe60019596b46730e8f9b04 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Wed, 26 Feb 2025 16:55:07 +0000 Subject: [PATCH 1/4] [dev] Bump grpc/grpc-js 1.10.8 -> 1.12.6 and authzed/authzed-node 0.15.0 -> 1.2.2 Tool: gitpod/catfood.gitpod.cloud --- .../typescript/package.json | 2 +- components/gitpod-protocol/package.json | 2 +- .../typescript-grpc/package.json | 2 +- .../image-builder-api/typescript/package.json | 2 +- components/server/package.json | 2 +- .../typescript-grpc/package.json | 2 +- .../ws-daemon-api/typescript/package.json | 2 +- .../ws-manager-api/typescript/package.json | 2 +- .../typescript/package.json | 2 +- yarn.lock | 30 +++++++++---------- 10 files changed, 24 insertions(+), 24 deletions(-) diff --git a/components/content-service-api/typescript/package.json b/components/content-service-api/typescript/package.json index 8beba94c05af11..7990f2a17a9045 100644 --- a/components/content-service-api/typescript/package.json +++ b/components/content-service-api/typescript/package.json @@ -11,7 +11,7 @@ "lib" ], "dependencies": { - "@grpc/grpc-js": "1.10.8", + "@grpc/grpc-js": "1.12.6", "google-protobuf": "^3.19.1", "inversify": "^6.0.1", "opentracing": "^0.14.4" diff --git a/components/gitpod-protocol/package.json b/components/gitpod-protocol/package.json index 81cb38e74010a4..354aba29a56247 100644 --- a/components/gitpod-protocol/package.json +++ b/components/gitpod-protocol/package.json @@ -10,7 +10,7 @@ "src" ], "devDependencies": { - "@grpc/grpc-js": "1.10.8", + "@grpc/grpc-js": "1.12.6", "@testdeck/mocha": "^0.3.3", "@types/analytics-node": "^3.1.9", "@types/chai-subset": "^1.3.3", diff --git a/components/ide-metrics-api/typescript-grpc/package.json b/components/ide-metrics-api/typescript-grpc/package.json index 3f2280d9e07732..4c89e50491e523 100644 --- a/components/ide-metrics-api/typescript-grpc/package.json +++ b/components/ide-metrics-api/typescript-grpc/package.json @@ -9,7 +9,7 @@ "lib" ], "dependencies": { - "@grpc/grpc-js": "1.10.8", + "@grpc/grpc-js": "1.12.6", "google-protobuf": "^3.19.1" }, "devDependencies": { diff --git a/components/image-builder-api/typescript/package.json b/components/image-builder-api/typescript/package.json index 05f3d2dbd0a227..c60d40f50b83b8 100644 --- a/components/image-builder-api/typescript/package.json +++ b/components/image-builder-api/typescript/package.json @@ -13,7 +13,7 @@ "dependencies": { "@gitpod/content-service": "0.1.5", "@gitpod/gitpod-protocol": "0.1.5", - "@grpc/grpc-js": "1.10.8", + "@grpc/grpc-js": "1.12.6", "google-protobuf": "^3.19.1", "inversify": "^6.0.1", "opentracing": "^0.14.4" diff --git a/components/server/package.json b/components/server/package.json index 18c6ed06a0202c..d9db1a2f0db055 100644 --- a/components/server/package.json +++ b/components/server/package.json @@ -45,7 +45,7 @@ "/src" ], "dependencies": { - "@authzed/authzed-node": "^0.15.0", + "@authzed/authzed-node": "^1.2.2", "@connectrpc/connect": "1.1.2", "@connectrpc/connect-express": "1.1.2", "@gitbeaker/rest": "^39.12.0", diff --git a/components/supervisor-api/typescript-grpc/package.json b/components/supervisor-api/typescript-grpc/package.json index 8065019086c4e4..0d747b0b205781 100644 --- a/components/supervisor-api/typescript-grpc/package.json +++ b/components/supervisor-api/typescript-grpc/package.json @@ -9,7 +9,7 @@ "lib" ], "dependencies": { - "@grpc/grpc-js": "1.10.8", + "@grpc/grpc-js": "1.12.6", "google-protobuf": "^3.19.1" }, "devDependencies": { diff --git a/components/ws-daemon-api/typescript/package.json b/components/ws-daemon-api/typescript/package.json index 24bb66207a3886..328bd4c811ed3d 100644 --- a/components/ws-daemon-api/typescript/package.json +++ b/components/ws-daemon-api/typescript/package.json @@ -13,7 +13,7 @@ ], "dependencies": { "@gitpod/content-service": "0.1.5", - "@grpc/grpc-js": "1.10.8", + "@grpc/grpc-js": "1.12.6", "google-protobuf": "^3.19.1", "inversify": "^6.0.1", "opentracing": "^0.14.4" diff --git a/components/ws-manager-api/typescript/package.json b/components/ws-manager-api/typescript/package.json index c0b9ab04b77728..66924ce85b93e8 100644 --- a/components/ws-manager-api/typescript/package.json +++ b/components/ws-manager-api/typescript/package.json @@ -27,7 +27,7 @@ "dependencies": { "@gitpod/content-service": "0.1.5", "@gitpod/gitpod-protocol": "0.1.5", - "@grpc/grpc-js": "1.10.8", + "@grpc/grpc-js": "1.12.6", "google-protobuf": "^3.19.1", "inversify": "^6.0.1", "opentracing": "^0.14.4" diff --git a/components/ws-manager-bridge-api/typescript/package.json b/components/ws-manager-bridge-api/typescript/package.json index 5699765c8c8f0f..488ef88446b127 100644 --- a/components/ws-manager-bridge-api/typescript/package.json +++ b/components/ws-manager-bridge-api/typescript/package.json @@ -11,7 +11,7 @@ "lib" ], "dependencies": { - "@grpc/grpc-js": "1.10.8", + "@grpc/grpc-js": "1.12.6", "google-protobuf": "^3.19.1", "inversify": "^6.0.1", "opentracing": "^0.14.4" diff --git a/yarn.lock b/yarn.lock index f84ad39fc60d45..b30c7835cfdc3a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -24,15 +24,15 @@ jsonpointer "^5.0.0" leven "^3.1.0" -"@authzed/authzed-node@^0.15.0": - version "0.15.0" - resolved "https://registry.yarnpkg.com/@authzed/authzed-node/-/authzed-node-0.15.0.tgz#2163357d76ebf4068d0735a19357c8fa689d1c85" - integrity sha512-juB03KDkxuPShvWz4coJeDzKN7obSZwm6a5Ii4xcCAUT6IItSS+7hOQtFYDMSQVz6ynchwleqqzbhAWMZ1NISQ== +"@authzed/authzed-node@^1.2.2": + version "1.2.2" + resolved "https://registry.yarnpkg.com/@authzed/authzed-node/-/authzed-node-1.2.2.tgz#a1f1fed945d1746ff933c938e6e4068c179e5be6" + integrity sha512-Br25e/1K4dmBIXqqCodvdcT3Ii2N3TSHYpwPrh8PCyhswgiVbUhiXO/PHuqIqEmlC6N1F78FEvlNiyGK4VXw6g== dependencies: - "@grpc/grpc-js" "^1.10.7" + "@grpc/grpc-js" "^1.12.5" "@protobuf-ts/runtime" "^2.9.4" "@protobuf-ts/runtime-rpc" "^2.9.4" - google-protobuf "^3.15.3" + google-protobuf "^3.21.4" "@babel/code-frame@^7.0.0", "@babel/code-frame@^7.10.4", "@babel/code-frame@^7.12.13", "@babel/code-frame@^7.16.0", "@babel/code-frame@^7.22.10", "@babel/code-frame@^7.22.5", "@babel/code-frame@^7.8.3": version "7.22.10" @@ -1666,10 +1666,10 @@ resolved "https://registry.yarnpkg.com/@google-cloud/promisify/-/promisify-4.0.0.tgz#a906e533ebdd0f754dca2509933334ce58b8c8b1" integrity sha512-Orxzlfb9c67A15cq2JQEyVc7wEsmFBmHjZWZYQMUyJ1qivXyMwdyNOs9odi79hze+2zqdTtu1E19IM/FtqZ10g== -"@grpc/grpc-js@1.10.8", "@grpc/grpc-js@^1.10.7": - version "1.10.8" - resolved "https://registry.yarnpkg.com/@grpc/grpc-js/-/grpc-js-1.10.8.tgz#99787785cd8335be861afd1cd485ae9f058e4484" - integrity sha512-vYVqYzHicDqyKB+NQhAc54I1QWCBLCrYG6unqOIcBTHx+7x8C9lcoLj3KVJXs2VB4lUbpWY+Kk9NipcbXYWmvg== +"@grpc/grpc-js@1.12.6", "@grpc/grpc-js@^1.12.5": + version "1.12.6" + resolved "https://registry.yarnpkg.com/@grpc/grpc-js/-/grpc-js-1.12.6.tgz#a3586ffdfb6a1f5cd5b4866dec9074c4a1e65472" + integrity sha512-JXUj6PI0oqqzTGvKtzOkxtpsyPRNsrmhh41TtIz/zEB6J+AUiZZ0dxWzcMwO9Ns5rmSPuMdghlTbUuqIM48d3Q== dependencies: "@grpc/proto-loader" "^0.7.13" "@js-sdsl/ordered-map" "^4.4.2" @@ -8776,16 +8776,16 @@ google-protobuf@3.15.8: resolved "https://registry.npmjs.org/google-protobuf/-/google-protobuf-3.15.8.tgz" integrity sha512-2jtfdqTaSxk0cuBJBtTTWsot4WtR9RVr2rXg7x7OoqiuOKopPrwXpM1G4dXIkLcUNRh3RKzz76C8IOkksZSeOw== -google-protobuf@^3.15.3: - version "3.21.2" - resolved "https://registry.yarnpkg.com/google-protobuf/-/google-protobuf-3.21.2.tgz#4580a2bea8bbb291ee579d1fefb14d6fa3070ea4" - integrity sha512-3MSOYFO5U9mPGikIYCzK0SaThypfGgS6bHqrUGXG3DPHCrb+txNqeEcns1W0lkGfk0rCyNXm7xB9rMxnCiZOoA== - google-protobuf@^3.19.1, google-protobuf@^3.6.1: version "3.19.1" resolved "https://registry.npmjs.org/google-protobuf/-/google-protobuf-3.19.1.tgz" integrity sha512-Isv1RlNC+IzZzilcxnlVSf+JvuhxmY7DaxYCBy+zPS9XVuJRtlTTIXR9hnZ1YL1MMusJn/7eSy2swCzZIomQSg== +google-protobuf@^3.21.4: + version "3.21.4" + resolved "https://registry.yarnpkg.com/google-protobuf/-/google-protobuf-3.21.4.tgz#2f933e8b6e5e9f8edde66b7be0024b68f77da6c9" + integrity sha512-MnG7N936zcKTco4Jd2PX2U96Kf9PxygAPKBug+74LHzmHXmceN16MmRcdgZv+DGef/S9YvQAfRsNCn4cjf9yyQ== + gopd@^1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/gopd/-/gopd-1.0.1.tgz#29ff76de69dac7489b7c0918a5788e56477c332c" From bf6997467ca15ef62dd843c6d999cb75b55ad680 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Thu, 27 Feb 2025 10:05:29 +0000 Subject: [PATCH 2/4] [server] Streamline spicedb gRPC client usage and creation options - instead of doing retries on two levels, rely on the gRPC-level retries - to mitigate the loss of insights, introduce createDebugLogInterceptor - client options: use sane defaults derived from the documentation instead of the excessive ones we had in place before - use "waitForReady" option: it should a) make our calls for responsive on re-connects, while b) - because we keep re-trying on DEADLINE_EXCEEDED - should be as reliable as before Tool: gitpod/catfood.gitpod.cloud --- components/gitpod-protocol/src/util/grpc.ts | 68 +++++++++++++++ .../src/authorization/spicedb-authorizer.ts | 82 ++++++------------- .../server/src/authorization/spicedb.ts | 25 ++++-- components/server/src/container-module.ts | 3 +- 4 files changed, 113 insertions(+), 65 deletions(-) diff --git a/components/gitpod-protocol/src/util/grpc.ts b/components/gitpod-protocol/src/util/grpc.ts index 190be12882eada..2a3d835cb8d57b 100644 --- a/components/gitpod-protocol/src/util/grpc.ts +++ b/components/gitpod-protocol/src/util/grpc.ts @@ -6,6 +6,8 @@ import * as grpc from "@grpc/grpc-js"; import { Status } from "@grpc/grpc-js/build/src/constants"; +import { log } from "./logging"; +import { TrustedValue } from "./scrubbing"; export const defaultGRPCOptions = { "grpc.keepalive_timeout_ms": 10000, @@ -108,6 +110,72 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics): }; } +export function createDebugLogInterceptor(): grpc.Interceptor { + const FAILURE_STATUS_CODES = new Map([ + [Status.ABORTED, true], + [Status.CANCELLED, true], + [Status.DATA_LOSS, true], + [Status.DEADLINE_EXCEEDED, true], + [Status.FAILED_PRECONDITION, true], + [Status.INTERNAL, true], + [Status.PERMISSION_DENIED, true], + [Status.RESOURCE_EXHAUSTED, true], + [Status.UNAUTHENTICATED, true], + [Status.UNAVAILABLE, true], + [Status.UNIMPLEMENTED, true], + [Status.UNKNOWN, true], + ]); + + return (options, nextCall): grpc.InterceptingCall => { + const methodDef = options.method_definition; + const method = methodDef.path.substring(methodDef.path.lastIndexOf("/") + 1); + const service = methodDef.path.substring(1, methodDef.path.length - method.length - 1); + const labels = { + service, + method, + type: getGrpcMethodType(options.method_definition.requestStream, options.method_definition.responseStream), + }; + const requester = new grpc.RequesterBuilder() + .withStart((metadata, listener, next) => { + const newListener = new grpc.ListenerBuilder() + .withOnReceiveStatus((status, next) => { + try { + const info = { + labels: new TrustedValue(labels), + metadata: new TrustedValue(metadata.toJSON()), + code: Status[status.code], + }; + if (FAILURE_STATUS_CODES.has(status.code)) { + log.warn(`grpc call failed`, info); + } else { + log.debug(`grpc call status`, info); + } + } finally { + next(status); + } + }) + .build(); + try { + log.debug(`grpc call started`, { + labels: new TrustedValue(labels), + metadata: new TrustedValue(metadata.toJSON()), + }); + } finally { + next(metadata, newListener); + } + }) + .withCancel((next) => { + try { + log.debug(`grpc call cancelled`, { labels: new TrustedValue(labels) }); + } finally { + next(); + } + }) + .build(); + return new grpc.InterceptingCall(nextCall(options), requester); + }; +} + export function isGrpcError(err: any): err is grpc.StatusObject { return err.code && err.details; } diff --git a/components/server/src/authorization/spicedb-authorizer.ts b/components/server/src/authorization/spicedb-authorizer.ts index e8f3093fb4fb74..33b284d046a292 100644 --- a/components/server/src/authorization/spicedb-authorizer.ts +++ b/components/server/src/authorization/spicedb-authorizer.ts @@ -17,46 +17,6 @@ import { ctxTryGetCache, ctxTrySetCache } from "../util/request-context"; import { ApplicationError, ErrorCodes } from "@gitpod/gitpod-protocol/lib/messaging/error"; import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc"; -async function tryThree(errMessage: string, code: (attempt: number) => Promise): Promise { - let attempt = 0; - // we do sometimes see INTERNAL errors from SpiceDB, or grpc-js reports DEADLINE_EXCEEDED, so we retry a few times - // last time we checked it was 15 times per day (check logs) - while (attempt++ < 3) { - try { - return await code(attempt); - } catch (err) { - if ( - (err.code === grpc.status.INTERNAL || - err.code === grpc.status.DEADLINE_EXCEEDED || - err.code === grpc.status.UNAVAILABLE) && - attempt < 3 - ) { - let delay = 500 * attempt; - if (err.code === grpc.status.DEADLINE_EXCEEDED) { - // we already waited for timeout, so let's try again immediately - delay = 0; - } - - log.warn(errMessage, err, { - attempt, - delay, - code: err.code, - }); - await new Promise((resolve) => setTimeout(resolve, delay)); - continue; - } - - log.error(errMessage, err, { - attempt, - code: err.code, - }); - // we don't try again on other errors - throw err; - } - } - throw new Error("unreachable"); -} - export function createSpiceDBAuthorizer(clientProvider: SpiceDBClientProvider): SpiceDBAuthorizer { return new SpiceDBAuthorizer(clientProvider, new RequestLocalZedTokenCache()); } @@ -71,13 +31,11 @@ interface DeletionResult { deletedAt?: string; } +const GRPC_DEADLINE = 10_000; + export class SpiceDBAuthorizer { constructor(private readonly clientProvider: SpiceDBClientProvider, private readonly tokenCache: ZedTokenCache) {} - private get client(): v1.ZedPromiseClientInterface { - return this.clientProvider.getClient(); - } - public async check(req: v1.CheckPermissionRequest, experimentsFields: { userId: string }): Promise { req.consistency = await this.tokenCache.consistency(req.resource); incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined"); @@ -99,8 +57,8 @@ export class SpiceDBAuthorizer { const timer = spicedbClientLatency.startTimer(); let error: Error | undefined; try { - const response = await tryThree("[spicedb] Failed to perform authorization check.", () => - this.client.checkPermission(req, this.callOptions), + const response = await this.call("[spicedb] Failed to perform authorization check.", (client) => + client.checkPermission(req, this.callOptions), ); const permitted = response.permissionship === v1.CheckPermissionResponse_Permissionship.HAS_PERMISSION; return { permitted, checkedAt: response.checkedAt?.token }; @@ -139,8 +97,8 @@ export class SpiceDBAuthorizer { const timer = spicedbClientLatency.startTimer(); let error: Error | undefined; try { - const response = await tryThree("[spicedb] Failed to write relationships.", () => - this.client.writeRelationships( + const response = await this.call("[spicedb] Failed to write relationships.", (client) => + client.writeRelationships( v1.WriteRelationshipsRequest.create({ updates, }), @@ -175,16 +133,16 @@ export class SpiceDBAuthorizer { let error: Error | undefined; try { let deletedAt: string | undefined = undefined; - const existing = await tryThree("readRelationships before deleteRelationships failed.", () => - this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions), + const existing = await this.call("readRelationships before deleteRelationships failed.", (client) => + client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions), ); if (existing.length > 0) { - const response = await tryThree("deleteRelationships failed.", () => - this.client.deleteRelationships(req, this.callOptions), + const response = await this.call("deleteRelationships failed.", (client) => + client.deleteRelationships(req, this.callOptions), ); deletedAt = response.deletedAt?.token; - const after = await tryThree("readRelationships failed.", () => - this.client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions), + const after = await this.call("readRelationships failed.", (client) => + client.readRelationships(v1.ReadRelationshipsRequest.create(req), this.callOptions), ); if (after.length > 0) { log.error("[spicedb] Failed to delete relationships.", { existing, after, request: req }); @@ -213,7 +171,19 @@ export class SpiceDBAuthorizer { async readRelationships(req: v1.ReadRelationshipsRequest): Promise { req.consistency = await this.tokenCache.consistency(undefined); incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined"); - return tryThree("readRelationships failed.", () => this.client.readRelationships(req, this.callOptions)); + return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions)); + } + + private async call(errMessage: string, code: (client: v1.ZedPromiseClientInterface) => Promise): Promise { + try { + const client = this.clientProvider.getClient(); + return code(client); + } catch (err) { + log.error(errMessage, err, { + code: err.code, + }); + throw err; + } } /** @@ -223,7 +193,7 @@ export class SpiceDBAuthorizer { */ private get callOptions(): grpc.Metadata { return ({ - deadline: Date.now() + 8000, + deadline: Date.now() + GRPC_DEADLINE, }) as any as grpc.Metadata; } } diff --git a/components/server/src/authorization/spicedb.ts b/components/server/src/authorization/spicedb.ts index 25c917f070f3b5..cf553b8716cd11 100644 --- a/components/server/src/authorization/spicedb.ts +++ b/components/server/src/authorization/spicedb.ts @@ -17,24 +17,33 @@ export interface SpiceDBClientConfig { export type SpiceDBClient = v1.ZedPromiseClientInterface; type Client = v1.ZedClientInterface & grpc.Client; + const DEFAULT_FEATURE_FLAG_VALUE = "undefined"; const DefaultClientOptions: grpc.ClientOptions = { // we ping frequently to check if the connection is still alive - "grpc.keepalive_time_ms": 1000, - "grpc.keepalive_timeout_ms": 1000, + "grpc.keepalive_time_ms": 30_000, + "grpc.keepalive_timeout_ms": 4_000, + + "grpc.max_reconnect_backoff_ms": 5_000, + "grpc.initial_reconnect_backoff_ms": 1_000, - "grpc.max_reconnect_backoff_ms": 5000, - "grpc.initial_reconnect_backoff_ms": 500, + // docs on client-side retry support: https://github.com/grpc/grpc-node/blob/0c093b0b7f78f691a4f6e41efc184899d7a2d987/examples/retry/README.md?plain=1#L3 + "grpc.service_config_disable_resolution": 1, // don't resolve from external, but guarantee to take this config "grpc.service_config": JSON.stringify({ methodConfig: [ { + // here is the code that shows how an empty shape matches every method: https://github.com/grpc/grpc-node/blob/bfd87a9bf62ebc438bcf98a7af223d5353f4c8b2/packages/grpc-js/src/resolving-load-balancer.ts#L62-L147 name: [{}], + // docs: https://github.com/grpc/grpc-proto/blob/master/grpc/service_config/service_config.proto#L88C29-L88C43 + waitForReady: true, + // docs: https://github.com/grpc/grpc-proto/blob/master/grpc/service_config/service_config.proto#L136 retryPolicy: { - maxAttempts: 10, - initialBackoff: "0.1s", + maxAttempts: 5, + initialBackoff: "1s", maxBackoff: "5s", backoffMultiplier: 2.0, - retryableStatusCodes: ["UNAVAILABLE", "DEADLINE_EXCEEDED"], + // validation code: https://github.com/grpc/grpc-node/blob/0c093b0b7f78f691a4f6e41efc184899d7a2d987/packages/grpc-js/src/service-config.ts#L182C1-L197C4 + retryableStatusCodes: ["UNAVAILABLE", "DEADLINE_EXCEEDED", "INTERNAL"], }, }, ], @@ -43,7 +52,7 @@ const DefaultClientOptions: grpc.ClientOptions = { // Governs how log DNS resolution results are cached (at minimum!) // default is 30s, which is too long for us during rollouts (where service DNS entries are updated) - "grpc.dns_min_time_between_resolutions_ms": 2000, + "grpc.dns_min_time_between_resolutions_ms": 2_000, }; export function spiceDBConfigFromEnv(): SpiceDBClientConfig | undefined { diff --git a/components/server/src/container-module.ts b/components/server/src/container-module.ts index 58d09ef485b4c5..a8f8495b18bd49 100644 --- a/components/server/src/container-module.ts +++ b/components/server/src/container-module.ts @@ -15,6 +15,7 @@ import { DebugApp } from "@gitpod/gitpod-protocol/lib/util/debug-app"; import { IClientCallMetrics, createClientCallMetricsInterceptor, + createDebugLogInterceptor, defaultGRPCOptions, } from "@gitpod/gitpod-protocol/lib/util/grpc"; import { prometheusClientMiddleware } from "@gitpod/gitpod-protocol/lib/util/nice-grpc"; @@ -341,7 +342,7 @@ export const productionContainerModule = new ContainerModule( const clientCallMetrics = ctx.container.get(IClientCallMetrics); return new SpiceDBClientProvider( config, // - [createClientCallMetricsInterceptor(clientCallMetrics)], + [createClientCallMetricsInterceptor(clientCallMetrics), createDebugLogInterceptor()], ); }) .inSingletonScope(); From 99193d90a7b62a0c24a67d4327201e5079815f53 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Thu, 6 Mar 2025 08:37:39 +0000 Subject: [PATCH 3/4] [protocol] Centralize grpc.isConnectionAlive Tool: gitpod/catfood.gitpod.cloud --- components/gitpod-protocol/src/util/grpc.ts | 9 +++++++++ .../image-builder-api/typescript/src/sugar.ts | 9 ++------- .../server/src/util/content-service-sugar.ts | 15 +++++---------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/components/gitpod-protocol/src/util/grpc.ts b/components/gitpod-protocol/src/util/grpc.ts index 2a3d835cb8d57b..794bbc67dadb03 100644 --- a/components/gitpod-protocol/src/util/grpc.ts +++ b/components/gitpod-protocol/src/util/grpc.ts @@ -179,3 +179,12 @@ export function createDebugLogInterceptor(): grpc.Interceptor { export function isGrpcError(err: any): err is grpc.StatusObject { return err.code && err.details; } + +export function isConnectionAlive(client: grpc.Client) { + const cs = client.getChannel().getConnectivityState(false); + return ( + cs == grpc.connectivityState.CONNECTING || + cs == grpc.connectivityState.IDLE || + cs == grpc.connectivityState.READY + ); +} diff --git a/components/image-builder-api/typescript/src/sugar.ts b/components/image-builder-api/typescript/src/sugar.ts index 4d77d2ca320347..0e44c4ed0202f5 100644 --- a/components/image-builder-api/typescript/src/sugar.ts +++ b/components/image-builder-api/typescript/src/sugar.ts @@ -8,7 +8,7 @@ import { ImageBuilderClient } from "./imgbuilder_grpc_pb"; import { TraceContext } from "@gitpod/gitpod-protocol/lib/util/tracing"; import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred"; import { log } from "@gitpod/gitpod-protocol/lib/util/logging"; -import { createClientCallMetricsInterceptor, IClientCallMetrics } from "@gitpod/gitpod-protocol/lib/util/grpc"; +import { createClientCallMetricsInterceptor, IClientCallMetrics, isConnectionAlive } from "@gitpod/gitpod-protocol/lib/util/grpc"; import * as opentracing from "opentracing"; import { Metadata } from "@grpc/grpc-js"; import { @@ -130,12 +130,7 @@ export class PromisifiedImageBuilderClient { ) {} public isConnectionAlive() { - const cs = this.client.getChannel().getConnectivityState(false); - return ( - cs == grpc.connectivityState.CONNECTING || - cs == grpc.connectivityState.IDLE || - cs == grpc.connectivityState.READY - ); + return isConnectionAlive(this.client); } public resolveBaseImage(ctx: TraceContext, request: ResolveBaseImageRequest): Promise { diff --git a/components/server/src/util/content-service-sugar.ts b/components/server/src/util/content-service-sugar.ts index 6a7173760ff64e..f1a021dbaf3458 100644 --- a/components/server/src/util/content-service-sugar.ts +++ b/components/server/src/util/content-service-sugar.ts @@ -6,7 +6,11 @@ import { inject, injectable, interfaces, optional } from "inversify"; import * as grpc from "@grpc/grpc-js"; -import { createClientCallMetricsInterceptor, IClientCallMetrics } from "@gitpod/gitpod-protocol/lib/util/grpc"; +import { + createClientCallMetricsInterceptor, + IClientCallMetrics, + isConnectionAlive, +} from "@gitpod/gitpod-protocol/lib/util/grpc"; import { IDEPluginServiceClient } from "@gitpod/content-service/lib/ideplugin_grpc_pb"; import { ContentServiceClient } from "@gitpod/content-service/lib/content_grpc_pb"; import { BlobServiceClient } from "@gitpod/content-service/lib/blobs_grpc_pb"; @@ -142,12 +146,3 @@ export class CachingHeadlessLogServiceClientProvider extends CachingClientProvid }); } } - -function isConnectionAlive(client: grpc.Client) { - const cs = client.getChannel().getConnectivityState(false); - return ( - cs == grpc.connectivityState.CONNECTING || - cs == grpc.connectivityState.IDLE || - cs == grpc.connectivityState.READY - ); -} From c875f6ddbcc85aa37f8682d4411cc722eeb49321 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Thu, 6 Mar 2025 09:55:44 +0000 Subject: [PATCH 4/4] [server] SpiceDB client: retry with new client on "Waiting for LB pick" error Tool: gitpod/catfood.gitpod.cloud --- components/gitpod-protocol/src/util/grpc.ts | 12 +++- .../src/authorization/spicedb-authorizer.ts | 54 ++++++++++++++--- .../server/src/authorization/spicedb.ts | 58 ++++++++++++++----- components/server/src/container-module.ts | 3 +- 4 files changed, 101 insertions(+), 26 deletions(-) diff --git a/components/gitpod-protocol/src/util/grpc.ts b/components/gitpod-protocol/src/util/grpc.ts index 794bbc67dadb03..8ba1658e7c4706 100644 --- a/components/gitpod-protocol/src/util/grpc.ts +++ b/components/gitpod-protocol/src/util/grpc.ts @@ -110,7 +110,7 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics): }; } -export function createDebugLogInterceptor(): grpc.Interceptor { +export function createDebugLogInterceptor(additionalContextF: (() => object) | undefined): grpc.Interceptor { const FAILURE_STATUS_CODES = new Map([ [Status.ABORTED, true], [Status.CANCELLED, true], @@ -139,11 +139,21 @@ export function createDebugLogInterceptor(): grpc.Interceptor { .withStart((metadata, listener, next) => { const newListener = new grpc.ListenerBuilder() .withOnReceiveStatus((status, next) => { + // If given, call the additionalContext function and log the result + let additionalContext = {}; + try { + if (additionalContextF) { + additionalContext = additionalContextF(); + } + } catch (e) {} + try { const info = { labels: new TrustedValue(labels), metadata: new TrustedValue(metadata.toJSON()), code: Status[status.code], + details: status.details, + additionalContext: new TrustedValue(additionalContext), }; if (FAILURE_STATUS_CODES.has(status.code)) { log.warn(`grpc call failed`, info); diff --git a/components/server/src/authorization/spicedb-authorizer.ts b/components/server/src/authorization/spicedb-authorizer.ts index 33b284d046a292..6d7ae3464e0a9f 100644 --- a/components/server/src/authorization/spicedb-authorizer.ts +++ b/components/server/src/authorization/spicedb-authorizer.ts @@ -174,16 +174,52 @@ export class SpiceDBAuthorizer { return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions)); } - private async call(errMessage: string, code: (client: v1.ZedPromiseClientInterface) => Promise): Promise { - try { - const client = this.clientProvider.getClient(); - return code(client); - } catch (err) { - log.error(errMessage, err, { - code: err.code, - }); - throw err; + /** + * call retrieves a Spicedb client and executes the given code block. + * In addition to the gRPC-level retry mechanisms, it retries on "Waiting for LB pick" errors. + * This is required, because we seem to be running into a grpc/grpc-js bug where a subchannel takes 120s+ to reconnect. + * @param description + * @param code + * @returns + */ + private async call(description: string, code: (client: v1.ZedPromiseClientInterface) => Promise): Promise { + const MAX_ATTEMPTS = 3; + let attempt = 0; + while (attempt++ < 3) { + try { + const checkClient = attempt > 1; // the last client error'd out, so check if we should get a new one + const client = this.clientProvider.getClient(checkClient); + return code(client); + } catch (err) { + // Check: Is this a "no connection to upstream" error? If yes, retry here, to work around grpc/grpc-js bugs introducing high latency for re-tries + if ( + (err.code === grpc.status.DEADLINE_EXCEEDED || err.code === grpc.status.UNAVAILABLE) && + attempt < MAX_ATTEMPTS + ) { + let delay = 500 * attempt; + if (err.code === grpc.status.DEADLINE_EXCEEDED) { + // we already waited for timeout, so let's try again immediately + delay = 0; + } + + log.warn(description, err, { + attempt, + delay, + code: err.code, + }); + await new Promise((resolve) => setTimeout(resolve, delay)); + continue; + } + + // Some other error: log and rethrow + log.error(description, err, { + attempt, + code: err.code, + }); + throw err; + } } + throw new Error("unreachable"); } /** diff --git a/components/server/src/authorization/spicedb.ts b/components/server/src/authorization/spicedb.ts index cf553b8716cd11..fe5c6fdccd5a4e 100644 --- a/components/server/src/authorization/spicedb.ts +++ b/components/server/src/authorization/spicedb.ts @@ -9,6 +9,8 @@ import { log } from "@gitpod/gitpod-protocol/lib/util/logging"; import * as grpc from "@grpc/grpc-js"; import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server"; import { TrustedValue } from "@gitpod/gitpod-protocol/lib/util/scrubbing"; +import { createDebugLogInterceptor } from "@gitpod/gitpod-protocol/lib/util/grpc"; +import { isConnectionAlive } from "@gitpod/gitpod-protocol/lib/util/grpc"; export interface SpiceDBClientConfig { address: string; @@ -54,6 +56,7 @@ const DefaultClientOptions: grpc.ClientOptions = { // default is 30s, which is too long for us during rollouts (where service DNS entries are updated) "grpc.dns_min_time_between_resolutions_ms": 2_000, }; +const CLIENT_CLOSE_DELAY = 60_000; // 60 [s] export function spiceDBConfigFromEnv(): SpiceDBClientConfig | undefined { const token = process.env["SPICEDB_PRESHARED_KEY"]; @@ -110,9 +113,7 @@ export class SpiceDBClientProvider { }); // close client after 2 minutes to make sure most pending requests on the previous client are finished. - setTimeout(() => { - this.closeClient(oldClient); - }, 2 * 60 * 1000); + this.closeClientAfter(oldClient, CLIENT_CLOSE_DELAY); } this.clientOptions = clientOptions; // `createClient` will use the `DefaultClientOptions` to create client if the value on Feature Flag is not able to create a client @@ -132,48 +133,77 @@ export class SpiceDBClientProvider { })(); } - private closeClient(client: Client) { - try { - client.close(); - } catch (error) { - log.error("[spicedb] Error closing client", error); - } + private closeClientAfter(client: Client, timeout: number): void { + setTimeout(() => { + try { + client.close(); + } catch (error) { + log.error("[spicedb] Error closing client", error); + } + }, timeout); } private createClient(clientOptions: grpc.ClientOptions): Client { log.debug("[spicedb] Creating client", { clientOptions: new TrustedValue(clientOptions), }); + + // Inject debugLogInterceptor to log details especially on failed grpc calls + let client: Client; + const debugInfo = (): object => { + if (!client) { + return {}; + } + const ch = client.getChannel(); + return { + target: ch.getTarget(), + state: ch.getConnectivityState(false), + }; + }; + const interceptors = [...(this.interceptors || []), createDebugLogInterceptor(debugInfo)]; + + // Create the actual client try { - return v1.NewClient( + client = v1.NewClient( this.clientConfig.token, this.clientConfig.address, v1.ClientSecurity.INSECURE_PLAINTEXT_CREDENTIALS, undefined, { ...clientOptions, - interceptors: this.interceptors, + interceptors, }, ) as Client; } catch (error) { log.error("[spicedb] Error create client, fallback to default options", error); - return v1.NewClient( + client = v1.NewClient( this.clientConfig.token, this.clientConfig.address, v1.ClientSecurity.INSECURE_PLAINTEXT_CREDENTIALS, undefined, { ...DefaultClientOptions, - interceptors: this.interceptors, + interceptors, }, ) as Client; } + return client; } - getClient(): SpiceDBClient { + getClient(checkClient: boolean = false): SpiceDBClient { if (!this.client) { this.client = this.createClient(this.clientOptions); } + + if (checkClient) { + const oldClient = this.client; + if (!isConnectionAlive(oldClient)) { + const connectivityState = oldClient.getChannel().getConnectivityState(false); + log.warn("[spicedb] Client is not alive, creating a new one", { connectivityState }); + this.closeClientAfter(oldClient, CLIENT_CLOSE_DELAY); + this.client = this.createClient(this.clientOptions); + } + } return this.client.promises; } } diff --git a/components/server/src/container-module.ts b/components/server/src/container-module.ts index a8f8495b18bd49..58d09ef485b4c5 100644 --- a/components/server/src/container-module.ts +++ b/components/server/src/container-module.ts @@ -15,7 +15,6 @@ import { DebugApp } from "@gitpod/gitpod-protocol/lib/util/debug-app"; import { IClientCallMetrics, createClientCallMetricsInterceptor, - createDebugLogInterceptor, defaultGRPCOptions, } from "@gitpod/gitpod-protocol/lib/util/grpc"; import { prometheusClientMiddleware } from "@gitpod/gitpod-protocol/lib/util/nice-grpc"; @@ -342,7 +341,7 @@ export const productionContainerModule = new ContainerModule( const clientCallMetrics = ctx.container.get(IClientCallMetrics); return new SpiceDBClientProvider( config, // - [createClientCallMetricsInterceptor(clientCallMetrics), createDebugLogInterceptor()], + [createClientCallMetricsInterceptor(clientCallMetrics)], ); }) .inSingletonScope();