@@ -17,46 +17,6 @@ import { ctxTryGetCache, ctxTrySetCache } from "../util/request-context";
1717import { ApplicationError , ErrorCodes } from "@gitpod/gitpod-protocol/lib/messaging/error" ;
1818import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc" ;
1919
20- async function tryThree < T > ( errMessage : string , code : ( attempt : number ) => Promise < T > ) : Promise < T > {
21- let attempt = 0 ;
22- // we do sometimes see INTERNAL errors from SpiceDB, or grpc-js reports DEADLINE_EXCEEDED, so we retry a few times
23- // last time we checked it was 15 times per day (check logs)
24- while ( attempt ++ < 3 ) {
25- try {
26- return await code ( attempt ) ;
27- } catch ( err ) {
28- if (
29- ( err . code === grpc . status . INTERNAL ||
30- err . code === grpc . status . DEADLINE_EXCEEDED ||
31- err . code === grpc . status . UNAVAILABLE ) &&
32- attempt < 3
33- ) {
34- let delay = 500 * attempt ;
35- if ( err . code === grpc . status . DEADLINE_EXCEEDED ) {
36- // we already waited for timeout, so let's try again immediately
37- delay = 0 ;
38- }
39-
40- log . warn ( errMessage , err , {
41- attempt,
42- delay,
43- code : err . code ,
44- } ) ;
45- await new Promise ( ( resolve ) => setTimeout ( resolve , delay ) ) ;
46- continue ;
47- }
48-
49- log . error ( errMessage , err , {
50- attempt,
51- code : err . code ,
52- } ) ;
53- // we don't try again on other errors
54- throw err ;
55- }
56- }
57- throw new Error ( "unreachable" ) ;
58- }
59-
/**
 * Creates a {@link SpiceDBAuthorizer} that talks to SpiceDB through the given client provider
 * and uses a fresh {@link RequestLocalZedTokenCache} as its token cache.
 *
 * @param clientProvider provider the authorizer obtains its SpiceDB client from
 * @returns a newly constructed authorizer
 */
export function createSpiceDBAuthorizer(clientProvider: SpiceDBClientProvider): SpiceDBAuthorizer {
    const tokenCache = new RequestLocalZedTokenCache();
    return new SpiceDBAuthorizer(clientProvider, tokenCache);
}
@@ -71,13 +31,11 @@ interface DeletionResult {
7131 deletedAt ?: string ;
7232}
7333
/** Offset in milliseconds that is added to `Date.now()` to form the deadline of each SpiceDB gRPC call. */
const GRPC_DEADLINE = 10_000;
35+
7436export class SpiceDBAuthorizer {
/**
 * @param clientProvider provider the SpiceDB client is fetched from for every request
 * @param tokenCache cache that supplies the consistency requirement attached to requests
 */
constructor(
    private readonly clientProvider: SpiceDBClientProvider,
    private readonly tokenCache: ZedTokenCache,
) {}
7638
77- private get client ( ) : v1 . ZedPromiseClientInterface {
78- return this . clientProvider . getClient ( ) ;
79- }
80-
8139 public async check ( req : v1 . CheckPermissionRequest , experimentsFields : { userId : string } ) : Promise < boolean > {
8240 req . consistency = await this . tokenCache . consistency ( req . resource ) ;
8341 incSpiceDBRequestsCheckTotal ( req . consistency ?. requirement ?. oneofKind || "undefined" ) ;
@@ -99,8 +57,8 @@ export class SpiceDBAuthorizer {
9957 const timer = spicedbClientLatency . startTimer ( ) ;
10058 let error : Error | undefined ;
10159 try {
102- const response = await tryThree ( "[spicedb] Failed to perform authorization check." , ( ) =>
103- this . client . checkPermission ( req , this . callOptions ) ,
60+ const response = await this . call ( "[spicedb] Failed to perform authorization check." , ( client ) =>
61+ client . checkPermission ( req , this . callOptions ) ,
10462 ) ;
10563 const permitted = response . permissionship === v1 . CheckPermissionResponse_Permissionship . HAS_PERMISSION ;
10664 return { permitted, checkedAt : response . checkedAt ?. token } ;
@@ -139,8 +97,8 @@ export class SpiceDBAuthorizer {
13997 const timer = spicedbClientLatency . startTimer ( ) ;
14098 let error : Error | undefined ;
14199 try {
142- const response = await tryThree ( "[spicedb] Failed to write relationships." , ( ) =>
143- this . client . writeRelationships (
100+ const response = await this . call ( "[spicedb] Failed to write relationships." , ( client ) =>
101+ client . writeRelationships (
144102 v1 . WriteRelationshipsRequest . create ( {
145103 updates,
146104 } ) ,
@@ -175,16 +133,16 @@ export class SpiceDBAuthorizer {
175133 let error : Error | undefined ;
176134 try {
177135 let deletedAt : string | undefined = undefined ;
178- const existing = await tryThree ( "readRelationships before deleteRelationships failed." , ( ) =>
179- this . client . readRelationships ( v1 . ReadRelationshipsRequest . create ( req ) , this . callOptions ) ,
136+ const existing = await this . call ( "readRelationships before deleteRelationships failed." , ( client ) =>
137+ client . readRelationships ( v1 . ReadRelationshipsRequest . create ( req ) , this . callOptions ) ,
180138 ) ;
181139 if ( existing . length > 0 ) {
182- const response = await tryThree ( "deleteRelationships failed." , ( ) =>
183- this . client . deleteRelationships ( req , this . callOptions ) ,
140+ const response = await this . call ( "deleteRelationships failed." , ( client ) =>
141+ client . deleteRelationships ( req , this . callOptions ) ,
184142 ) ;
185143 deletedAt = response . deletedAt ?. token ;
186- const after = await tryThree ( "readRelationships failed." , ( ) =>
187- this . client . readRelationships ( v1 . ReadRelationshipsRequest . create ( req ) , this . callOptions ) ,
144+ const after = await this . call ( "readRelationships failed." , ( client ) =>
145+ client . readRelationships ( v1 . ReadRelationshipsRequest . create ( req ) , this . callOptions ) ,
188146 ) ;
189147 if ( after . length > 0 ) {
190148 log . error ( "[spicedb] Failed to delete relationships." , { existing, after, request : req } ) ;
@@ -213,7 +171,55 @@ export class SpiceDBAuthorizer {
/**
 * Reads the relationships matching the given request.
 *
 * Note that `req` is mutated: its `consistency` field is overwritten with the requirement
 * returned by the token cache before the request is sent.
 *
 * @param req the read request; its `consistency` is replaced in place
 * @returns the responses returned by the client's `readRelationships`
 */
async readRelationships(req: v1.ReadRelationshipsRequest): Promise<v1.ReadRelationshipsResponse[]> {
    req.consistency = await this.tokenCache.consistency(undefined);
    // Count the request by the kind of consistency requirement it carries.
    incSpiceDBRequestsCheckTotal(req.consistency?.requirement?.oneofKind || "undefined");
    return this.call("readRelationships failed.", (client) => client.readRelationships(req, this.callOptions));
}
176+
/**
 * call retrieves a SpiceDB client and executes the given code block with it.
 *
 * In addition to the gRPC-level retry mechanisms, a failed attempt is retried here when the error's
 * `code` is `DEADLINE_EXCEEDED` or `UNAVAILABLE`, up to three attempts in total. This is required
 * because we seem to be running into a grpc/grpc-js bug where a subchannel takes 120s+ to reconnect.
 * Any other error, or a retryable error on the final attempt, is logged and rethrown.
 *
 * @param description message that is logged together with any error raised by `code`
 * @param code callback performing the actual request against the supplied client
 * @returns the value `code` resolves to
 * @throws the error raised by `code` once it is not retryable or all attempts are used up
 */
private async call<T>(description: string, code: (client: v1.ZedPromiseClientInterface) => Promise<T>): Promise<T> {
    const MAX_ATTEMPTS = 3;
    for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
        try {
            // From the second attempt on the last client error'd out, so ask the provider to check
            // whether we should get a new one.
            const checkClient = attempt > 1;
            const client = this.clientProvider.getClient(checkClient);
            // `await` is essential here: without it the promise is returned unsettled, a rejection
            // bypasses the catch block below, and no retry ever happens.
            return await code(client);
        } catch (err) {
            const errCode = err?.code;
            // Check: Is this a "no connection to upstream" error? If yes, retry here, to work around
            // grpc/grpc-js bugs introducing high latency for re-tries.
            const retryable = errCode === grpc.status.DEADLINE_EXCEEDED || errCode === grpc.status.UNAVAILABLE;
            if (retryable && attempt < MAX_ATTEMPTS) {
                // Back off linearly, except after a deadline: we already waited for the timeout,
                // so try again immediately.
                const delay = errCode === grpc.status.DEADLINE_EXCEEDED ? 0 : 500 * attempt;

                log.warn(description, err, {
                    attempt,
                    delay,
                    code: errCode,
                });
                await new Promise((resolve) => setTimeout(resolve, delay));
                continue;
            }

            // Some other error (or out of attempts): log and rethrow
            log.error(description, err, {
                attempt,
                code: errCode,
            });
            throw err;
        }
    }
    // Every iteration either returns, continues, or throws; the final iteration cannot continue.
    throw new Error("unreachable");
}
218224
219225 /**
@@ -223,7 +229,7 @@ export class SpiceDBAuthorizer {
223229 */
private get callOptions(): grpc.Metadata {
    // The deadline is an absolute timestamp: now plus GRPC_DEADLINE milliseconds, recomputed on every access.
    // NOTE(review): the value is built as grpc.CallOptions but handed out as grpc.Metadata via a double
    // assertion — presumably to satisfy the client's parameter typing; confirm before changing.
    return (<grpc.CallOptions>{
        deadline: Date.now() + GRPC_DEADLINE,
    }) as any as grpc.Metadata;
}
229235}
0 commit comments