fix(flagd): improve stream observer, refine retry policy; don't use retry to avoid busy loop #1590
Changes from 1 commit (893cd1c)
FlagdOptions.java

```diff
@@ -98,10 +98,18 @@ public class FlagdOptions {
     @Builder.Default
     private int deadline = fallBackToEnvOrDefault(Config.DEADLINE_MS_ENV_VAR_NAME, Config.DEFAULT_DEADLINE);
 
+    /**
+     * Max stream retry backoff in milliseconds.
+     */
+    @Builder.Default
+    private int retryBackoffMaxMs =
+            fallBackToEnvOrDefault(Config.FLAGD_RETRY_BACKOFF_MAX_MS_VAR_NAME, Config.DEFAULT_MAX_RETRY_BACKOFF);
+
     /**
      * Streaming connection deadline in milliseconds.
      * Set to 0 to disable the deadline.
-     * Defaults to 600000 (10 minutes); recommended to prevent infrastructure from killing idle connections.
+     * Defaults to 600000 (10 minutes); recommended to prevent infrastructure from
+     * killing idle connections.
      */
     @Builder.Default
     private int streamDeadlineMs =
```
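For context, the new `retryBackoffMaxMs` option caps how long the stream retry backoff can grow. Since the field carries Lombok's `@Builder.Default`, it should also be settable through the generated builder; the snippet below is only a sketch and assumes the usual Lombok-generated `retryBackoffMaxMs(int)` builder method and the provider's `dev.openfeature.contrib.providers.flagd` package.

```java
import dev.openfeature.contrib.providers.flagd.FlagdOptions;

class RetryBackoffConfigSketch {
    static FlagdOptions buildOptions() {
        // Cap stream retry backoff at 5 seconds; when not set explicitly, the value
        // falls back to the env var named by Config.FLAGD_RETRY_BACKOFF_MAX_MS_VAR_NAME
        // or Config.DEFAULT_MAX_RETRY_BACKOFF (per the diff above).
        return FlagdOptions.builder()
                .retryBackoffMaxMs(5_000)
                .build();
    }
}
```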
ChannelBuilder.java

```diff
@@ -24,95 +24,57 @@
 import javax.net.ssl.SSLException;
 
 /** gRPC channel builder helper. */
 @SuppressFBWarnings(value = "SE_BAD_FIELD", justification = "we don't care to serialize this")
 public class ChannelBuilder {
 
-    @SuppressWarnings({"unchecked", "rawtypes"})
-    private static final Map<String, ?> DEFAULT_RETRY_POLICY = new HashMap() {
-        {
-            // 1s + 2s + 4s
-            put("maxAttempts", 3.0); // types used here are important, need to be doubles
-            put("initialBackoff", "1s");
-            put("maxBackoff", "5s");
-            put("backoffMultiplier", 2.0);
-            // status codes to retry on:
-            put(
-                    "retryableStatusCodes",
-                    Arrays.asList(
-                            /*
-                             * Only states UNAVAILABLE and UNKNOWN should be retried. All
-                             * other failure states will probably not be resolved with a simple retry.
-                             */
-                            Code.UNAVAILABLE.toString(), Code.UNKNOWN.toString()));
-        }
-    };
-
-    /**
-     * Controls retry (not-reconnection) policy for failed RPCs.
-     */
-    @SuppressWarnings({"unchecked", "rawtypes"})
-    static final Map<String, ?> SERVICE_CONFIG_WITH_RETRY = new HashMap() {
-        {
-            put("methodConfig", Arrays.asList(new HashMap() {
-                {
-                    put(
-                            "name",
-                            Arrays.asList(
-                                    new HashMap() {
-                                        {
-                                            put("service", "flagd.sync.v1.FlagSyncService");
-                                        }
-                                    },
-                                    new HashMap() {
-                                        {
-                                            put("service", "flagd.evaluation.v1.Service");
-                                        }
-                                    }));
-                    put("retryPolicy", DEFAULT_RETRY_POLICY);
-                }
-            },
-            new HashMap() {
-                {
-                    put("name", Arrays.asList(new HashMap() {
-                        {
-                            put("service", "flagd.sync.v1.FlagSyncService");
-                            put("method", "SyncFlags");
-                        }
-                    }));
-                    put("retryPolicy", new HashMap(DEFAULT_RETRY_POLICY) {
-                        {
-                            // 1s + 2s + 4s + 5s + 5s + 5s + 5s + 5s + 5s + 5s
-                            put("maxAttempts", 12.0);
-                            // for streaming Retry on more status codes
-                            put(
-                                    "retryableStatusCodes",
-                                    Arrays.asList(
-                                            /*
-                                             * All codes are retryable except OK and DEADLINE_EXCEEDED since
-                                             * any others not listed here cause a very tight loop of retries.
-                                             * DEADLINE_EXCEEDED is typically a result of a client specified
-                                             * deadline, and definitionally should not result in a tight loop
-                                             * (it's a timeout).
-                                             */
-                                            Code.CANCELLED.toString(),
```
Member comment on the removed status codes: Removed - these were only here to slow down our loop, which we do with a sleep now.
```diff
-                                            Code.UNKNOWN.toString(),
-                                            Code.INVALID_ARGUMENT.toString(),
-                                            Code.NOT_FOUND.toString(),
-                                            Code.ALREADY_EXISTS.toString(),
-                                            Code.PERMISSION_DENIED.toString(),
-                                            Code.RESOURCE_EXHAUSTED.toString(),
-                                            Code.FAILED_PRECONDITION.toString(),
-                                            Code.ABORTED.toString(),
-                                            Code.OUT_OF_RANGE.toString(),
-                                            Code.UNIMPLEMENTED.toString(),
-                                            Code.INTERNAL.toString(),
-                                            Code.UNAVAILABLE.toString(),
-                                            Code.DATA_LOSS.toString(),
-                                            Code.UNAUTHENTICATED.toString()));
-                        }
-                    });
-                }
-            }));
-        }
-    };
+    static Map<String, ?> buildRetryPolicy(final FlagdOptions options) {
+        return new HashMap() {
+            {
+                put("methodConfig", Arrays.asList(new HashMap() {
+                    {
+                        put(
+                                "name",
+                                Arrays.asList(
+                                        new HashMap() {
+                                            {
+                                                put("service", "flagd.sync.v1.FlagSyncService");
+                                            }
+                                        },
+                                        new HashMap() {
+                                            {
+                                                put("service", "flagd.evaluation.v1.Service");
+                                            }
+                                        }));
+                        put("retryPolicy", new HashMap() {
+                            {
+                                // 1 + 2 + 4
+                                put("maxAttempts", 3.0); // types used here are important, need to be doubles
+                                put("initialBackoff", "1s");
+                                put(
+                                        "maxBackoff",
+                                        options.getRetryBackoffMaxMs() >= 1000
+                                                ? String.format("%ds", options.getRetryBackoffMaxMs() / 1000)
+                                                : "1s");
+                                put("backoffMultiplier", 2.0);
+                                // status codes to retry on:
+                                put(
+                                        "retryableStatusCodes",
+                                        Arrays.asList(
+                                                /*
+                                                 * As per gRPC spec, the following status codes are safe to retry:
+                                                 * UNAVAILABLE, UNKNOWN,
+                                                 */
+                                                Code.UNKNOWN.toString(), Code.UNAVAILABLE.toString()));
+                            }
+                        });
+                    }
+                }));
+            }
+        };
+    }
 
     private ChannelBuilder() {}
 
@@ -137,7 +99,7 @@ public static ManagedChannel nettyChannel(final FlagdOptions options) {
                     .eventLoopGroup(new MultiThreadIoEventLoopGroup(EpollIoHandler.newFactory()))
                     .channelType(EpollDomainSocketChannel.class)
                     .usePlaintext()
-                    .defaultServiceConfig(SERVICE_CONFIG_WITH_RETRY)
+                    .defaultServiceConfig(buildRetryPolicy(options))
                     .enableRetry()
                     .build();
         }
@@ -183,7 +145,7 @@ public static ManagedChannel nettyChannel(final FlagdOptions options) {
             builder.intercept(new FlagdGrpcInterceptor(options.getOpenTelemetry()));
         }
 
-        return builder.defaultServiceConfig(SERVICE_CONFIG_WITH_RETRY)
+        return builder.defaultServiceConfig(buildRetryPolicy(options))
                 .enableRetry()
                 .build();
     } catch (SSLException ssle) {
```
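One detail worth calling out in `buildRetryPolicy`: the code formats `maxBackoff` as a whole-second duration string for the gRPC service config, dividing the millisecond option by 1000 and clamping anything below one second to `"1s"`. A standalone sketch of that conversion follows; the helper name `maxBackoffString` is hypothetical, not part of this PR.

```java
// Hypothetical helper mirroring the maxBackoff ternary in buildRetryPolicy:
// whole seconds for values >= 1000 ms, clamped to "1s" below that.
static String maxBackoffString(int retryBackoffMaxMs) {
    return retryBackoffMaxMs >= 1000
            ? String.format("%ds", retryBackoffMaxMs / 1000)
            : "1s";
}

// e.g. maxBackoffString(5000) -> "5s", maxBackoffString(1500) -> "1s" (integer division),
// maxBackoffString(250) -> "1s" (clamped)
```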
SyncStreamQueueSource.java

```diff
@@ -40,6 +40,7 @@ public class SyncStreamQueueSource implements QueueSource {
     private final AtomicBoolean shutdown = new AtomicBoolean(false);
     private final int streamDeadline;
     private final int deadline;
+    private final int maxBackoffMs;
     private final String selector;
     private final String providerId;
     private final boolean syncMetadataDisabled;
@@ -56,6 +57,7 @@ public SyncStreamQueueSource(final FlagdOptions options, Consumer<FlagdProviderE
         deadline = options.getDeadline();
         selector = options.getSelector();
         providerId = options.getProviderId();
+        maxBackoffMs = options.getRetryBackoffMaxMs();
         syncMetadataDisabled = options.isSyncMetadataDisabled();
         channelConnector = new ChannelConnector(options, onConnectionEvent, ChannelBuilder.nettyChannel(options));
         flagSyncStub =
@@ -75,6 +77,7 @@ protected SyncStreamQueueSource(
         selector = options.getSelector();
         providerId = options.getProviderId();
         channelConnector = connectorMock;
+        maxBackoffMs = options.getRetryBackoffMaxMs();
         flagSyncStub = stubMock;
         syncMetadataDisabled = options.isSyncMetadataDisabled();
         metadataStub = blockingStubMock;
@@ -110,26 +113,32 @@ private void observeSyncStream() {
         log.info("Initializing sync stream observer");
 
         // outer loop for re-issuing the stream request
-        // "waitForReady" on the channel, plus our retry policy slow this loop down in error conditions
+        // "waitForReady" on the channel, plus our retry policy slow this loop down in
+        // error conditions
         while (!shutdown.get()) {
-            log.debug("Initializing sync stream request");
-            SyncStreamObserver observer = new SyncStreamObserver(outgoingQueue);
-
-            try {
-                observer.metadata = getMetadata();
-            } catch (Exception metaEx) {
-                // retry if getMetadata fails
-                String message = metaEx.getMessage();
-                log.debug("Metadata request error: {}, will restart", message, metaEx);
-                enqueueError(String.format("Error in getMetadata request: %s", message));
-                continue;
-            }
-
-            try {
-                syncFlags(observer);
-            } catch (Exception ex) {
-                log.error("Unexpected sync stream exception, will restart.", ex);
-                enqueueError(String.format("Error in syncStream: %s", ex.getMessage()));
-            }
+            try {
+                log.debug("Initializing sync stream request");
+                SyncStreamObserver observer = new SyncStreamObserver(outgoingQueue);
+                try {
+                    observer.metadata = getMetadata();
+                } catch (Exception metaEx) {
+                    // retry if getMetadata fails
+                    String message = metaEx.getMessage();
+                    log.debug("Metadata request error: {}, will restart", message, metaEx);
+                    enqueueError(String.format("Error in getMetadata request: %s", message));
+                    Thread.sleep(this.maxBackoffMs);
```
Member comment on the added sleep: This sleep helps prevent tight loops during retries, which can be a problem if an upstream connection in a proxy is down.
```diff
+                    continue;
+                }
+
+                try {
+                    syncFlags(observer);
+                } catch (Exception ex) {
+                    log.error("Unexpected sync stream exception, will restart.", ex);
+                    enqueueError(String.format("Error in syncStream: %s", ex.getMessage()));
+                    Thread.sleep(this.maxBackoffMs);
+                }
+            } catch (InterruptedException ie) {
+                log.debug("Stream loop interrupted, most likely shutdown was invoked", ie);
+            }
         }
```
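The broader point of the `observeSyncStream` change is that pacing now lives in the loop itself rather than in a broad gRPC retry policy: after a metadata or stream failure the thread sleeps for the configured max backoff before re-issuing the request, and `InterruptedException` is treated as shutdown. Below is a minimal, self-contained sketch of that loop shape; names like `doStreamWork` and `reportError` are placeholders, not the provider's API.

```java
import java.util.concurrent.atomic.AtomicBoolean;

// Sketch of the restart-loop pattern used above: sleep for the configured max
// backoff after a failure instead of relying on gRPC retries to pace restarts.
class RestartLoopSketch {
    private final AtomicBoolean shutdown = new AtomicBoolean(false);
    private final int maxBackoffMs = 5_000; // e.g. FlagdOptions#getRetryBackoffMaxMs()

    void run() {
        while (!shutdown.get()) {
            try {
                try {
                    doStreamWork(); // placeholder for metadata fetch + stream observation
                } catch (Exception ex) {
                    reportError(ex);
                    Thread.sleep(maxBackoffMs); // avoid a tight restart loop
                }
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt(); // most likely shutdown was invoked
                return;
            }
        }
    }

    void doStreamWork() throws Exception { /* placeholder */ }

    void reportError(Exception ex) { /* placeholder */ }
}
```

The sketch uses a fixed sleep for simplicity; in the PR, per-RPC retries on UNAVAILABLE/UNKNOWN still apply via the service config above, and the loop-level sleep only paces full stream restarts.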