@@ -996,125 +996,36 @@ class UpdateMachine {
996996 }());
997997 }
998998
999- // This is static so that it persists through new UpdateMachine instances
1000- // as we attempt to fix things by reloading data from scratch. In principle
1001- // it could also be per-account (or per-realm or per-server); but currently
1002- // we skip that complication, as well as attempting to reset backoff on
1003- // later success. After all, these unexpected errors should be uncommon;
1004- // ideally they'd never happen.
1005- static BackoffMachine get _unexpectedErrorBackoffMachine {
1006- return __unexpectedErrorBackoffMachine
1007- ??= BackoffMachine(maxBound: const Duration(seconds: 60));
999+ Future<void> _debugLoopWait() async {
1000+ await _debugLoopSignal!.future;
1001+ if (_disposed) return;
1002+ assert(() {
1003+ _debugLoopSignal = Completer();
1004+ return true;
1005+ }());
10081006 }
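The `_debugLoopSignal` completer lets tests advance the poll loop one iteration at a time: the loop awaits the completer, and test code completes it when it wants the next `getEvents` request to go out. The test-side hook isn't shown in this diff; a minimal sketch of the pattern, using hypothetical names (`PollLoop`, `debugAdvanceLoop`) rather than the app's real test API, might look like:

```dart
// Sketch only: PollLoop and debugAdvanceLoop are hypothetical names,
// not the app's real test API.
import 'dart:async';

class PollLoop {
  Completer<void>? debugLoopSignal = Completer<void>();
  int requestsMade = 0;

  Future<void> run() async {
    while (true) {
      // Wait until the test allows one more iteration, then re-arm the gate.
      await debugLoopSignal!.future;
      debugLoopSignal = Completer<void>();
      requestsMade++; // stand-in for the real getEvents long-poll request
    }
  }

  /// Test-side hook: let the loop run exactly one more iteration.
  void debugAdvanceLoop() => debugLoopSignal!.complete();
}

Future<void> main() async {
  final loop = PollLoop();
  unawaited(loop.run());
  loop.debugAdvanceLoop();
  await Future<void>.delayed(Duration.zero);
  assert(loop.requestsMade == 1); // ran once, now parked on the next await
}
```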
1009- static BackoffMachine ? __unexpectedErrorBackoffMachine;
1010-
1011- /// This controls when we start to report transient errors to the user when
1012- /// polling.
1013- ///
1014- /// At the 6th failure, the expected time elapsed since the first failure
1015- /// will be 1.55 seconds.
1016- static const transientFailureCountNotifyThreshold = 5 ;
10171007
10181008 void poll () async {
10191009 assert (! _disposed);
10201010 try {
1021- BackoffMachine ? backoffMachine;
1022- int accumulatedTransientFailureCount = 0 ;
1023-
1024- /// This only reports transient errors after reaching
1025- /// a pre-defined threshold of retries.
1026- void maybeReportToUserTransientError (Object error) {
1027- accumulatedTransientFailureCount++ ;
1028- if (accumulatedTransientFailureCount > transientFailureCountNotifyThreshold) {
1029- _reportToUserErrorConnectingToServer (error);
1030- }
1031- }
1032-
10331011 while (true ) {
10341012 if (_debugLoopSignal != null ) {
1035- await _debugLoopSignal!.future;
1013+ await _debugLoopWait () ;
10361014 if (_disposed) return ;
1037- assert (() {
1038- _debugLoopSignal = Completer ();
1039- return true ;
1040- }());
10411015 }
10421016
10431017 final GetEventsResult result;
10441018 try {
10451019 result = await getEvents (store.connection,
10461020 queueId: queueId, lastEventId: lastEventId);
10471021 if (_disposed) return ;
1048- } catch (e) {
1022+ } catch (e, stackTrace ) {
10491023 if (_disposed) return ;
1050- store.isLoading = true ;
1051-
1052- if (e is! ApiRequestException) {
1053- // Some unexpected error, outside even making the HTTP request.
1054- // Definitely a bug in our code.
1055- rethrow ;
1056- }
1057-
1058- bool shouldReportToUser;
1059- switch (e) {
1060- case NetworkException (cause: SocketException ()):
1061- // A [SocketException] is common when the app returns from sleep.
1062- shouldReportToUser = false ;
1063-
1064- case NetworkException ():
1065- case Server5xxException ():
1066- shouldReportToUser = true ;
1067-
1068- case ServerException (httpStatus: 429 ):
1069- case ZulipApiException (httpStatus: 429 ):
1070- case ZulipApiException (code: 'RATE_LIMIT_HIT' ):
1071- // TODO(#946) handle rate-limit errors more generally, in ApiConnection
1072- shouldReportToUser = true ;
1073-
1074- case ZulipApiException (code: 'BAD_EVENT_QUEUE_ID' ):
1075- rethrow ;
1076-
1077- case ZulipApiException ():
1078- case MalformedServerResponseException ():
1079- // Either a 4xx we didn't expect, or a malformed response;
1080- // in either case, a mismatch of the client's expectations to the
1081- // server's behavior, and therefore a bug in one or the other.
1082- // TODO(#1054) handle auth failures specifically
1083- rethrow ;
1084- }
1085-
1086- assert (debugLog ('Transient error polling event queue for $store : $e \n '
1087- 'Backing off, then will retry…' ));
1088- if (shouldReportToUser) {
1089- maybeReportToUserTransientError (e);
1090- }
1091- await (backoffMachine ??= BackoffMachine()).wait();
1024+ await _handlePollRequestError (e, stackTrace); // may rethrow
10921025 if (_disposed) return ;
1093- assert (debugLog ('… Backoff wait complete, retrying poll.' ));
10941026 continue ;
10951027 }
1096-
1097- // After one successful request, we reset backoff to its initial state.
1098- // That way if the user is off the network and comes back on, the app
1099- // doesn't wind up in a state where it's slow to recover the next time
1100- // one request fails.
1101- //
1102- // This does mean that if the server is having trouble and handling some
1103- // but not all of its requests, we'll end up doing a lot more retries than
1104- // if we stayed at the max backoff interval; partway toward what would
1105- // happen if we weren't backing off at all.
1106- //
1107- // But at least for [getEvents] requests, as here, it should be OK,
1108- // because this is a long-poll. That means a typical successful request
1109- // takes a long time to come back; in fact longer than our max backoff
1110- // duration (which is 10 seconds). So if we're getting a mix of successes
1111- // and failures, the successes themselves should space out the requests.
1112- backoffMachine = null ;
1113-
1114- store.isLoading = false ;
1115- // Dismiss existing errors, if any.
1116- reportErrorToUserBriefly (null );
1117- accumulatedTransientFailureCount = 0 ;
1028+ _clearPollErrors ();
11181029
11191030 final events = result.events;
11201031 for (final event in events) {
@@ -1133,61 +1044,186 @@ class UpdateMachine {
11331044 }
11341045 } catch (e) {
11351046 if (_disposed) return ;
1047+ await _handlePollError (e);
1048+ assert (_disposed);
1049+ return ;
1050+ }
1051+ }
11361052
1137- // An error occurred, other than the transient request errors we retry on.
1138- // This means either a lost/expired event queue on the server (which is
1139- // normal after the app is offline for a period like 10 minutes),
1140- // or an unexpected exception representing a bug in our code or the server.
1141- // Either way, the show must go on. So reload server data from scratch.
1142-
1143- // First, log what happened.
1144- store.isLoading = true ;
1145- bool isUnexpected;
1146- switch (e) {
1147- case ZulipApiException (code: 'BAD_EVENT_QUEUE_ID' ):
1148- assert (debugLog ('Lost event queue for $store . Replacing…' ));
1149- // The old event queue is gone, so we need a new one. This is normal.
1150- isUnexpected = false ;
1151-
1152- case _EventHandlingException (: final cause, : final event):
1153- assert (debugLog ('BUG: Error handling an event: $cause \n ' // TODO(log)
1154- ' event: $event \n '
1155- 'Replacing event queue…' ));
1156- reportErrorToUserBriefly (
1157- GlobalLocalizations .zulipLocalizations.errorHandlingEventTitle,
1158- details: GlobalLocalizations .zulipLocalizations.errorHandlingEventDetails (
1159- store.realmUrl.toString (), cause.toString (), event.toString ()));
1160- // We can't just continue with the next event, because our state
1161- // may be garbled due to failing to apply this one (and worse,
1162- // any invariants that were left in a broken state from where
1163- // the exception was thrown). So reload from scratch.
1164- // Hopefully (probably?) the bug only affects our implementation of
1165- // the *change* in state represented by the event, and when we get the
1166- // new state in a fresh InitialSnapshot we'll handle that just fine.
1167- isUnexpected = true ;
1168-
1169- default :
1170- assert (debugLog ('BUG: Unexpected error in event polling: $e \n ' // TODO(log)
1171- 'Replacing event queue…' ));
1172- _reportToUserErrorConnectingToServer (e);
1173- // Similar story to the _EventHandlingException case;
1174- // separate only so that that other case can print more context.
1175- // The bug here could be in the server if it's an ApiRequestException,
1176- // but our remedy is the same either way.
1177- isUnexpected = true ;
1178- }
1053+ // This is static so that it persists through new UpdateMachine instances
1054+ // as we attempt to fix things by reloading data from scratch. In principle
1055+ // it could also be per-account (or per-realm or per-server); but currently
1056+ // we skip that complication, as well as attempting to reset backoff on
1057+ // later success. After all, these unexpected errors should be uncommon;
1058+ // ideally they'd never happen.
1059+ static BackoffMachine get _unexpectedErrorBackoffMachine {
1060+ return __unexpectedErrorBackoffMachine
1061+ ??= BackoffMachine(maxBound: const Duration(seconds: 60));
1062+ }
1063+ static BackoffMachine? __unexpectedErrorBackoffMachine;
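Neither hunk shows `BackoffMachine` itself; the surrounding comments only tell us it produces successive waits that grow up to a `maxBound` (10 seconds for ordinary polling retries, 60 seconds here for unexpected errors). A rough sketch of that shape — assuming a ~100 ms first bound, doubling growth, and full jitter, which are assumptions rather than the class's confirmed constants — might look like:

```dart
import 'dart:math';

/// Rough sketch of an exponential-backoff helper with a cap and full jitter;
/// the app's real BackoffMachine may differ in its constants and shape.
class BackoffSketch {
  BackoffSketch({
    this.firstBound = const Duration(milliseconds: 100),
    this.maxBound = const Duration(seconds: 10),
  });

  final Duration firstBound;
  final Duration maxBound;
  final Random _random = Random();
  Duration? _bound;

  /// Wait a uniformly random duration up to the current bound,
  /// then double the bound for next time, capping it at [maxBound].
  Future<void> wait() async {
    final bound = _bound ?? firstBound;
    final waitMs = (_random.nextDouble() * bound.inMilliseconds).round();
    await Future<void>.delayed(Duration(milliseconds: waitMs));
    final doubled = bound * 2;
    _bound = doubled > maxBound ? maxBound : doubled;
  }
}
```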
11791064
1180- if (isUnexpected) {
1181- // We don't know the cause of the failure; it might well keep happening.
1182- // Avoid creating a retry storm.
1183- await _unexpectedErrorBackoffMachine.wait ();
1184- if (_disposed) return ;
1185- }
1065+ BackoffMachine? _pollBackoffMachine;
11861066
1187- // This disposes the store, which disposes this update machine.
1188- await store._globalStore._reloadPerAccount (store.accountId);
1189- assert (debugLog ('… Event queue replaced.' ));
1190- return ;
1067+ /// This controls when we start to report transient errors to the user when
1068+ /// polling.
1069+ ///
1070+ /// At the 6th failure, the expected time elapsed since the first failure
1071+ /// will be 1.55 seconds.
1072+ static const transientFailureCountNotifyThreshold = 5 ;
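The 1.55-second figure is consistent with a backoff schedule like the sketch above: if each wait is uniformly random up to a bound that starts at 100 ms and doubles per failure, the expected wait is half the bound, so the expected total after five failures is 50 + 100 + 200 + 400 + 800 ms. A quick check under those (assumed) defaults:

```dart
void main() {
  // Bounds for the first five failures, assuming a 100 ms bound that doubles.
  const boundsMs = [100, 200, 400, 800, 1600];
  // With full jitter, the expected wait is half of each bound.
  final expectedSeconds =
      boundsMs.fold<double>(0, (sum, b) => sum + b / 2) / 1000;
  print(expectedSeconds); // 1.55
}
```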
1073+
1074+ int _accumulatedTransientFailureCount = 0 ;
1075+
1076+ void _clearPollErrors () {
1077+ // After one successful request, we reset backoff to its initial state.
1078+ // That way if the user is off the network and comes back on, the app
1079+ // doesn't wind up in a state where it's slow to recover the next time
1080+ // one request fails.
1081+ //
1082+ // This does mean that if the server is having trouble and handling some
1083+ // but not all of its requests, we'll end up doing a lot more retries than
1084+ // if we stayed at the max backoff interval; partway toward what would
1085+ // happen if we weren't backing off at all.
1086+ //
1087+ // But at least for [getEvents] requests, as here, it should be OK,
1088+ // because this is a long-poll. That means a typical successful request
1089+ // takes a long time to come back; in fact longer than our max backoff
1090+ // duration (which is 10 seconds). So if we're getting a mix of successes
1091+ // and failures, the successes themselves should space out the requests.
1092+ _pollBackoffMachine = null ;
1093+
1094+ store.isLoading = false ;
1095+ _accumulatedTransientFailureCount = 0 ;
1096+ reportErrorToUserBriefly (null );
1097+ }
1098+
1099+ /// Sort out an error from the network request in [poll]:
1100+ /// either wait for a backoff duration (and possibly report the error),
1101+ /// or rethrow.
1102+ ///
1103+ /// If the request should be retried, this method uses [_pollBackoffMachine]
1104+ /// to wait an appropriate backoff duration for that retry,
1105+ /// after reporting the error if appropriate to the user and/or developer.
1106+ ///
1107+ /// Otherwise this method rethrows the error, with no other side effects.
1108+ ///
1109+ /// See also:
1110+ /// * [_handlePollError], which handles errors from the rest of [poll]
1111+ /// and errors this method rethrows.
1112+ Future<void> _handlePollRequestError(Object error, StackTrace stackTrace) async {
1113+ store.isLoading = true ;
1114+
1115+ if (error is! ApiRequestException) {
1116+ // Some unexpected error, outside even making the HTTP request.
1117+ // Definitely a bug in our code.
1118+ Error.throwWithStackTrace(error, stackTrace);
1119+ }
1120+
1121+ bool shouldReportToUser;
1122+ switch (error) {
1123+ case NetworkException (cause: SocketException ()):
1124+ // A [SocketException] is common when the app returns from sleep.
1125+ shouldReportToUser = false ;
1126+
1127+ case NetworkException ():
1128+ case Server5xxException ():
1129+ shouldReportToUser = true ;
1130+
1131+ case ServerException (httpStatus: 429 ):
1132+ case ZulipApiException (httpStatus: 429 ):
1133+ case ZulipApiException (code: 'RATE_LIMIT_HIT' ):
1134+ // TODO(#946) handle rate-limit errors more generally, in ApiConnection
1135+ shouldReportToUser = true ;
1136+
1137+ case ZulipApiException (code: 'BAD_EVENT_QUEUE_ID' ):
1138+ Error.throwWithStackTrace(error, stackTrace);
1139+
1140+ case ZulipApiException ():
1141+ case MalformedServerResponseException ():
1142+ // Either a 4xx we didn't expect, or a malformed response;
1143+ // in either case, a mismatch of the client's expectations to the
1144+ // server's behavior, and therefore a bug in one or the other.
1145+ // TODO(#1054) handle auth failures specifically
1146+ Error.throwWithStackTrace(error, stackTrace);
1147+ }
1148+
1149+ assert(debugLog('Transient error polling event queue for $store: $error\n'
1150+ 'Backing off, then will retry…'));
1151+ if (shouldReportToUser) {
1152+ _maybeReportToUserTransientError (error);
1153+ }
1154+ await (_pollBackoffMachine ??= BackoffMachine()).wait();
1155+ if (_disposed) return ;
1156+ assert (debugLog ('… Backoff wait complete, retrying poll.' ));
1157+ }
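One detail worth calling out: the old code used `rethrow`, which is only legal inside a `catch` clause. Since the handling now lives in a helper that receives the error and stack trace as parameters, it uses `Error.throwWithStackTrace` (a standard `dart:core` API) so the propagated error keeps the stack trace captured at the original throw site rather than a fresh one starting in the helper. A small standalone illustration:

```dart
Never failWithOriginalTrace(Object error, StackTrace stackTrace) {
  // `throw error;` here would attach a new trace starting at this line;
  // throwWithStackTrace preserves the trace passed in from the catch site.
  Error.throwWithStackTrace(error, stackTrace);
}

void main() {
  try {
    try {
      throw StateError('original failure');
    } catch (e, st) {
      failWithOriginalTrace(e, st);
    }
  } catch (e, st) {
    // The printed trace still points at the `throw StateError(...)` line.
    print(st);
  }
}
```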
1158+
1159+ /// Deal with an error in [poll]: reload server data to replace the store,
1160+ /// after reporting the error as appropriate to the user and/or developer.
1161+ ///
1162+ /// See also:
1163+ /// * [_handlePollRequestError], which handles certain errors
1164+ /// and causes them not to reach this method.
1165+ Future<void> _handlePollError(Object error) async {
1166+ // An error occurred, other than the transient request errors we retry on.
1167+ // This means either a lost/expired event queue on the server (which is
1168+ // normal after the app is offline for a period like 10 minutes),
1169+ // or an unexpected exception representing a bug in our code or the server.
1170+ // Either way, the show must go on. So reload server data from scratch.
1171+
1172+ store.isLoading = true ;
1173+
1174+ bool isUnexpected;
1175+ switch (error) {
1176+ case ZulipApiException (code: 'BAD_EVENT_QUEUE_ID' ):
1177+ assert(debugLog('Lost event queue for $store. Replacing…'));
1178+ // The old event queue is gone, so we need a new one. This is normal.
1179+ isUnexpected = false ;
1180+
1181+ case _EventHandlingException (: final cause, : final event):
1182+ assert(debugLog('BUG: Error handling an event: $cause\n' // TODO(log)
1183+ ' event: $event\n'
1184+ 'Replacing event queue…'));
1185+ reportErrorToUserBriefly (
1186+ GlobalLocalizations .zulipLocalizations.errorHandlingEventTitle,
1187+ details: GlobalLocalizations .zulipLocalizations.errorHandlingEventDetails (
1188+ store.realmUrl.toString (), cause.toString (), event.toString ()));
1189+ // We can't just continue with the next event, because our state
1190+ // may be garbled due to failing to apply this one (and worse,
1191+ // any invariants that were left in a broken state from where
1192+ // the exception was thrown). So reload from scratch.
1193+ // Hopefully (probably?) the bug only affects our implementation of
1194+ // the *change* in state represented by the event, and when we get the
1195+ // new state in a fresh InitialSnapshot we'll handle that just fine.
1196+ isUnexpected = true ;
1197+
1198+ default :
1199+ assert(debugLog('BUG: Unexpected error in event polling: $error\n' // TODO(log)
1200+ 'Replacing event queue…'));
1201+ _reportToUserErrorConnectingToServer (error);
1202+ // Similar story to the _EventHandlingException case;
1203+ // separate only so that that other case can print more context.
1204+ // The bug here could be in the server if it's an ApiRequestException,
1205+ // but our remedy is the same either way.
1206+ isUnexpected = true ;
1207+ }
1208+
1209+ if (isUnexpected) {
1210+ // We don't know the cause of the failure; it might well keep happening.
1211+ // Avoid creating a retry storm.
1212+ await _unexpectedErrorBackoffMachine.wait ();
1213+ if (_disposed) return ;
1214+ }
1215+
1216+ await store._globalStore._reloadPerAccount (store.accountId);
1217+ assert (_disposed);
1218+ assert (debugLog ('… Event queue replaced.' ));
1219+ }
1220+
1221+ /// This only reports transient errors after reaching
1222+ /// a pre-defined threshold of retries.
1223+ void _maybeReportToUserTransientError (Object error) {
1224+ _accumulatedTransientFailureCount++ ;
1225+ if (_accumulatedTransientFailureCount > transientFailureCountNotifyThreshold) {
1226+ _reportToUserErrorConnectingToServer (error);
11911227 }
11921228 }
11931229