|
7 | 7 | "strings" |
8 | 8 |
|
9 | 9 | "github.com/10gen/migration-verifier/internal/logger" |
| 10 | + "github.com/10gen/migration-verifier/mmongo" |
| 11 | + mapset "github.com/deckarep/golang-set/v2" |
10 | 12 | "github.com/pkg/errors" |
11 | 13 | "go.mongodb.org/mongo-driver/bson" |
12 | 14 | "go.mongodb.org/mongo-driver/mongo" |
@@ -184,62 +186,83 @@ func isNetworkError(err error) bool { |
184 | 186 | return mongo.IsNetworkError(err) |
185 | 187 | } |
186 | 188 |
|
| 189 | +// The list below was copied from mongosync. |
| 190 | +// (Not every error code is known to happen in migration-verifier.) |
| 191 | +var transientErrorCodes = mapset.NewSet( |
| 192 | + 6, // HostUnreachable |
| 193 | + 7, // HostNotFound |
| 194 | + |
| 195 | + // CursorNotFound sometimes happens due to inconsistencies |
| 196 | + // in the server’s sharding abstraction. See REP-2440. |
| 197 | + 43, // CursorNotFound |
| 198 | + |
| 199 | + 50, // MaxTimeMSExpired |
| 200 | + 63, // OBSOLETE_StaleShardVersion |
| 201 | + 64, // WriteConcernFailed |
| 202 | + |
| 203 | + // This seems to be possible if a shard is unavailable due |
| 204 | + // to an election. See REP-2926. |
| 205 | + 70, // ShardNotFound |
| 206 | + |
| 207 | + 89, // NetworkTimeout |
| 208 | + 90, // CallbackCanceled |
| 209 | + 91, // ShutdownInProgress |
| 210 | + 112, // WriteConflict |
| 211 | + 117, // ConflictingOperationInProgress |
| 212 | + 133, // FailedToSatisfyReadPreference |
| 213 | + 134, // ReadConcernMajorityNotAvailableYet |
| 214 | + 136, // CappedPositionLost |
| 215 | + 175, // QueryPlanKilled |
| 216 | + 187, // LinearizableReadConcernError |
| 217 | + 189, // PrimarySteppedDown |
| 218 | + 202, // NetworkInterfaceExceededTimeLimit |
| 219 | + 211, // KeyNotFound |
| 220 | + 251, // NoSuchTransaction |
| 221 | + 262, // ExceededTimeLimit |
| 222 | + 282, // TransactionCoordinatorReachedAbortDecision |
| 223 | + 290, // TransactionExceededLifetimeLimitSeconds |
| 224 | + 314, // ObjectIsBusy |
| 225 | + 317, // ConnectionPoolExpired |
| 226 | + 358, // InternalTransactionNotSupported |
| 227 | + 365, // TemporarilyUnavailable |
| 228 | + 384, // ConnectionError |
| 229 | + 402, // ResourceExhausted |
| 230 | + 406, // MigrationBlockingOperationCoordinatorCleaningUp |
| 231 | + 407, // PooledConnectionAcquisitionExceededTimeLimit |
| 232 | + 412, // UpdatesStillPending |
| 233 | + 9001, // SocketException |
| 234 | + 10107, // NotWritablePrimary |
| 235 | + 11600, // InterruptedAtShutdown |
| 236 | + 11601, // Interrupted |
| 237 | + 11602, // InterruptedDueToReplStateChange |
| 238 | + 12586, // BackgroundOperationInProgressForDatabase |
| 239 | + 12587, // BackgroundOperationInProgressForNamespace |
| 240 | + 13388, // StaleConfig |
| 241 | + 13435, // NotPrimaryNoSecondaryOk |
| 242 | + 13436, // NotPrimaryOrSecondary |
| 243 | + |
| 244 | + 50915, // BackupCursorOpenConflictWithCheckpoint |
| 245 | + 91331, // RemoteCommandFailed |
| 246 | +) |
| 247 | + |
187 | 248 | // hasTransientErrorCode returns true if the error has one of a set of known-to-be-transient |
188 | 249 | // Mongo server error codes. |
189 | 250 | func hasTransientErrorCode(err error) bool { |
190 | | - switch GetErrorCode(err) { |
191 | | - case 6, 7, 64, 89, 91, 112, 136, 175, 189, 202, 262, 290, 314, 317, |
192 | | - 9001, 10107, 11600, 11601, 11602, 13388, 13435, 13436: |
193 | | - // These error codes are either listed as retryable in the remote command retry |
194 | | - // scheduler, or have been added here deliberately, since they have been observed to be |
195 | | - // issued when applyOps/find/getMore is interrupted while the server is being shut |
196 | | - // down. |
197 | | - // |
198 | | - // There is a list of error codes at |
199 | | - // https://github.com/mongodb/mongo/blob/master/src/mongo/base/error_codes.yml. The |
200 | | - // list below includes all codes that are in the NetworkError and RetriableError |
201 | | - // categories, except 358 (InternalTransactionNotSupported) and 50915 |
202 | | - // (BackupCursorOpenConflictWithCheckpoint), as these do not apply to any operations |
203 | | - // performed by mongosync. |
204 | | - // |
205 | | - // 6 HostUnreachable |
206 | | - // 7 HostNotFound |
207 | | - // 64 WriteConcernFailed |
208 | | - // 89 NetworkTimeout |
209 | | - // 91 ShutdownInProgress |
210 | | - // 112 WriteConflict |
211 | | - // 136 CappedPositionLost - XXX - there was some discussion over whether this should be included |
212 | | - // 175 QueryPlanKilled, e.g. when a collection is dropped/renamed while a cursor is open on it |
213 | | - // 189 PrimarySteppedDown |
214 | | - // 202 NetworkInterfaceExceededTimeLimit |
215 | | - // 262 ExceededTimeLimit |
216 | | - // 290 TransactionExceededLifetimeLimitSeconds |
217 | | - // 314 ObjectIsBusy |
218 | | - // 317 ConnectionPoolExpired |
219 | | - // 9001 SocketException |
220 | | - // 10107 NotWritablePrimary |
221 | | - // 11600 InterruptedAtShutdown |
222 | | - // 11601 Interrupted |
223 | | - // 11602 InterruptedDueToReplStateChange |
224 | | - // 13388 StaleConfig |
225 | | - // 13435 NotPrimaryNoSecondaryOk |
226 | | - // 13436 NotPrimaryOrSecondary |
227 | | - return true |
228 | | - case 0: |
| 251 | + if GetErrorCode(err) == 0 { |
229 | 252 | // The server may send "not master" without an error code. |
230 | 253 | if strings.Contains(err.Error(), "not master") { |
231 | 254 | return true |
232 | 255 | } |
233 | | - // These codes only apply to DDL operations. However, we decided that |
234 | | - // there's no harm in including them in the default list. See REP-1289 for |
235 | | - // more details. |
236 | | - case 63, 117, 12586, 12587: |
237 | | - // 63 OBSOLETE_StaleShardVersion |
238 | | - // 117 ConflictingOperationInProgress |
239 | | - // 12586 BackgroundOperationInProgressForDatabase |
240 | | - // 12587 BackgroundOperationInProgressForNamespace |
241 | | - return true |
242 | 256 | } |
| 257 | + |
| 258 | + // Now check whether any of the transient error codes appears |
| 259 | + // in the error. |
| 260 | + for code := range transientErrorCodes.Iter() { |
| 261 | + if mmongo.ErrorHasCode(err, code) { |
| 262 | + return true |
| 263 | + } |
| 264 | + } |
| 265 | + |
243 | 266 | return false |
244 | 267 | } |
245 | 268 |
|
@@ -271,22 +294,17 @@ func IsCollectionUUIDMismatchError(err error) bool { |
271 | 294 | return GetErrorCode(err) == 361 |
272 | 295 | } |
273 | 296 |
|
274 | | -// IsServerError returns true if the error implements the ServerError interface in driver. |
275 | | -func IsServerError(err error) bool { |
276 | | - // Get the cause of the err. |
277 | | - cause := errors.Cause(err) |
278 | | - _, ok := cause.(mongo.ServerError) |
279 | | - |
280 | | - return ok |
281 | | -} |
282 | | - |
283 | 297 | // IsCommandNotSupportedOnViewError returns true if this is a CommandNotSupportedOnView error. |
284 | 298 | func IsCommandNotSupportedOnViewError(err error) bool { |
285 | 299 | return GetErrorCode(err) == 166 |
286 | 300 | } |
287 | 301 |
|
288 | | -// GetErrorCode returns the error code corresponding to the provided error. |
| 302 | +// GetErrorCode returns the provided error’s top-level error code. |
289 | 303 | // It returns 0 if the error is nil or not one of the supported error types. |
| 304 | +// |
| 305 | +// CAUTION: Server errors can contain multiple errors, so inspecting just |
| 306 | +// the top-level error code is often not enough for proper error handling. |
| 307 | +// Consider using mongo.ServerError.HasErrorCode() instead. |
290 | 308 | func GetErrorCode(err error) int { |
291 | 309 | switch e := errors.Cause(err).(type) { |
292 | 310 | case mongo.CommandError: |
|
0 commit comments