-
-
Notifications
You must be signed in to change notification settings - Fork 438
fix: add backoff for rate-limited batch downloads in range sync #8924
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: unstable
Are you sure you want to change the base?
Changes from 10 commits
bf7145b
dd764c6
a463db0
29fdc5c
c2fc311
0c00da5
920dba0
8d89a30
eb3f889
3d6f704
ded88f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| import {ChainForkConfig} from "@lodestar/config"; | ||
| import {Epoch, Root, Slot} from "@lodestar/types"; | ||
| import {ErrorAborted, LodestarError, Logger, toRootHex} from "@lodestar/utils"; | ||
| import {ErrorAborted, LodestarError, Logger, sleep, toRootHex} from "@lodestar/utils"; | ||
| import {isBlockInputBlobs, isBlockInputColumns} from "../../chain/blocks/blockInput/blockInput.js"; | ||
| import {BlockInputErrorCode} from "../../chain/blocks/blockInput/errors.js"; | ||
| import {IBlockInput} from "../../chain/blocks/blockInput/types.js"; | ||
|
|
@@ -15,7 +15,7 @@ import {ItTrigger} from "../../util/itTrigger.js"; | |
| import {PeerIdStr} from "../../util/peerId.js"; | ||
| import {WarnResult, wrapError} from "../../util/wrapError.js"; | ||
| import {BATCH_BUFFER_SIZE, EPOCHS_PER_BATCH, MAX_LOOK_AHEAD_EPOCHS} from "../constants.js"; | ||
| import {DownloadByRangeError, DownloadByRangeErrorCode} from "../utils/downloadByRange.js"; | ||
| import {DownloadByRangeError, DownloadByRangeErrorCode, isRateLimitRequestError} from "../utils/downloadByRange.js"; | ||
| import {RangeSyncType} from "../utils/remoteSyncType.js"; | ||
| import {Batch, BatchError, BatchErrorCode, BatchMetadata, BatchStatus} from "./batch.js"; | ||
| import { | ||
|
|
@@ -420,7 +420,11 @@ export class SyncChain { | |
| // Note: Don't count batches in the AwaitingValidation state, to prevent stalling sync | ||
| // if the current processing window is contained in a long range of skip slots. | ||
| const batchesInBuffer = batches.filter((batch) => { | ||
| return batch.state.status === BatchStatus.Downloading || batch.state.status === BatchStatus.AwaitingProcessing; | ||
| return ( | ||
| batch.state.status === BatchStatus.Downloading || | ||
| batch.state.status === BatchStatus.RateLimited || | ||
| batch.state.status === BatchStatus.AwaitingProcessing | ||
| ); | ||
| }); | ||
| if (batchesInBuffer.length > BATCH_BUFFER_SIZE) { | ||
| return null; | ||
|
|
@@ -471,43 +475,76 @@ export class SyncChain { | |
|
|
||
| if (res.err) { | ||
| // There are several known error cases where we want to take action on the peer | ||
| const errCode = (res.err as LodestarError<{code: string}>).type?.code; | ||
| const errCode = (res.err as LodestarError<{code: string}>).type?.code ?? (res.err as {code?: string}).code; | ||
| this.metrics?.syncRange.downloadByRange.error.inc({client: peer.client, code: errCode ?? "UNKNOWN"}); | ||
| if (this.syncType === RangeSyncType.Finalized) { | ||
| // For finalized sync, we are stricter with peers as there is no ambiguity about which chain we're syncing. | ||
| // The below cases indicate the peer may be on a different chain, so are not penalized during head sync. | ||
|
|
||
| // Rate-limited responses are handled with backoff rather than peer penalties. | ||
| // The peer is healthy but throttling us — penalizing it would make things worse. | ||
| const isRateLimited = isRateLimitRequestError(errCode); | ||
| if (isRateLimited) { | ||
| const delayMs = batch.downloadingRateLimited(peer.peerId); | ||
| if (delayMs > 0) { | ||
| const uniqueRateLimitedPeers = [...new Set(batch.rateLimitedPeers)]; | ||
| this.logger.debug("Batch download rate limited", { | ||
| id: this.logId, | ||
| ...batch.getMetadata(), | ||
| peer: prettyPrintPeerIdStr(peer.peerId), | ||
| rateLimitedPeers: uniqueRateLimitedPeers.map((peerId) => prettyPrintPeerIdStr(peerId)).join(", "), | ||
| delayMs, | ||
| }); | ||
| // Wait for cooldown before transitioning back to AwaitingDownload so triggerBatchDownloader can select | ||
| // a different peer. Rate-limited peers are tracked in getFailedPeers(), | ||
| // so peerBalancer will prefer alternative peers. If no alternative is available, | ||
| // the backoff delay is applied before retrying with the same peer pool. | ||
| await sleep(delayMs); | ||
|
||
| batch.endCoolDown(); | ||
| } else { | ||
| this.logger.debug("Batch download rate limited, max retries exhausted", { | ||
| id: this.logId, | ||
| ...batch.getMetadata(), | ||
| peer: prettyPrintPeerIdStr(peer.peerId), | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| // Important: avoid duplicate error logging and downloadingError() for rate-limited responses. | ||
| if (!isRateLimited) { | ||
| if (this.syncType === RangeSyncType.Finalized) { | ||
| // For finalized sync, we are stricter with peers as there is no ambiguity about which chain we're syncing. | ||
| // The below cases indicate the peer may be on a different chain, so are not penalized during head sync. | ||
| switch (errCode) { | ||
| case BlockInputErrorCode.MISMATCHED_ROOT_HEX: | ||
| case DownloadByRangeErrorCode.MISSING_BLOBS: | ||
| case DownloadByRangeErrorCode.EXTRA_BLOBS: | ||
| case DownloadByRangeErrorCode.MISSING_COLUMNS: | ||
| case DownloadByRangeErrorCode.EXTRA_COLUMNS: | ||
| case BlobSidecarErrorCode.INCORRECT_SIDECAR_COUNT: | ||
| case BlobSidecarErrorCode.INCORRECT_BLOCK: | ||
| case DataColumnSidecarErrorCode.INCORRECT_SIDECAR_COUNT: | ||
| case DataColumnSidecarErrorCode.INCORRECT_BLOCK: | ||
| this.reportPeer(peer.peerId, PeerAction.LowToleranceError, res.err.message); | ||
| } | ||
| } | ||
| switch (errCode) { | ||
| case BlockInputErrorCode.MISMATCHED_ROOT_HEX: | ||
| case DownloadByRangeErrorCode.MISSING_BLOBS: | ||
| case DownloadByRangeErrorCode.EXTRA_BLOBS: | ||
| case DownloadByRangeErrorCode.MISSING_COLUMNS: | ||
| case DownloadByRangeErrorCode.EXTRA_COLUMNS: | ||
| case BlobSidecarErrorCode.INCORRECT_SIDECAR_COUNT: | ||
| case BlobSidecarErrorCode.INCORRECT_BLOCK: | ||
| case DataColumnSidecarErrorCode.INCORRECT_SIDECAR_COUNT: | ||
| case DataColumnSidecarErrorCode.INCORRECT_BLOCK: | ||
| case DownloadByRangeErrorCode.EXTRA_BLOCKS: | ||
| case DownloadByRangeErrorCode.OUT_OF_ORDER_BLOCKS: | ||
| case DownloadByRangeErrorCode.OUT_OF_RANGE_BLOCKS: | ||
| case DownloadByRangeErrorCode.PARENT_ROOT_MISMATCH: | ||
| case BlobSidecarErrorCode.INCLUSION_PROOF_INVALID: | ||
| case BlobSidecarErrorCode.INVALID_KZG_PROOF_BATCH: | ||
| case DataColumnSidecarErrorCode.INCORRECT_KZG_COMMITMENTS_COUNT: | ||
| case DataColumnSidecarErrorCode.INCORRECT_KZG_PROOF_COUNT: | ||
| case DataColumnSidecarErrorCode.INVALID_KZG_PROOF_BATCH: | ||
| case DataColumnSidecarErrorCode.INCLUSION_PROOF_INVALID: | ||
| this.reportPeer(peer.peerId, PeerAction.LowToleranceError, res.err.message); | ||
| } | ||
| this.logger.verbose( | ||
| "Batch download error", | ||
| {id: this.logId, ...batch.getMetadata(), peer: prettyPrintPeerIdStr(peer.peerId)}, | ||
| res.err | ||
| ); | ||
| batch.downloadingError(peer.peerId); // Throws after MAX_DOWNLOAD_ATTEMPTS | ||
| } | ||
| switch (errCode) { | ||
| case DownloadByRangeErrorCode.EXTRA_BLOCKS: | ||
| case DownloadByRangeErrorCode.OUT_OF_ORDER_BLOCKS: | ||
| case DownloadByRangeErrorCode.OUT_OF_RANGE_BLOCKS: | ||
| case DownloadByRangeErrorCode.PARENT_ROOT_MISMATCH: | ||
| case BlobSidecarErrorCode.INCLUSION_PROOF_INVALID: | ||
| case BlobSidecarErrorCode.INVALID_KZG_PROOF_BATCH: | ||
| case DataColumnSidecarErrorCode.INCORRECT_KZG_COMMITMENTS_COUNT: | ||
| case DataColumnSidecarErrorCode.INCORRECT_KZG_PROOF_COUNT: | ||
| case DataColumnSidecarErrorCode.INVALID_KZG_PROOF_BATCH: | ||
| case DataColumnSidecarErrorCode.INCLUSION_PROOF_INVALID: | ||
| this.reportPeer(peer.peerId, PeerAction.LowToleranceError, res.err.message); | ||
| } | ||
| this.logger.verbose( | ||
| "Batch download error", | ||
| {id: this.logId, ...batch.getMetadata(), peer: prettyPrintPeerIdStr(peer.peerId)}, | ||
| res.err | ||
| ); | ||
| batch.downloadingError(peer.peerId); // Throws after MAX_DOWNLOAD_ATTEMPTS | ||
| } else { | ||
| this.logger.verbose("Batch download success", { | ||
| id: this.logId, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.