Commit 542c861
core/txpool, eth/catalyst: fix racy simulator due to txpool background reset (#28837)
This PR fixes an issue in the new simulated backend. The root cause is that the transaction pool has an internal reset operation that runs on a background thread. When a new transaction is added to the pool via RPC, it is placed into a non-executable queue and only moved to its final location by that background thread. If the machine is overloaded (or simply due to timing), the simulated backend may try to produce the next block whilst the pool has not yet marked the newly added transaction executable, so the block ends up not containing it. This is an issue because we want determinism from the simulator: add a tx, mine a block, and the tx should be in that block.

The PR fixes it by adding a Sync function to the txpool, which waits for the current reset operation (if any) to finish, and then runs an entire round of reset on top. The new round is needed because resets are only triggered by new head events, so newly added transactions will not trigger the outer resets that we can wait on. The transaction pool would eventually do an internal reset on transaction addition too, but there is no easy way to wait on that and no meaningful reason to bubble it across everything. A clean outer reset will at worst be a small no-op goroutine.
1 parent 98eaa57 commit 542c861
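To make the mechanism concrete, here is a minimal, self-contained Go sketch of the handshake the message describes: a background loop that owns the reset work, a Sync call that forces one extra reset round and blocks until it has run, and a termination channel so a late Sync can never hang on a closed pool. This is not the go-ethereum code (the real pool runs resets on their own goroutine and must also cope with a reset already in flight); every name below is made up for the example.

package main

import (
	"errors"
	"fmt"
	"time"
)

// pool mimics the relevant txpool plumbing: a background loop that runs
// resets, a sync channel used to force an extra reset round, and a term
// channel that is closed once the loop has exited.
type pool struct {
	sync chan chan error // carries a waiter channel into the loop
	quit chan struct{}   // asks the loop to stop
	term chan struct{}   // closed when the loop has stopped
}

func newPool() *pool {
	p := &pool{
		sync: make(chan chan error),
		quit: make(chan struct{}),
		term: make(chan struct{}),
	}
	go p.loop()
	return p
}

func (p *pool) loop() {
	defer close(p.term) // let late Sync callers detect the shutdown

	var (
		resetForced bool       // a Sync request is pending
		resetWaiter chan error // whom to notify once that reset has run
	)
	heads := time.NewTicker(10 * time.Millisecond) // stand-in for chain head events
	defer heads.Stop()

	for {
		if resetForced {
			// Run one full reset round on behalf of the waiter, then reply.
			reset()
			resetForced = false
			resetWaiter <- nil
			resetWaiter = nil
		}
		select {
		case <-heads.C:
			reset() // normal, event-driven reset
		case waiter := <-p.sync:
			resetForced, resetWaiter = true, waiter
		case <-p.quit:
			if resetWaiter != nil {
				resetWaiter <- errors.New("pool already terminated")
			}
			return
		}
	}
}

// reset is a placeholder for promoting queued transactions to executable.
func reset() { time.Sleep(time.Millisecond) }

// Sync blocks until one full reset round has run after the call, or until
// the pool terminates.
func (p *pool) Sync() error {
	sync := make(chan error)
	select {
	case p.sync <- sync:
		return <-sync
	case <-p.term:
		return errors.New("pool already terminated")
	}
}

func (p *pool) Close() { close(p.quit) }

func main() {
	p := newPool()
	fmt.Println("sync:", p.Sync()) // a reset is guaranteed to have run by now
	p.Close()
	fmt.Println("sync after close:", p.Sync()) // never hangs: one more round, or a termination error
}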

File tree: 3 files changed, +85 -10 lines changed


core/txpool/txpool.go

Lines changed: 64 additions & 2 deletions
@@ -72,6 +72,9 @@ type TxPool struct {

     subs event.SubscriptionScope // Subscription scope to unsubscribe all on shutdown
     quit chan chan error         // Quit channel to tear down the head updater
+    term chan struct{}           // Termination channel to detect a closed pool
+
+    sync chan chan error // Testing / simulator channel to block until internal reset is done
 }

 // New creates a new transaction pool to gather, sort and filter inbound
@@ -86,6 +89,8 @@ func New(gasTip *big.Int, chain BlockChain, subpools []SubPool) (*TxPool, error)
         subpools:     subpools,
         reservations: make(map[common.Address]SubPool),
         quit:         make(chan chan error),
+        term:         make(chan struct{}),
+        sync:         make(chan chan error),
     }
     for i, subpool := range subpools {
         if err := subpool.Init(gasTip, head, pool.reserver(i, subpool)); err != nil {
@@ -174,6 +179,9 @@ func (p *TxPool) Close() error {
 // outside blockchain events as well as for various reporting and transaction
 // eviction events.
 func (p *TxPool) loop(head *types.Header, chain BlockChain) {
+    // Close the termination marker when the pool stops
+    defer close(p.term)
+
     // Subscribe to chain head events to trigger subpool resets
     var (
         newHeadCh = make(chan core.ChainHeadEvent)
@@ -190,13 +198,23 @@ func (p *TxPool) loop(head *types.Header, chain BlockChain) {
     var (
         resetBusy = make(chan struct{}, 1) // Allow 1 reset to run concurrently
         resetDone = make(chan *types.Header)
+
+        resetForced bool       // Whether a forced reset was requested, only used in simulator mode
+        resetWaiter chan error // Channel waiting on a forced reset, only used in simulator mode
     )
+    // Notify the live reset waiter to not block if the txpool is closed.
+    defer func() {
+        if resetWaiter != nil {
+            resetWaiter <- errors.New("pool already terminated")
+            resetWaiter = nil
+        }
+    }()
     var errc chan error
     for errc == nil {
         // Something interesting might have happened, run a reset if there is
         // one needed but none is running. The resetter will run on its own
         // goroutine to allow chain head events to be consumed contiguously.
-        if newHead != oldHead {
+        if newHead != oldHead || resetForced {
             // Try to inject a busy marker and start a reset if successful
             select {
             case resetBusy <- struct{}{}:
@@ -208,8 +226,17 @@ func (p *TxPool) loop(head *types.Header, chain BlockChain) {
                     resetDone <- newHead
                 }(oldHead, newHead)

+                // If the reset operation was explicitly requested, consider it
+                // being fulfilled and drop the request marker. If it was not,
+                // this is a noop.
+                resetForced = false
+
             default:
-                // Reset already running, wait until it finishes
+                // Reset already running, wait until it finishes.
+                //
+                // Note, this will not drop any forced reset request. If a forced
+                // reset was requested, but we were busy, then when the currently
+                // running reset finishes, a new one will be spun up.
             }
         }
         // Wait for the next chain head event or a previous reset finish
@@ -223,8 +250,26 @@ func (p *TxPool) loop(head *types.Header, chain BlockChain) {
             oldHead = head
             <-resetBusy

+            // If someone is waiting for a reset to finish, notify them, unless
+            // the forced op is still pending. In that case, wait another round
+            // of resets.
+            if resetWaiter != nil && !resetForced {
+                resetWaiter <- nil
+                resetWaiter = nil
+            }
+
         case errc = <-p.quit:
             // Termination requested, break out on the next loop round
+
+        case syncc := <-p.sync:
+            // Transaction pool is running inside a simulator, and we are about
+            // to create a new block. Request a forced sync operation to ensure
+            // that any running reset operation finishes to make block imports
+            // deterministic. On top of that, run a new reset operation to make
+            // transaction insertions deterministic instead of being stuck in a
+            // queue waiting for a reset.
+            resetForced = true
+            resetWaiter = syncc
         }
     }
     // Notify the closer of termination (no error possible for now)
@@ -415,3 +460,20 @@ func (p *TxPool) Status(hash common.Hash) TxStatus {
     }
     return TxStatusUnknown
 }
+
+// Sync is a helper method for unit tests or simulator runs where the chain events
+// are arriving in quick succession, without any time in between them to run the
+// internal background reset operations. This method will run an explicit reset
+// operation to ensure the pool stabilises, thus avoiding flakey behavior.
+//
+// Note, do not use this in production / live code. In live code, the pool is
+// meant to reset on a separate thread to avoid DoS vectors.
+func (p *TxPool) Sync() error {
+    sync := make(chan error)
+    select {
+    case p.sync <- sync:
+        return <-sync
+    case <-p.term:
+        return errors.New("pool already terminated")
+    }
+}
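Usage of the new Sync is intentionally narrow: simulator or test code calls it immediately before asking the miner for a payload, so that anything already handed to the pool has been promoted to executable by the time the payload is assembled. A hypothetical helper sketching that call order follows; the helper and its package name are made up, while TxPool.Sync and Miner.BuildPayload are assumed to have the signatures they carry around this commit.

package catalystutil

import (
	"github.com/ethereum/go-ethereum/core/txpool"
	"github.com/ethereum/go-ethereum/miner"
)

// buildDeterministicPayload synchronizes the transaction pool before building
// a payload, mirroring what forkchoiceUpdated does in simulator mode. Never
// use this on a live node: Sync is meant for tests and simulators only.
func buildDeterministicPayload(pool *txpool.TxPool, m *miner.Miner, args *miner.BuildPayloadArgs) (*miner.Payload, error) {
	// Wait for any in-flight reset and force one more full round, so freshly
	// submitted transactions are executable before payload assembly starts.
	if err := pool.Sync(); err != nil {
		return nil, err // pool already closed; do not build from stale content
	}
	return m.BuildPayload(args)
}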

eth/catalyst/api.go

Lines changed: 19 additions & 6 deletions
@@ -180,7 +180,7 @@ func (api *ConsensusAPI) ForkchoiceUpdatedV1(update engine.ForkchoiceStateV1, pa
             return engine.STATUS_INVALID, engine.InvalidParams.With(errors.New("forkChoiceUpdateV1 called post-shanghai"))
         }
     }
-    return api.forkchoiceUpdated(update, payloadAttributes)
+    return api.forkchoiceUpdated(update, payloadAttributes, false)
 }

 // ForkchoiceUpdatedV2 is equivalent to V1 with the addition of withdrawals in the payload attributes.
@@ -196,7 +196,7 @@ func (api *ConsensusAPI) ForkchoiceUpdatedV2(update engine.ForkchoiceStateV1, pa
             return engine.STATUS_INVALID, engine.UnsupportedFork.With(errors.New("forkchoiceUpdatedV2 must only be called for shanghai payloads"))
         }
     }
-    return api.forkchoiceUpdated(update, params)
+    return api.forkchoiceUpdated(update, params, false)
 }

 // ForkchoiceUpdatedV3 is equivalent to V2 with the addition of parent beacon block root in the payload attributes.
@@ -220,10 +220,10 @@ func (api *ConsensusAPI) ForkchoiceUpdatedV3(update engine.ForkchoiceStateV1, pa
     // hash, even if params are wrong. To do this we need to split up
     // forkchoiceUpdate into a function that only updates the head and then a
     // function that kicks off block construction.
-    return api.forkchoiceUpdated(update, params)
+    return api.forkchoiceUpdated(update, params, false)
 }

-func (api *ConsensusAPI) forkchoiceUpdated(update engine.ForkchoiceStateV1, payloadAttributes *engine.PayloadAttributes) (engine.ForkChoiceResponse, error) {
+func (api *ConsensusAPI) forkchoiceUpdated(update engine.ForkchoiceStateV1, payloadAttributes *engine.PayloadAttributes, simulatorMode bool) (engine.ForkChoiceResponse, error) {
     api.forkchoiceLock.Lock()
     defer api.forkchoiceLock.Unlock()

@@ -330,7 +330,7 @@ func (api *ConsensusAPI) forkchoiceUpdated(update engine.ForkchoiceStateV1, payl
         if merger := api.eth.Merger(); !merger.PoSFinalized() {
             merger.FinalizePoS()
         }
-        // If the finalized block is not in our canonical tree, somethings wrong
+        // If the finalized block is not in our canonical tree, something is wrong
         finalBlock := api.eth.BlockChain().GetBlockByHash(update.FinalizedBlockHash)
         if finalBlock == nil {
             log.Warn("Final block not available in database", "hash", update.FinalizedBlockHash)
@@ -342,7 +342,7 @@ func (api *ConsensusAPI) forkchoiceUpdated(update engine.ForkchoiceStateV1, payl
         // Set the finalized block
         api.eth.BlockChain().SetFinalized(finalBlock.Header())
     }
-    // Check if the safe block hash is in our canonical tree, if not somethings wrong
+    // Check if the safe block hash is in our canonical tree, if not something is wrong
     if update.SafeBlockHash != (common.Hash{}) {
         safeBlock := api.eth.BlockChain().GetBlockByHash(update.SafeBlockHash)
         if safeBlock == nil {
@@ -374,6 +374,19 @@ func (api *ConsensusAPI) forkchoiceUpdated(update engine.ForkchoiceStateV1, payl
         if api.localBlocks.has(id) {
             return valid(&id), nil
         }
+        // If the beacon chain is ran by a simulator, then transaction insertion,
+        // block insertion and block production will happen without any timing
+        // delay between them. This will cause flaky simulator executions due to
+        // the transaction pool running its internal reset operation on a back-
+        // ground thread. To avoid the racey behavior - in simulator mode - the
+        // pool will be explicitly blocked on its reset before continuing to the
+        // block production below.
+        if simulatorMode {
+            if err := api.eth.TxPool().Sync(); err != nil {
+                log.Error("Failed to sync transaction pool", "err", err)
+                return valid(nil), engine.InvalidPayloadAttributes.With(err)
+            }
+        }
         payload, err := api.eth.Miner().BuildPayload(args)
         if err != nil {
             log.Error("Failed to build payload", "err", err)

eth/catalyst/simulated_beacon.go

Lines changed: 2 additions & 2 deletions
@@ -155,12 +155,12 @@ func (c *SimulatedBeacon) sealBlock(withdrawals []*types.Withdrawal, timestamp u

     var random [32]byte
     rand.Read(random[:])
-    fcResponse, err := c.engineAPI.ForkchoiceUpdatedV2(c.curForkchoiceState, &engine.PayloadAttributes{
+    fcResponse, err := c.engineAPI.forkchoiceUpdated(c.curForkchoiceState, &engine.PayloadAttributes{
         Timestamp:             timestamp,
         SuggestedFeeRecipient: feeRecipient,
         Withdrawals:           withdrawals,
         Random:                random,
-    })
+    }, true)
     if err != nil {
         return err
     }
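Because sealBlock now goes through the internal forkchoiceUpdated with simulatorMode set to true, every block produced by the simulated beacon synchronizes the pool first, which is what restores the add-a-tx, mine-a-block, it-is-in-there guarantee from the user's side. A rough sketch of that flow against the ethclient/simulated backend follows; the helper is hypothetical, and Client, Commit, SendTransaction and TransactionReceipt are assumed to match the simulated backend API of this go-ethereum release.

package simtest

import (
	"context"
	"fmt"

	"github.com/ethereum/go-ethereum/core/types"
	"github.com/ethereum/go-ethereum/ethclient/simulated"
)

// sendAndMine submits an already-signed transaction, commits exactly one
// block, and checks that the transaction landed in it. With the racy pool
// reset this lookup could fail intermittently; with the fix, inclusion is
// deterministic.
func sendAndMine(ctx context.Context, backend *simulated.Backend, tx *types.Transaction) error {
	client := backend.Client()
	if err := client.SendTransaction(ctx, tx); err != nil {
		return err
	}
	backend.Commit() // seals one block; the pool is synced before payload building
	receipt, err := client.TransactionReceipt(ctx, tx.Hash())
	if err != nil {
		return fmt.Errorf("tx %s not mined in the committed block: %w", tx.Hash(), err)
	}
	if receipt.Status != types.ReceiptStatusSuccessful {
		return fmt.Errorf("tx %s mined but reverted", tx.Hash())
	}
	return nil
}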
