Skip to content

Commit 5aaf813

Browse files
authored
Merge pull request #4345 from stacks-network/hotfix/stackerdb-stale-view
Hotfix/stackerdb stale view
2 parents c659fca + e97b244 commit 5aaf813

File tree

9 files changed

+299
-34
lines changed

9 files changed

+299
-34
lines changed

stackslib/src/chainstate/burn/db/sortdb.rs

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3883,34 +3883,21 @@ impl SortitionDB {
38833883
.unwrap_or(&burnchain.first_block_hash)
38843884
.clone();
38853885

3886-
let rc = burnchain
3887-
.block_height_to_reward_cycle(chain_tip.block_height)
3888-
.expect("FATAL: block height does not have a reward cycle");
3889-
3890-
let rc_height = burnchain.reward_cycle_to_block_height(rc);
3891-
let rc_consensus_hash = SortitionDB::get_ancestor_snapshot(
3892-
conn,
3893-
cmp::min(chain_tip.block_height, rc_height),
3894-
&chain_tip.sortition_id,
3895-
)?
3896-
.map(|sn| sn.consensus_hash)
3897-
.ok_or(db_error::NotFoundError)?;
3898-
38993886
test_debug!(
39003887
"Chain view: {},{}-{},{},{}",
39013888
chain_tip.block_height,
39023889
chain_tip.burn_header_hash,
39033890
stable_block_height,
39043891
&burn_stable_block_hash,
3905-
&rc_consensus_hash,
3892+
&chain_tip.canonical_stacks_tip_consensus_hash,
39063893
);
39073894
Ok(BurnchainView {
39083895
burn_block_height: chain_tip.block_height,
39093896
burn_block_hash: chain_tip.burn_header_hash,
39103897
burn_stable_block_height: stable_block_height,
39113898
burn_stable_block_hash: burn_stable_block_hash,
39123899
last_burn_block_hashes: last_burn_block_hashes,
3913-
rc_consensus_hash,
3900+
rc_consensus_hash: chain_tip.canonical_stacks_tip_consensus_hash,
39143901
})
39153902
}
39163903
}
@@ -4099,6 +4086,21 @@ impl SortitionDB {
40994086
Ok((consensus_hash, stacks_block_hash))
41004087
}
41014088

4089+
#[cfg(test)]
4090+
pub fn set_canonical_stacks_chain_tip(
4091+
conn: &Connection,
4092+
ch: &ConsensusHash,
4093+
bhh: &BlockHeaderHash,
4094+
height: u64,
4095+
) -> Result<(), db_error> {
4096+
let tip = SortitionDB::get_canonical_burn_chain_tip(conn)?;
4097+
let args: &[&dyn ToSql] = &[ch, bhh, &u64_to_sql(height)?, &tip.sortition_id];
4098+
conn.execute("UPDATE snapshots SET canonical_stacks_tip_consensus_hash = ?1, canonical_stacks_tip_hash = ?2, canonical_stacks_tip_height = ?3
4099+
WHERE sortition_id = ?4", args)
4100+
.map_err(db_error::SqliteError)?;
4101+
Ok(())
4102+
}
4103+
41024104
/// Get the maximum arrival index for any known snapshot.
41034105
fn get_max_arrival_index(conn: &Connection) -> Result<u64, db_error> {
41044106
match conn

stackslib/src/net/api/poststackerdbchunk.rs

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,20 +116,23 @@ impl HttpRequest for RPCPostStackerDBChunkRequestHandler {
116116
pub enum StackerDBErrorCodes {
117117
DataAlreadyExists,
118118
NoSuchSlot,
119+
BadSigner,
119120
}
120121

121122
impl StackerDBErrorCodes {
122123
pub fn code(&self) -> u32 {
123124
match self {
124125
Self::DataAlreadyExists => 0,
125126
Self::NoSuchSlot => 1,
127+
Self::BadSigner => 2,
126128
}
127129
}
128130

129131
pub fn reason(&self) -> &'static str {
130132
match self {
131133
Self::DataAlreadyExists => "Data for this slot and version already exist",
132134
Self::NoSuchSlot => "No such StackerDB slot",
135+
Self::BadSigner => "Signature does not match slot signer",
133136
}
134137
}
135138

@@ -183,11 +186,18 @@ impl RPCRequestHandler for RPCPostStackerDBChunkRequestHandler {
183186
&HttpNotFound::new("StackerDB not found".to_string()),
184187
));
185188
}
186-
if let Err(_e) = tx.try_replace_chunk(
189+
if let Err(e) = tx.try_replace_chunk(
187190
&contract_identifier,
188191
&stackerdb_chunk.get_slot_metadata(),
189192
&stackerdb_chunk.data,
190193
) {
194+
test_debug!(
195+
"Failed to replace chunk {}.{} in {}: {:?}",
196+
stackerdb_chunk.slot_id,
197+
stackerdb_chunk.slot_version,
198+
&contract_identifier,
199+
&e
200+
);
191201
let slot_metadata_opt =
192202
match tx.get_slot_metadata(&contract_identifier, stackerdb_chunk.slot_id) {
193203
Ok(slot_opt) => slot_opt,
@@ -209,11 +219,15 @@ impl RPCRequestHandler for RPCPostStackerDBChunkRequestHandler {
209219

210220
let (reason, slot_metadata_opt) = if let Some(slot_metadata) = slot_metadata_opt
211221
{
222+
let code = if let NetError::BadSlotSigner(..) = e {
223+
StackerDBErrorCodes::BadSigner
224+
} else {
225+
StackerDBErrorCodes::DataAlreadyExists
226+
};
227+
212228
(
213-
serde_json::to_string(
214-
&StackerDBErrorCodes::DataAlreadyExists.into_json(),
215-
)
216-
.unwrap_or("(unable to encode JSON)".to_string()),
229+
serde_json::to_string(&code.into_json())
230+
.unwrap_or("(unable to encode JSON)".to_string()),
217231
Some(slot_metadata),
218232
)
219233
} else {

stackslib/src/net/chat.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1339,8 +1339,8 @@ impl ConversationP2P {
13391339
self.update_from_stacker_db_handshake_data(stackerdb_accept);
13401340
} else {
13411341
// remote peer's burnchain view has diverged, so assume no longer replicating (we
1342-
// can't talk to it anyway). This can happen once per reward cycle for a few
1343-
// minutes as nodes begin the next reward cycle, but it's harmless -- at worst, it
1342+
// can't talk to it anyway). This can happen once per burnchain block for a few
1343+
// seconds as nodes begin processing the next Stacks blocks, but it's harmless -- at worst, it
13441344
// just means that no stacker DB replication happens between this peer and
13451345
// localhost during this time.
13461346
self.clear_stacker_db_handshake_data();
@@ -1779,13 +1779,16 @@ impl ConversationP2P {
17791779
let local_peer = network.get_local_peer();
17801780
let burnchain_view = network.get_chain_view();
17811781

1782+
// remote peer's Stacks chain tip is different from ours, meaning it might have a different
1783+
// stackerdb configuration view (and we won't be able to authenticate their chunks, and
1784+
// vice versa)
17821785
if burnchain_view.rc_consensus_hash != getchunkinv.rc_consensus_hash {
17831786
debug!(
17841787
"{:?}: NACK StackerDBGetChunkInv; {} != {}",
17851788
local_peer, &burnchain_view.rc_consensus_hash, &getchunkinv.rc_consensus_hash
17861789
);
17871790
return Ok(StacksMessageType::Nack(NackData::new(
1788-
NackErrorCodes::InvalidPoxFork,
1791+
NackErrorCodes::StaleView,
17891792
)));
17901793
}
17911794

@@ -1827,7 +1830,7 @@ impl ConversationP2P {
18271830
local_peer, &burnchain_view.rc_consensus_hash, &getchunk.rc_consensus_hash
18281831
);
18291832
return Ok(StacksMessageType::Nack(NackData::new(
1830-
NackErrorCodes::InvalidPoxFork,
1833+
NackErrorCodes::StaleView,
18311834
)));
18321835
}
18331836

stackslib/src/net/mod.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -975,6 +975,7 @@ pub mod NackErrorCodes {
975975
pub const InvalidMessage: u32 = 5;
976976
pub const NoSuchDB: u32 = 6;
977977
pub const StaleVersion: u32 = 7;
978+
pub const StaleView: u32 = 8;
978979
}
979980

980981
#[derive(Debug, Clone, PartialEq)]
@@ -997,7 +998,9 @@ pub struct NatPunchData {
997998
/// Inform the remote peer of (a page of) the list of stacker DB contracts this node supports
998999
#[derive(Debug, Clone, PartialEq)]
9991000
pub struct StackerDBHandshakeData {
1000-
/// current reward cycle ID
1001+
/// current reward cycle consensus hash (i.e. the consensus hash of the Stacks tip in the
1002+
/// current reward cycle, which commits to both the Stacks block tip and the underlying PoX
1003+
/// history).
10011004
pub rc_consensus_hash: ConsensusHash,
10021005
/// list of smart contracts that we index.
10031006
/// there can be as many as 256 entries.
@@ -1009,7 +1012,7 @@ pub struct StackerDBHandshakeData {
10091012
pub struct StackerDBGetChunkInvData {
10101013
/// smart contract being used to determine chunk quantity and order
10111014
pub contract_id: QualifiedContractIdentifier,
1012-
/// consensus hash of the sortition that started this reward cycle
1015+
/// consensus hash of the Stacks chain tip in this reward cycle
10131016
pub rc_consensus_hash: ConsensusHash,
10141017
}
10151018

@@ -1028,7 +1031,7 @@ pub struct StackerDBChunkInvData {
10281031
pub struct StackerDBGetChunkData {
10291032
/// smart contract being used to determine slot quantity and order
10301033
pub contract_id: QualifiedContractIdentifier,
1031-
/// consensus hash of the sortition that started this reward cycle
1034+
/// consensus hash of the Stacks chain tip in this reward cycle
10321035
pub rc_consensus_hash: ConsensusHash,
10331036
/// slot ID
10341037
pub slot_id: u32,
@@ -1041,7 +1044,7 @@ pub struct StackerDBGetChunkData {
10411044
pub struct StackerDBPushChunkData {
10421045
/// smart contract being used to determine chunk quantity and order
10431046
pub contract_id: QualifiedContractIdentifier,
1044-
/// consensus hash of the sortition that started this reward cycle
1047+
/// consensus hash of the Stacks chain tip in this reward cycle
10451048
pub rc_consensus_hash: ConsensusHash,
10461049
/// the pushed chunk
10471050
pub chunk_data: StackerDBChunkData,

stackslib/src/net/p2p.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5224,7 +5224,12 @@ impl PeerNetwork {
52245224
// update burnchain snapshot if we need to (careful -- it's expensive)
52255225
let sn = SortitionDB::get_canonical_burn_chain_tip(&sortdb.conn())?;
52265226
let mut ret: HashMap<NeighborKey, Vec<StacksMessage>> = HashMap::new();
5227-
if sn.block_height != self.chain_view.burn_block_height {
5227+
let mut need_stackerdb_refresh = sn.canonical_stacks_tip_consensus_hash
5228+
!= self.burnchain_tip.canonical_stacks_tip_consensus_hash;
5229+
5230+
if sn.block_height != self.chain_view.burn_block_height
5231+
|| self.num_state_machine_passes == 0
5232+
{
52285233
debug!(
52295234
"{:?}: load chain view for burn block {}",
52305235
&self.local_peer, sn.block_height
@@ -5303,7 +5308,17 @@ impl PeerNetwork {
53035308
.get_last_selected_anchor_block_txid()?
53045309
.unwrap_or(Txid([0x00; 32]));
53055310

5306-
// refresh stackerdb configs
5311+
test_debug!(
5312+
"{:?}: chain view is {:?}",
5313+
&self.get_local_peer(),
5314+
&self.chain_view
5315+
);
5316+
need_stackerdb_refresh = true;
5317+
}
5318+
5319+
if need_stackerdb_refresh {
5320+
// refresh stackerdb configs -- canonical stacks tip has changed
5321+
debug!("{:?}: Refresh all stackerdbs", &self.get_local_peer());
53075322
let mut new_stackerdb_configs = HashMap::new();
53085323
let stacker_db_configs = mem::replace(&mut self.stacker_db_configs, HashMap::new());
53095324
for (stackerdb_contract_id, stackerdb_config) in stacker_db_configs.into_iter() {

stackslib/src/net/stackerdb/mod.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ pub struct StackerDBSyncResult {
151151
dead: HashSet<NeighborKey>,
152152
/// neighbors that misbehaved while syncing
153153
broken: HashSet<NeighborKey>,
154+
/// neighbors that have stale views, but are otherwise online
155+
pub(crate) stale: HashSet<NeighborAddress>,
154156
}
155157

156158
/// Settings for the Stacker DB
@@ -262,6 +264,8 @@ pub struct StackerDBSync<NC: NeighborComms> {
262264
/// whether or not we should immediately re-fetch chunks because we learned about new chunks
263265
/// from our peers when they replied to our chunk-pushes with new inventory state
264266
need_resync: bool,
267+
/// Track stale neighbors
268+
pub(crate) stale_neighbors: HashSet<NeighborAddress>,
265269
}
266270

267271
impl StackerDBSyncResult {
@@ -274,6 +278,7 @@ impl StackerDBSyncResult {
274278
chunks_to_store: vec![chunk.chunk_data],
275279
dead: HashSet::new(),
276280
broken: HashSet::new(),
281+
stale: HashSet::new(),
277282
}
278283
}
279284
}

stackslib/src/net/stackerdb/sync.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ use crate::net::stackerdb::{
3333
StackerDBConfig, StackerDBSync, StackerDBSyncResult, StackerDBSyncState, StackerDBs,
3434
};
3535
use crate::net::{
36-
Error as net_error, NackData, Neighbor, NeighborAddress, NeighborKey, StackerDBChunkData,
37-
StackerDBChunkInvData, StackerDBGetChunkData, StackerDBGetChunkInvData, StackerDBPushChunkData,
38-
StacksMessageType,
36+
Error as net_error, NackData, NackErrorCodes, Neighbor, NeighborAddress, NeighborKey,
37+
StackerDBChunkData, StackerDBChunkInvData, StackerDBGetChunkData, StackerDBGetChunkInvData,
38+
StackerDBPushChunkData, StacksMessageType,
3939
};
4040

4141
const MAX_CHUNKS_IN_FLIGHT: usize = 6;
@@ -72,6 +72,7 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
7272
total_pushed: 0,
7373
last_run_ts: 0,
7474
need_resync: false,
75+
stale_neighbors: HashSet::new(),
7576
};
7677
dbsync.reset(None, config);
7778
dbsync
@@ -178,6 +179,7 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
178179
chunks_to_store: chunks,
179180
dead: self.comms.take_dead_neighbors(),
180181
broken: self.comms.take_broken_neighbors(),
182+
stale: std::mem::replace(&mut self.stale_neighbors, HashSet::new()),
181183
};
182184

183185
// keep all connected replicas, and replenish from config hints and the DB as needed
@@ -677,6 +679,7 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
677679
&network.get_chain_view().rc_consensus_hash,
678680
&db_data.rc_consensus_hash
679681
);
682+
self.connected_replicas.remove(&naddr);
680683
continue;
681684
}
682685
db_data
@@ -688,6 +691,10 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
688691
&naddr,
689692
data.error_code
690693
);
694+
self.connected_replicas.remove(&naddr);
695+
if data.error_code == NackErrorCodes::StaleView {
696+
self.stale_neighbors.insert(naddr);
697+
}
691698
continue;
692699
}
693700
x => {
@@ -800,10 +807,15 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
800807
&naddr,
801808
data.error_code
802809
);
810+
self.connected_replicas.remove(&naddr);
811+
if data.error_code == NackErrorCodes::StaleView {
812+
self.stale_neighbors.insert(naddr);
813+
}
803814
continue;
804815
}
805816
x => {
806817
info!("Received unexpected message {:?}", &x);
818+
self.connected_replicas.remove(&naddr);
807819
continue;
808820
}
809821
};
@@ -929,10 +941,14 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
929941
data.error_code
930942
);
931943
self.connected_replicas.remove(&naddr);
944+
if data.error_code == NackErrorCodes::StaleView {
945+
self.stale_neighbors.insert(naddr);
946+
}
932947
continue;
933948
}
934949
x => {
935950
info!("Received unexpected message {:?}", &x);
951+
self.connected_replicas.remove(&naddr);
936952
continue;
937953
}
938954
};
@@ -1072,6 +1088,9 @@ impl<NC: NeighborComms> StackerDBSync<NC> {
10721088
data.error_code
10731089
);
10741090
self.connected_replicas.remove(&naddr);
1091+
if data.error_code == NackErrorCodes::StaleView {
1092+
self.stale_neighbors.insert(naddr);
1093+
}
10751094
continue;
10761095
}
10771096
x => {

0 commit comments

Comments
 (0)