
Commit ba7b5d9

chore(l1): improve healing logs (#5138)
**Motivation**

The healing logs did not follow the structured log format and emitted inaccurate errors that were not relevant to the user. This PR aims to improve them.

**Description**

- Change healing logs to be structured
- Lower the level of healing logs to debug or trace where appropriate

Works to improve #4950
1 parent a40bc41 commit ba7b5d9
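The core of the change is moving from interpolated log strings to `tracing`'s structured key-value fields, while demoting the events from `info` to `debug`/`trace`. A minimal before/after sketch (field names are simplified from the healing loop; the subscriber setup assumes the `tracing-subscriber` crate and is only here so the example runs):

```rust
use tracing::{debug, info};

fn main() {
    // Print everything down to DEBUG so both events below are visible.
    tracing_subscriber::fmt()
        .with_max_level(tracing::Level::DEBUG)
        .init();

    let (num_peers, inflight_tasks, is_stale) = (5usize, 12usize, false);

    // Before: one pre-rendered, interpolated message at info level.
    info!(
        "State Healing {}, snap peers available {num_peers}, inflight_tasks: {inflight_tasks}",
        if is_stale { "stopping" } else { "in progress" }
    );

    // After: named fields at debug level; structured subscribers (JSON, etc.)
    // receive each value separately instead of one opaque string.
    debug!(
        status = if is_stale { "stopping" } else { "in progress" },
        snap_peers = num_peers,
        inflight_tasks,
        "State Healing"
    );
}
```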

File tree

2 files changed (+95, -61 lines)


crates/networking/p2p/sync/state_healing.rs

Lines changed: 25 additions & 21 deletions
@@ -19,7 +19,7 @@ use ethrex_common::{H256, constants::EMPTY_KECCACK_HASH, types::AccountState};
 use ethrex_rlp::{decode::RLPDecode, encode::RLPEncode};
 use ethrex_storage::Store;
 use ethrex_trie::{EMPTY_TRIE_HASH, Nibbles, Node, TrieDB, TrieError};
-use tracing::{debug, error, info};
+use tracing::debug;
 
 use crate::{
     metrics::{CurrentStepValue, METRICS},
@@ -56,7 +56,7 @@ pub async fn heal_state_trie_wrap(
 ) -> Result<bool, SyncError> {
     let mut healing_done = false;
     METRICS.current_step.set(CurrentStepValue::HealingState);
-    info!("Starting state healing");
+    debug!("Starting state healing");
     while !healing_done {
         healing_done = heal_state_trie(
             state_root,
@@ -70,11 +70,11 @@ pub async fn heal_state_trie_wrap(
         )
         .await?;
         if current_unix_time() > staleness_timestamp {
-            info!("Stopped state healing due to staleness");
+            debug!("Stopped state healing due to staleness");
             break;
         }
     }
-    info!("Stopped state healing");
+    debug!("Stopped state healing");
     Ok(healing_done)
 }
 
@@ -137,11 +137,17 @@ async fn heal_state_trie(
             .healing_empty_try_recv
             .store(empty_try_recv, Ordering::Relaxed);
         debug!(
-            "State Healing {}, snap peers available {num_peers}, inflight_tasks: {inflight_tasks}, Maximum depth reached on loop {longest_path_seen}, leafs healed {leafs_healed}, global leafs healed {}, Download success rate {downloads_rate}, Paths to go {}, Membatch size {}, Processing per cycle {heals_per_cycle}",
-            if is_stale { "stopping" } else { "in progress" },
+            status = if is_stale { "stopping" } else { "in progress" },
+            snap_peers = num_peers,
+            inflight_tasks,
+            longest_path_seen,
+            leafs_healed,
             global_leafs_healed,
-            paths.len(),
-            membatch.len()
+            downloads_rate,
+            paths_to_go = paths.len(),
+            pending_nodes = membatch.len(),
+            heals_per_cycle,
+            "State Healing",
         );
         downloads_success = 0;
         downloads_fail = 0;
@@ -220,7 +226,7 @@ async fn heal_state_trie(
             .get_best_peer(&SUPPORTED_SNAP_CAPABILITIES)
             .await
             .inspect_err(
-                |err| error!(err= ?err, "Error requesting a peer to perform state healing"),
+                |err| debug!(err=?err, "Error requesting a peer to perform state healing"),
             )
             .unwrap_or(None)
         else {
@@ -244,11 +250,9 @@ async fn heal_state_trie(
             )
             .await;
             // TODO: add error handling
-            tx.send((peer_id, response, batch))
-                .await
-                .inspect_err(|err| {
-                    error!("Failed to send state trie nodes response. Error: {err}")
-                })
+            tx.send((peer_id, response, batch)).await.inspect_err(
+                |err| debug!(error=?err, "Failed to send state trie nodes response"),
+            )
         });
         tokio::task::yield_now().await;
     }
@@ -265,7 +269,7 @@ async fn heal_state_trie(
             &mut nodes_to_write,
         )
         .inspect_err(|err| {
-            error!("We have found a sync error while trying to write to DB a batch: {err}")
+            debug!(error=?err, "We have found a sync error while trying to write to DB a batch")
         })?;
         paths.extend(return_paths);
     }
@@ -303,24 +307,24 @@ async fn heal_state_trie(
 
         // End loop if we have no more paths to fetch nor nodes to heal and no inflight tasks
         if is_done {
-            info!("Nothing more to heal found");
+            debug!("Nothing more to heal found");
             db_joinset.join_all().await;
             break;
         }
 
         // We check with a clock if we are stale
         if !is_stale && current_unix_time() > staleness_timestamp {
-            info!("state healing is stale");
+            debug!("state healing is stale");
             is_stale = true;
         }
 
         if is_stale && nodes_to_heal.is_empty() && inflight_tasks == 0 {
-            info!("Finisehd inflight tasks");
+            debug!("Finisehd inflight tasks");
             db_joinset.join_all().await;
             break;
         }
     }
-    info!("State Healing stopped, signaling storage healer");
+    debug!("State Healing stopped, signaling storage healer");
     // Save paths for the next cycle. If there are no paths left, clear it in case pivot becomes stale during storage
     // Send empty batch to signal that no more batches are incoming
     // bytecode_sender.send(vec![]).await?;
@@ -412,7 +416,7 @@ pub fn node_missing_children(
         let validity = child
             .get_node(trie_state, child_path.clone())
             .inspect_err(|_| {
-                error!("Malformed data when doing get child of a branch node")
+                debug!("Malformed data when doing get child of a branch node")
             })?
             .is_some();
         if validity {
@@ -435,7 +439,7 @@ pub fn node_missing_children(
        let validity = node
            .child
            .get_node(trie_state, child_path.clone())
-            .inspect_err(|_| error!("Malformed data when doing get child of a branch node"))?
+            .inspect_err(|_| debug!("Malformed data when doing get child of a branch node"))?
            .is_some();
        if validity {
            return Ok((0, vec![]));
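Because these events now sit at debug/trace, a default info-level subscriber filters them out. A hedged sketch of how a consumer could opt back in using `tracing-subscriber`'s `EnvFilter` (requires the crate's `env-filter` feature; the target name `ethrex_p2p::sync` is illustrative, not necessarily the real module path):

```rust
use tracing_subscriber::EnvFilter;

fn main() {
    // Respect RUST_LOG when set; otherwise default to info globally while
    // raising the sync module to debug so healing progress is visible.
    // `ethrex_p2p::sync` below is a placeholder target, not a confirmed path.
    let filter = EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| EnvFilter::new("info,ethrex_p2p::sync=debug"));

    tracing_subscriber::fmt().with_env_filter(filter).init();

    // Example event at the demoted level; with the filter above it is emitted.
    tracing::debug!(target: "ethrex_p2p::sync", "Starting state healing");
}
```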

crates/networking/p2p/sync/storage_healing.rs

Lines changed: 70 additions & 40 deletions
@@ -29,7 +29,7 @@ use tokio::{
     sync::mpsc::{Sender, error::TrySendError},
     task::yield_now,
 };
-use tracing::{debug, error, info, trace};
+use tracing::{debug, trace};
 
 const MAX_IN_FLIGHT_REQUESTS: u32 = 77;
 
@@ -128,9 +128,9 @@ pub async fn heal_storage_trie(
 ) -> Result<bool, SyncError> {
     METRICS.current_step.set(CurrentStepValue::HealingStorage);
     let download_queue = get_initial_downloads(&store, state_root, storage_accounts);
-    info!(
-        "Started Storage Healing with {} accounts",
-        download_queue.len()
+    debug!(
+        initial_accounts_count = download_queue.len(),
+        "Started Storage Healing",
     );
     let mut state = StorageHealer {
         last_update: Instant::now(),
@@ -174,24 +174,27 @@ pub async fn heal_storage_trie(
             .healing_empty_try_recv
             .store(state.empty_count as u64, Ordering::Relaxed);
         state.last_update = Instant::now();
+        let snap_peer_count = peers
+            .peer_table
+            .peer_count_by_capabilities(&SUPPORTED_SNAP_CAPABILITIES)
+            .await
+            .unwrap_or(0);
         debug!(
-            "We are storage healing. Snap Peers {}. Inflight tasks {}. Download Queue {}. Maximum length {}. Leafs Healed {}. Global Leafs Healed {global_leafs_healed}. Roots Healed {}. Good Downloads {}. Good Download Percentage {}. Empty count {}. Disconnected Count {}.",
-            peers
-                .peer_table
-                .peer_count_by_capabilities(&SUPPORTED_SNAP_CAPABILITIES)
-                .await
-                .unwrap_or(0),
-            state.requests.len(),
-            state.download_queue.len(),
-            state.maximum_length_seen,
-            state.leafs_healed,
-            state.roots_healed,
-            state.succesful_downloads,
-            state.succesful_downloads as f64
+            snap_peer_count,
+            inflight_requests = state.requests.len(),
+            download_queue_len = state.download_queue.len(),
+            maximum_depth = state.maximum_length_seen,
+            leaves_healed = state.leafs_healed,
+            global_leaves_healed = global_leafs_healed,
+            roots_healed = state.roots_healed,
+            succesful_downloads = state.succesful_downloads,
+            succesful_downloads_percentage = state.succesful_downloads as f64
                 / (state.succesful_downloads as f64 + state.failed_downloads as f64),
-            state.empty_count,
-            state.disconnected_count,
+            empty_count = state.empty_count,
+            disconnected_count = state.disconnected_count,
+            "We are storage healing",
         );
+        //. Snap Peers {}. Inflight tasks {}. Download Queue {}. Maximum length {}. Leafs Healed {}. Global Leafs Healed {}. Roots Healed {}. Good Downloads {}. Good Download Percentage {}. Empty count {}. Disconnected Count {}.
         state.succesful_downloads = 0;
         state.failed_downloads = 0;
         state.empty_count = 0;
@@ -321,7 +324,7 @@ async fn ask_peers_for_nodes(
         .get_best_peer(&SUPPORTED_SNAP_CAPABILITIES)
         .await
         .inspect_err(
-            |err| error!(err= ?err, "Error requesting a peer to perform storage healing"),
+            |err| debug!(err= ?err, "Error requesting a peer to perform storage healing"),
         )
         .unwrap_or(None)
     else {
@@ -354,13 +357,12 @@ async fn ask_peers_for_nodes(
 
         requests_task_joinset.spawn(async move {
             let req_id = gtn.id;
-            // TODO: check errors to determine whether the current block is stale
             let response =
                 PeerHandler::request_storage_trienodes(peer_id, connection, peer_table, gtn).await;
             // TODO: add error handling
-            tx.try_send(response).inspect_err(|err| {
-                error!("Failed to send state trie nodes response. Error: {err}")
-            })?;
+            tx.try_send(response).inspect_err(
+                |err| debug!(error=?err, "Failed to send state trie nodes response"),
+            )?;
             Ok(req_id)
         });
     }
@@ -408,11 +410,14 @@ async fn zip_requeue_node_responses_score_peer(
     failed_downloads: &mut usize,
 ) -> Result<Option<Vec<NodeResponse>>, SyncError> {
     trace!(
-        "We are processing the nodes, we received {} nodes from our peer",
-        trie_nodes.nodes.len()
+        trie_response_len=?trie_nodes.nodes.len(),
+        "We are processing the nodes",
     );
     let Some(request) = requests.remove(&trie_nodes.id) else {
-        info!("No matching request found for received response {trie_nodes:?}");
+        debug!(
+            ?trie_nodes,
+            "No matching request found for received response"
+        );
         return Ok(None);
     };
 
@@ -437,17 +442,28 @@ async fn zip_requeue_node_responses_score_peer(
         .iter()
         .zip(trie_nodes.nodes.clone())
         .map(|(node_request, node_bytes)| {
-            let node = Node::decode(&node_bytes).inspect_err(|err|{
-                info!("this peer {} request {node_request:?}, had this error {err:?}, and the raw node was {node_bytes:?}", request.peer_id)
-            })?;
+            let node = Node::decode(&node_bytes).inspect_err(|err| {
+                trace!(
+                    peer=?request.peer_id,
+                    ?node_request,
+                    error=?err,
+                    ?node_bytes,
+                    "Decode Failed"
+                )
+            })?;
 
             if node.compute_hash().finalize() != node_request.hash {
-                error!("this peer {} request {node_request:?}, sent us a valid node with the wrong hash, and the raw node was {node_bytes:?}", request.peer_id);
+                trace!(
+                    peer=?request.peer_id,
+                    ?node_request,
+                    ?node_bytes,
+                    "Node Hash failed"
+                );
                 Err(RLPDecodeError::MalformedData)
             } else {
                 Ok(NodeResponse {
                     node_request: node_request.clone(),
-                    node
+                    node,
                 })
             }
         })
@@ -457,11 +473,17 @@ async fn zip_requeue_node_responses_score_peer(
             download_queue.extend(request.requests.into_iter().skip(nodes_size));
         }
         *succesful_downloads += 1;
-        peer_handler.peer_table.record_success(&request.peer_id).await?;
+        peer_handler
+            .peer_table
+            .record_success(&request.peer_id)
+            .await?;
         Ok(Some(nodes))
     } else {
         *failed_downloads += 1;
-        peer_handler.peer_table.record_failure(&request.peer_id).await?;
+        peer_handler
+            .peer_table
+            .record_failure(&request.peer_id)
+            .await?;
         download_queue.extend(request.requests);
         Ok(None)
     }
@@ -480,7 +502,7 @@ fn process_node_responses(
     to_write: &mut HashMap<H256, Vec<(Nibbles, Node)>>,
 ) -> Result<(), StoreError> {
     while let Some(node_response) = node_processing_queue.pop() {
-        trace!("We are processing node response {:?}", node_response);
+        trace!(?node_response, "We are processing node response");
         if let Node::Leaf(_) = &node_response.node {
             *leafs_healed += 1;
             *global_leafs_healed += 1;
@@ -493,13 +515,21 @@ fn process_node_responses(
 
         let (missing_children_nibbles, missing_children_count) =
             determine_missing_children(&node_response, store).inspect_err(|err| {
-                error!("{err} in determine missing children while searching {node_response:?}")
+                debug!(
+                    error=?err,
+                    ?node_response,
+                    "Error in determine_missing_children"
+                )
            })?;

        if missing_children_count == 0 {
            // We flush to the database this node
            commit_node(&node_response, membatch, roots_healed, to_write).inspect_err(|err| {
-                error!("{err} in commit node while committing {node_response:?}")
+                debug!(
+                    error=?err,
+                    ?node_response,
+                    "Error in commit_node"
+                )
            })?;
        } else {
            let key = (
@@ -572,7 +602,7 @@ pub fn determine_missing_children(
         *EMPTY_TRIE_HASH,
     )
     .inspect_err(|_| {
-        error!("Malformed data when opening the storage trie in determine missing children")
+        debug!("Malformed data when opening the storage trie in determine missing children")
     })?;
     let trie_state = trie.db();
 
@@ -589,7 +619,7 @@ pub fn determine_missing_children(
         let validity = child
             .get_node(trie_state, child_path.clone())
             .inspect_err(|_| {
-                error!("Malformed data when doing get child of a branch node")
+                debug!("Malformed data when doing get child of a branch node")
             })?
             .is_some();
 
@@ -614,7 +644,7 @@ pub fn determine_missing_children(
        let validity = node
            .child
            .get_node(trie_state, child_path.clone())
-            .inspect_err(|_| error!("Malformed data when doing get child of a branch node"))?
+            .inspect_err(|_| debug!("Malformed data when doing get child of a branch node"))?
            .is_some();

        if validity {
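A pattern repeated throughout both files: `Result::inspect_err` (stable since Rust 1.76) attaches a log event to the error path without consuming or remapping the error, so the existing `?` propagation stays unchanged. A small self-contained sketch of the same idea, using a string parse in place of the node-decoding calls above:

```rust
use tracing::debug;

fn parse_len(raw: &str) -> Result<usize, std::num::ParseIntError> {
    raw.trim()
        .parse::<usize>()
        // Only observe the error: the Result is returned as-is, so callers
        // still decide how to handle it (retry, requeue, propagate with `?`).
        .inspect_err(|err| debug!(error = ?err, raw, "Failed to parse length"))
}

fn main() {
    tracing_subscriber::fmt()
        .with_max_level(tracing::Level::DEBUG)
        .init();
    let _ = parse_len("not-a-number");
}
```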
