Skip to content

Commit 6954364

Browse files
committed
Ignore non-fatal farming errors, emit them as events and log as metrics
1 parent 2af3523 commit 6954364

File tree

5 files changed

+227
-90
lines changed

5 files changed

+227
-90
lines changed

crates/subspace-farmer-components/src/proving.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,20 @@ pub enum ProvingError {
6060
RecordReadingError(#[from] ReadingError),
6161
}
6262

63+
impl ProvingError {
64+
/// Whether this error is fatal and makes farm unusable
65+
pub fn is_fatal(&self) -> bool {
66+
match self {
67+
ProvingError::InvalidErasureCodingInstance => true,
68+
ProvingError::FailedToCreatePolynomialForRecord { .. } => false,
69+
ProvingError::FailedToCreateChunkWitness { .. } => false,
70+
ProvingError::FailedToDecodeSectorContentsMap(_) => false,
71+
ProvingError::Io(_) => true,
72+
ProvingError::RecordReadingError(error) => error.is_fatal(),
73+
}
74+
}
75+
}
76+
6377
#[derive(Debug, Clone)]
6478
struct WinningChunk {
6579
/// Chunk offset within s-bucket

crates/subspace-farmer-components/src/reading.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,21 @@ pub enum ReadingError {
7575
ChecksumMismatch,
7676
}
7777

78+
impl ReadingError {
79+
/// Whether this error is fatal and renders farm unusable
80+
pub fn is_fatal(&self) -> bool {
81+
match self {
82+
ReadingError::FailedToReadChunk { .. } => false,
83+
ReadingError::InvalidChunk { .. } => false,
84+
ReadingError::FailedToErasureDecodeRecord { .. } => false,
85+
ReadingError::WrongRecordSizeAfterDecoding { .. } => false,
86+
ReadingError::FailedToDecodeSectorContentsMap(_) => false,
87+
ReadingError::Io(_) => true,
88+
ReadingError::ChecksumMismatch => false,
89+
}
90+
}
91+
}
92+
7893
/// Record contained in the plot
7994
#[derive(Debug, Clone)]
8095
pub struct PlotRecord {

crates/subspace-farmer/src/bin/subspace-farmer/commands/farm.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,9 @@ where
761761
proving_details.result,
762762
);
763763
}
764+
FarmingNotification::NonFatalError(error) => {
765+
farmer_metrics.note_farming_error(&single_disk_farm_id, error);
766+
}
764767
}
765768
}))
766769
.detach();

crates/subspace-farmer/src/bin/subspace-farmer/commands/farm/metrics.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@ use prometheus_client::registry::{Registry, Unit};
55
use std::sync::atomic::AtomicU64;
66
use std::time::Duration;
77
use subspace_farmer::single_disk_farm::farming::ProvingResult;
8-
use subspace_farmer::single_disk_farm::SingleDiskFarmId;
8+
use subspace_farmer::single_disk_farm::{FarmingError, SingleDiskFarmId};
99

1010
#[derive(Debug, Clone)]
1111
pub(super) struct FarmerMetrics {
1212
auditing_time: Family<Vec<(String, String)>, Histogram>,
1313
proving_time: Family<Vec<(String, String)>, Histogram>,
14+
farming_errors: Family<Vec<(String, String)>, Counter<u64, AtomicU64>>,
1415
sector_downloading_time: Family<Vec<(String, String)>, Histogram>,
1516
sector_encoding_time: Family<Vec<(String, String)>, Histogram>,
1617
sector_writing_time: Family<Vec<(String, String)>, Histogram>,
@@ -51,6 +52,14 @@ impl FarmerMetrics {
5152
proving_time.clone(),
5253
);
5354

55+
let farming_errors = Family::<_, _>::new_with_constructor(Counter::<_, _>::default);
56+
57+
sub_registry.register(
58+
"farming_errors",
59+
"Non-fatal farming errors",
60+
farming_errors.clone(),
61+
);
62+
5463
let sector_downloading_time = Family::<_, _>::new_with_constructor(|| {
5564
Histogram::new(exponential_buckets(0.0001, 2.0, 15))
5665
});
@@ -170,6 +179,7 @@ impl FarmerMetrics {
170179
Self {
171180
auditing_time,
172181
proving_time,
182+
farming_errors,
173183
sector_downloading_time,
174184
sector_encoding_time,
175185
sector_writing_time,
@@ -212,6 +222,19 @@ impl FarmerMetrics {
212222
.observe(time.as_secs_f64());
213223
}
214224

225+
pub(super) fn note_farming_error(
226+
&self,
227+
single_disk_farm_id: &SingleDiskFarmId,
228+
error: &FarmingError,
229+
) {
230+
self.farming_errors
231+
.get_or_create(&vec![
232+
("farm_id".to_string(), single_disk_farm_id.to_string()),
233+
("error".to_string(), error.str_variant().to_string()),
234+
])
235+
.inc();
236+
}
237+
215238
pub(super) fn observe_sector_downloading_time(
216239
&self,
217240
single_disk_farm_id: &SingleDiskFarmId,

0 commit comments

Comments
 (0)