Skip to content

Commit 912aa3b

Browse files
authored
Merge pull request #2467 from subspace/graceful-farmer-error-handling
Graceful farmer error handling
2 parents f1b7358 + 6954364 commit 912aa3b

File tree

12 files changed

+343
-161
lines changed

12 files changed

+343
-161
lines changed

crates/pallet-subspace/src/mock.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,8 @@ pub fn create_signed_vote(
480480
vote_solution_range,
481481
&plotted_sector_bytes,
482482
&plotted_sector.sector_metadata,
483-
);
483+
)
484+
.unwrap();
484485

485486
let Some(audit_result) = maybe_audit_result else {
486487
// Sector didn't have any solutions

crates/subspace-farmer-components/benches/auditing.rs

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -151,14 +151,17 @@ pub fn criterion_benchmark(c: &mut Criterion) {
151151
group.throughput(Throughput::Elements(1));
152152
group.bench_function("memory/sync", |b| {
153153
b.iter(|| async {
154-
black_box(audit_plot_sync(
155-
black_box(public_key),
156-
black_box(global_challenge),
157-
black_box(solution_range),
158-
black_box(&plotted_sector_bytes),
159-
black_box(slice::from_ref(&plotted_sector.sector_metadata)),
160-
black_box(None),
161-
));
154+
black_box(
155+
audit_plot_sync(
156+
black_box(public_key),
157+
black_box(global_challenge),
158+
black_box(solution_range),
159+
black_box(&plotted_sector_bytes),
160+
black_box(slice::from_ref(&plotted_sector.sector_metadata)),
161+
black_box(None),
162+
)
163+
.unwrap(),
164+
);
162165
})
163166
});
164167

@@ -193,14 +196,17 @@ pub fn criterion_benchmark(c: &mut Criterion) {
193196
group.throughput(Throughput::Elements(sectors_count));
194197
group.bench_function("disk/sync", |b| {
195198
b.iter(|| {
196-
black_box(audit_plot_sync(
197-
black_box(public_key),
198-
black_box(global_challenge),
199-
black_box(solution_range),
200-
black_box(&plot_file),
201-
black_box(&sectors_metadata),
202-
black_box(None),
203-
));
199+
black_box(
200+
audit_plot_sync(
201+
black_box(public_key),
202+
black_box(global_challenge),
203+
black_box(solution_range),
204+
black_box(&plot_file),
205+
black_box(&sectors_metadata),
206+
black_box(None),
207+
)
208+
.unwrap(),
209+
);
204210
});
205211
});
206212

crates/subspace-farmer-components/benches/proving.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,8 @@ pub fn criterion_benchmark(c: &mut Criterion) {
166166
&plotted_sector_bytes,
167167
slice::from_ref(&plotted_sector.sector_metadata),
168168
None,
169-
);
169+
)
170+
.unwrap();
170171

171172
let solution_candidates = match audit_results.into_iter().next() {
172173
Some(audit_result) => audit_result.solution_candidates,
@@ -249,7 +250,8 @@ pub fn criterion_benchmark(c: &mut Criterion) {
249250
&plot_file,
250251
&sectors_metadata,
251252
None,
252-
);
253+
)
254+
.unwrap();
253255
let solution_candidates = audit_results
254256
.into_iter()
255257
.map(|audit_result| audit_result.solution_candidates)

crates/subspace-farmer-components/src/auditing.rs

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,28 @@ use crate::proving::SolutionCandidates;
22
use crate::sector::{sector_size, SectorContentsMap, SectorMetadataChecksummed};
33
use crate::{ReadAtOffset, ReadAtSync};
44
use rayon::prelude::*;
5+
use std::io;
56
use subspace_core_primitives::crypto::Scalar;
67
use subspace_core_primitives::{
78
Blake3Hash, PublicKey, SBucket, SectorId, SectorIndex, SectorSlotChallenge, SolutionRange,
89
};
910
use subspace_verification::is_within_solution_range;
10-
use tracing::warn;
11+
use thiserror::Error;
12+
13+
/// Errors that happen during auditing
14+
#[derive(Debug, Error)]
15+
pub enum AuditingError {
16+
/// Failed to read s-bucket
17+
#[error("Failed read s-bucket {s_bucket_audit_index} of sector {sector_index}: {error}")]
18+
SBucketReading {
19+
/// Sector index
20+
sector_index: SectorIndex,
21+
/// S-bucket audit index
22+
s_bucket_audit_index: SBucket,
23+
/// Low-level error
24+
error: io::Error,
25+
},
26+
}
1127

1228
/// Result of sector audit
1329
#[derive(Debug, Clone)]
@@ -42,7 +58,7 @@ pub fn audit_sector_sync<'a, Sector>(
4258
solution_range: SolutionRange,
4359
sector: Sector,
4460
sector_metadata: &'a SectorMetadataChecksummed,
45-
) -> Option<AuditResult<'a, Sector>>
61+
) -> Result<Option<AuditResult<'a, Sector>>, AuditingError>
4662
where
4763
Sector: ReadAtSync + 'a,
4864
{
@@ -55,26 +71,24 @@ where
5571
} = collect_sector_auditing_details(public_key.hash(), global_challenge, sector_metadata);
5672

5773
let mut s_bucket = vec![0; s_bucket_audit_size];
58-
let read_s_bucket_result = sector.read_at(&mut s_bucket, s_bucket_audit_offset_in_sector);
59-
60-
if let Err(error) = read_s_bucket_result {
61-
warn!(
62-
%error,
63-
sector_index = %sector_metadata.sector_index,
64-
%s_bucket_audit_index,
65-
"Failed read s-bucket",
66-
);
67-
return None;
68-
}
74+
sector
75+
.read_at(&mut s_bucket, s_bucket_audit_offset_in_sector)
76+
.map_err(|error| AuditingError::SBucketReading {
77+
sector_index: sector_metadata.sector_index,
78+
s_bucket_audit_index,
79+
error,
80+
})?;
6981

70-
let (winning_chunks, best_solution_distance) = map_winning_chunks(
82+
let Some((winning_chunks, best_solution_distance)) = map_winning_chunks(
7183
&s_bucket,
7284
global_challenge,
7385
&sector_slot_challenge,
7486
solution_range,
75-
)?;
87+
) else {
88+
return Ok(None);
89+
};
7690

77-
Some(AuditResult {
91+
Ok(Some(AuditResult {
7892
sector_index: sector_metadata.sector_index,
7993
solution_candidates: SolutionCandidates::new(
8094
public_key,
@@ -85,7 +99,7 @@ where
8599
winning_chunks.into(),
86100
),
87101
best_solution_distance,
88-
})
102+
}))
89103
}
90104

91105
/// Audit the whole plot and generate streams of solutions
@@ -96,7 +110,7 @@ pub fn audit_plot_sync<'a, Plot>(
96110
plot: &'a Plot,
97111
sectors_metadata: &'a [SectorMetadataChecksummed],
98112
maybe_sector_being_modified: Option<SectorIndex>,
99-
) -> Vec<AuditResult<'a, ReadAtOffset<'a, Plot>>>
113+
) -> Result<Vec<AuditResult<'a, ReadAtOffset<'a, Plot>>>, AuditingError>
100114
where
101115
Plot: ReadAtSync + 'a,
102116
{
@@ -135,14 +149,11 @@ where
135149
&mut s_bucket,
136150
sector_auditing_info.s_bucket_audit_offset_in_sector,
137151
) {
138-
warn!(
139-
%error,
140-
sector_index = %sector_metadata.sector_index,
141-
s_bucket_audit_index = %sector_auditing_info.s_bucket_audit_index,
142-
"Failed read s-bucket",
143-
);
144-
145-
return None;
152+
return Some(Err(AuditingError::SBucketReading {
153+
sector_index: sector_metadata.sector_index,
154+
s_bucket_audit_index: sector_auditing_info.s_bucket_audit_index,
155+
error,
156+
}));
146157
}
147158

148159
let (winning_chunks, best_solution_distance) = map_winning_chunks(
@@ -152,7 +163,7 @@ where
152163
solution_range,
153164
)?;
154165

155-
Some(AuditResult {
166+
Some(Ok(AuditResult {
156167
sector_index: sector_metadata.sector_index,
157168
solution_candidates: SolutionCandidates::new(
158169
public_key,
@@ -163,7 +174,7 @@ where
163174
winning_chunks.into(),
164175
),
165176
best_solution_distance,
166-
})
177+
}))
167178
})
168179
.collect()
169180
}

crates/subspace-farmer-components/src/proving.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,20 @@ pub enum ProvingError {
6060
RecordReadingError(#[from] ReadingError),
6161
}
6262

63+
impl ProvingError {
64+
/// Whether this error is fatal and makes farm unusable
65+
pub fn is_fatal(&self) -> bool {
66+
match self {
67+
ProvingError::InvalidErasureCodingInstance => true,
68+
ProvingError::FailedToCreatePolynomialForRecord { .. } => false,
69+
ProvingError::FailedToCreateChunkWitness { .. } => false,
70+
ProvingError::FailedToDecodeSectorContentsMap(_) => false,
71+
ProvingError::Io(_) => true,
72+
ProvingError::RecordReadingError(error) => error.is_fatal(),
73+
}
74+
}
75+
}
76+
6377
#[derive(Debug, Clone)]
6478
struct WinningChunk {
6579
/// Chunk offset within s-bucket

crates/subspace-farmer-components/src/reading.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,21 @@ pub enum ReadingError {
7575
ChecksumMismatch,
7676
}
7777

78+
impl ReadingError {
79+
/// Whether this error is fatal and renders farm unusable
80+
pub fn is_fatal(&self) -> bool {
81+
match self {
82+
ReadingError::FailedToReadChunk { .. } => false,
83+
ReadingError::InvalidChunk { .. } => false,
84+
ReadingError::FailedToErasureDecodeRecord { .. } => false,
85+
ReadingError::WrongRecordSizeAfterDecoding { .. } => false,
86+
ReadingError::FailedToDecodeSectorContentsMap(_) => false,
87+
ReadingError::Io(_) => true,
88+
ReadingError::ChecksumMismatch => false,
89+
}
90+
}
91+
}
92+
7893
/// Record contained in the plot
7994
#[derive(Debug, Clone)]
8095
pub struct PlotRecord {

crates/subspace-farmer/src/bin/subspace-farmer/commands/benchmark.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ fn prove(
250250
table_generator: &table_generator,
251251
};
252252

253-
let mut audit_results = plot_audit.audit(options);
253+
let mut audit_results = plot_audit.audit(options).unwrap();
254254

255255
group.bench_function("plot/single", |b| {
256256
b.iter_batched(
@@ -259,7 +259,7 @@ fn prove(
259259
return result;
260260
}
261261

262-
audit_results = plot_audit.audit(options);
262+
audit_results = plot_audit.audit(options).unwrap();
263263

264264
audit_results.pop().unwrap()
265265
},
@@ -293,7 +293,7 @@ fn prove(
293293
maybe_sector_being_modified: None,
294294
table_generator: &table_generator,
295295
};
296-
let mut audit_results = plot_audit.audit(options);
296+
let mut audit_results = plot_audit.audit(options).unwrap();
297297

298298
group.bench_function("plot/rayon", |b| {
299299
b.iter_batched(
@@ -302,7 +302,7 @@ fn prove(
302302
return result;
303303
}
304304

305-
audit_results = plot_audit.audit(options);
305+
audit_results = plot_audit.audit(options).unwrap();
306306

307307
audit_results.pop().unwrap()
308308
},

crates/subspace-farmer/src/bin/subspace-farmer/commands/farm.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,9 @@ where
761761
proving_details.result,
762762
);
763763
}
764+
FarmingNotification::NonFatalError(error) => {
765+
farmer_metrics.note_farming_error(&single_disk_farm_id, error);
766+
}
764767
}
765768
}))
766769
.detach();

crates/subspace-farmer/src/bin/subspace-farmer/commands/farm/metrics.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@ use prometheus_client::registry::{Registry, Unit};
55
use std::sync::atomic::AtomicU64;
66
use std::time::Duration;
77
use subspace_farmer::single_disk_farm::farming::ProvingResult;
8-
use subspace_farmer::single_disk_farm::SingleDiskFarmId;
8+
use subspace_farmer::single_disk_farm::{FarmingError, SingleDiskFarmId};
99

1010
#[derive(Debug, Clone)]
1111
pub(super) struct FarmerMetrics {
1212
auditing_time: Family<Vec<(String, String)>, Histogram>,
1313
proving_time: Family<Vec<(String, String)>, Histogram>,
14+
farming_errors: Family<Vec<(String, String)>, Counter<u64, AtomicU64>>,
1415
sector_downloading_time: Family<Vec<(String, String)>, Histogram>,
1516
sector_encoding_time: Family<Vec<(String, String)>, Histogram>,
1617
sector_writing_time: Family<Vec<(String, String)>, Histogram>,
@@ -51,6 +52,14 @@ impl FarmerMetrics {
5152
proving_time.clone(),
5253
);
5354

55+
let farming_errors = Family::<_, _>::new_with_constructor(Counter::<_, _>::default);
56+
57+
sub_registry.register(
58+
"farming_errors",
59+
"Non-fatal farming errors",
60+
farming_errors.clone(),
61+
);
62+
5463
let sector_downloading_time = Family::<_, _>::new_with_constructor(|| {
5564
Histogram::new(exponential_buckets(0.0001, 2.0, 15))
5665
});
@@ -170,6 +179,7 @@ impl FarmerMetrics {
170179
Self {
171180
auditing_time,
172181
proving_time,
182+
farming_errors,
173183
sector_downloading_time,
174184
sector_encoding_time,
175185
sector_writing_time,
@@ -212,6 +222,19 @@ impl FarmerMetrics {
212222
.observe(time.as_secs_f64());
213223
}
214224

225+
pub(super) fn note_farming_error(
226+
&self,
227+
single_disk_farm_id: &SingleDiskFarmId,
228+
error: &FarmingError,
229+
) {
230+
self.farming_errors
231+
.get_or_create(&vec![
232+
("farm_id".to_string(), single_disk_farm_id.to_string()),
233+
("error".to_string(), error.str_variant().to_string()),
234+
])
235+
.inc();
236+
}
237+
215238
pub(super) fn observe_sector_downloading_time(
216239
&self,
217240
single_disk_farm_id: &SingleDiskFarmId,

0 commit comments

Comments
 (0)