Skip to content

Commit 88479b5

Browse files
AndreiEresgithub-actions[bot]
authored andcommitted
Check artifact integrity before execution (#8833)
Fixes #677 Fixes #2399 # Description To detect potential corruption of PVF artifacts on disk, we store their checksums and verify if they match before execution. In case of a mismatch, we recreate the artifact. ## Integration In Candidate Validation, we treat the error similarly to PossiblyInvalidError::RuntimeConstruction due to their close nature. ## Review Notes The Black3 hashing algorithm has already been used. I believe we can switch to twox, as suggested in the issue, because the checksum does not need to be cryptographically hashed, and we do not reveal the checksum in logs. --------- Co-authored-by: cmd[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 0fc0f7c commit 88479b5

File tree

16 files changed

+331
-62
lines changed

16 files changed

+331
-62
lines changed

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

polkadot/node/core/candidate-validation/src/lib.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,8 @@ async fn validate_candidate_exhaustive(
965965
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),
966966
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::RuntimeConstruction(err))) =>
967967
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),
968+
Err(ValidationError::PossiblyInvalid(err @ PossiblyInvalidError::CorruptedArtifact)) =>
969+
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err.to_string()))),
968970

969971
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) =>
970972
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!(
@@ -1148,7 +1150,7 @@ trait ValidationBackend {
11481150
let mut num_death_retries_left = 1;
11491151
let mut num_job_error_retries_left = 1;
11501152
let mut num_internal_retries_left = 1;
1151-
let mut num_runtime_construction_retries_left = 1;
1153+
let mut num_execution_error_retries_left = 1;
11521154
loop {
11531155
// Stop retrying if we exceeded the timeout.
11541156
if total_time_start.elapsed() + retry_delay > exec_timeout {
@@ -1168,9 +1170,10 @@ trait ValidationBackend {
11681170
break_if_no_retries_left!(num_internal_retries_left),
11691171

11701172
Err(ValidationError::PossiblyInvalid(
1171-
PossiblyInvalidError::RuntimeConstruction(_),
1173+
PossiblyInvalidError::RuntimeConstruction(_) |
1174+
PossiblyInvalidError::CorruptedArtifact,
11721175
)) => {
1173-
break_if_no_retries_left!(num_runtime_construction_retries_left);
1176+
break_if_no_retries_left!(num_execution_error_retries_left);
11741177
self.precheck_pvf(pvf.clone()).await?;
11751178
// In this case the error is deterministic
11761179
// And a retry forces the ValidationBackend

polkadot/node/core/pvf/common/src/execute.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ pub enum JobResponse {
8181
InvalidCandidate(String),
8282
/// PoV decompression failed
8383
PoVDecompressionFailure,
84+
/// The artifact is corrupted, re-prepare the artifact and try again.
85+
CorruptedArtifact,
8486
}
8587

8688
impl JobResponse {

polkadot/node/core/pvf/common/src/lib.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ pub use sp_tracing;
3333
const LOG_TARGET: &str = "parachain::pvf-common";
3434

3535
use codec::{Decode, Encode};
36+
use sp_core::H256;
3637
use std::{
3738
io::{self, Read, Write},
3839
mem,
@@ -88,6 +89,15 @@ pub fn framed_recv_blocking(r: &mut (impl Read + Unpin)) -> io::Result<Vec<u8>>
8889
Ok(buf)
8990
}
9091

92+
#[derive(Debug, Default, Clone, Copy, Encode, Decode, PartialEq, Eq)]
93+
#[repr(transparent)]
94+
pub struct ArtifactChecksum(H256);
95+
96+
/// Compute the checksum of the given artifact.
97+
pub fn compute_checksum(data: &[u8]) -> ArtifactChecksum {
98+
ArtifactChecksum(H256::from_slice(&sp_crypto_hashing::twox_256(data)))
99+
}
100+
91101
#[cfg(all(test, not(feature = "test-utils")))]
92102
mod tests {
93103
use super::*;

polkadot/node/core/pvf/common/src/prepare.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,24 @@
1414
// You should have received a copy of the GNU General Public License
1515
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.
1616

17+
use crate::ArtifactChecksum;
1718
use codec::{Decode, Encode};
1819
use std::path::PathBuf;
1920

2021
/// Result from prepare worker if successful.
2122
#[derive(Debug, Clone, Default, Encode, Decode)]
2223
pub struct PrepareWorkerSuccess {
2324
/// Checksum of the compiled PVF.
24-
pub checksum: String,
25+
pub checksum: ArtifactChecksum,
2526
/// Stats of the current preparation run.
2627
pub stats: PrepareStats,
2728
}
2829

2930
/// Result of PVF preparation if successful.
3031
#[derive(Debug, Clone, Default)]
3132
pub struct PrepareSuccess {
33+
/// Checksum of the compiled PVF.
34+
pub checksum: ArtifactChecksum,
3235
/// Canonical path to the compiled artifact.
3336
pub path: PathBuf,
3437
/// Size in bytes

polkadot/node/core/pvf/execute-worker/src/lib.rs

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ use nix::{
3939
unistd::{ForkResult, Pid},
4040
};
4141
use polkadot_node_core_pvf_common::{
42+
compute_checksum,
4243
error::InternalValidationError,
4344
execute::{Handshake, JobError, JobResponse, JobResult, WorkerError, WorkerResponse},
4445
executor_interface::params_to_wasmtime_semantics,
@@ -49,7 +50,7 @@ use polkadot_node_core_pvf_common::{
4950
thread::{self, WaitOutcome},
5051
PipeFd, WorkerInfo, WorkerKind,
5152
},
52-
worker_dir,
53+
worker_dir, ArtifactChecksum,
5354
};
5455
use polkadot_node_primitives::{BlockData, PoV, POV_BOMB_LIMIT};
5556
use polkadot_parachain_primitives::primitives::ValidationResult;
@@ -87,7 +88,9 @@ fn recv_execute_handshake(stream: &mut UnixStream) -> io::Result<Handshake> {
8788
Ok(handshake)
8889
}
8990

90-
fn recv_request(stream: &mut UnixStream) -> io::Result<(PersistedValidationData, PoV, Duration)> {
91+
fn recv_request(
92+
stream: &mut UnixStream,
93+
) -> io::Result<(PersistedValidationData, PoV, Duration, ArtifactChecksum)> {
9194
let pvd = framed_recv_blocking(stream)?;
9295
let pvd = PersistedValidationData::decode(&mut &pvd[..]).map_err(|_| {
9396
io::Error::new(
@@ -111,7 +114,17 @@ fn recv_request(stream: &mut UnixStream) -> io::Result<(PersistedValidationData,
111114
"execute pvf recv_request: failed to decode duration".to_string(),
112115
)
113116
})?;
114-
Ok((pvd, pov, execution_timeout))
117+
118+
let artifact_checksum = framed_recv_blocking(stream)?;
119+
let artifact_checksum =
120+
ArtifactChecksum::decode(&mut &artifact_checksum[..]).map_err(|_| {
121+
io::Error::new(
122+
io::ErrorKind::Other,
123+
"execute pvf recv_request: failed to decode artifact checksum".to_string(),
124+
)
125+
})?;
126+
127+
Ok((pvd, pov, execution_timeout, artifact_checksum))
115128
}
116129

117130
/// Sends an error to the host and returns the original error wrapped in `io::Error`.
@@ -166,14 +179,15 @@ pub fn worker_entrypoint(
166179
let execute_thread_stack_size = max_stack_size(&executor_params);
167180

168181
loop {
169-
let (pvd, pov, execution_timeout) = recv_request(&mut stream).map_err(|e| {
170-
map_and_send_err!(
171-
e,
172-
InternalValidationError::HostCommunication,
173-
&mut stream,
174-
worker_info
175-
)
176-
})?;
182+
let (pvd, pov, execution_timeout, artifact_checksum) = recv_request(&mut stream)
183+
.map_err(|e| {
184+
map_and_send_err!(
185+
e,
186+
InternalValidationError::HostCommunication,
187+
&mut stream,
188+
worker_info
189+
)
190+
})?;
177191
gum::debug!(
178192
target: LOG_TARGET,
179193
?worker_info,
@@ -192,6 +206,19 @@ pub fn worker_entrypoint(
192206
)
193207
})?;
194208

209+
if artifact_checksum != compute_checksum(&compiled_artifact_blob) {
210+
send_result::<WorkerResponse, WorkerError>(
211+
&mut stream,
212+
Ok(WorkerResponse {
213+
job_response: JobResponse::CorruptedArtifact,
214+
duration: Duration::ZERO,
215+
pov_size: 0,
216+
}),
217+
worker_info,
218+
)?;
219+
continue;
220+
}
221+
195222
let (pipe_read_fd, pipe_write_fd) = pipe2_cloexec().map_err(|e| {
196223
map_and_send_err!(
197224
e,

polkadot/node/core/pvf/prepare-worker/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ name = "prepare_rococo_runtime"
1616
harness = false
1717

1818
[dependencies]
19-
blake3 = { workspace = true }
2019
cfg-if = { workspace = true }
2120
gum = { workspace = true, default-features = true }
2221
libc = { workspace = true }

polkadot/node/core/pvf/prepare-worker/src/lib.rs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ const LOG_TARGET: &str = "parachain::pvf-prepare-worker";
2626
use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread};
2727
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
2828
use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop};
29+
use codec::{Decode, Encode};
2930
use nix::{
3031
errno::Errno,
3132
sys::{
@@ -35,22 +36,17 @@ use nix::{
3536
unistd::{ForkResult, Pid},
3637
};
3738
use polkadot_node_core_pvf_common::{
38-
executor_interface::{prepare, prevalidate},
39-
worker::{pipe2_cloexec, PipeFd, WorkerInfo},
40-
};
41-
42-
use codec::{Decode, Encode};
43-
use polkadot_node_core_pvf_common::{
39+
compute_checksum,
4440
error::{PrepareError, PrepareWorkerResult},
45-
executor_interface::create_runtime_from_artifact_bytes,
41+
executor_interface::{create_runtime_from_artifact_bytes, prepare, prevalidate},
4642
framed_recv_blocking, framed_send_blocking,
4743
prepare::{MemoryStats, PrepareJobKind, PrepareStats, PrepareWorkerSuccess},
4844
pvf::PvfPrepData,
4945
worker::{
50-
cpu_time_monitor_loop, get_total_cpu_usage, recv_child_response, run_worker, send_result,
51-
stringify_errno, stringify_panic_payload,
46+
cpu_time_monitor_loop, get_total_cpu_usage, pipe2_cloexec, recv_child_response, run_worker,
47+
send_result, stringify_errno, stringify_panic_payload,
5248
thread::{self, spawn_worker_thread, WaitOutcome},
53-
WorkerKind,
49+
PipeFd, WorkerInfo, WorkerKind,
5450
},
5551
worker_dir, ProcessTime,
5652
};
@@ -718,7 +714,7 @@ fn handle_parent_process(
718714
return Err(PrepareError::IoErr(err.to_string()))
719715
};
720716

721-
let checksum = blake3::hash(&artifact.as_ref()).to_hex().to_string();
717+
let checksum = compute_checksum(&artifact.as_ref());
722718
Ok(PrepareWorkerSuccess {
723719
checksum,
724720
stats: PrepareStats {

polkadot/node/core/pvf/src/artifacts.rs

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
5757
use crate::{host::PrecheckResultSender, worker_interface::WORKER_DIR_PREFIX};
5858
use always_assert::always;
59-
use polkadot_node_core_pvf_common::{error::PrepareError, pvf::PvfPrepData};
59+
use polkadot_node_core_pvf_common::{error::PrepareError, pvf::PvfPrepData, ArtifactChecksum};
6060
use polkadot_parachain_primitives::primitives::ValidationCodeHash;
6161
use polkadot_primitives::ExecutorParamsPrepHash;
6262
use std::{
@@ -120,11 +120,12 @@ impl ArtifactId {
120120
pub struct ArtifactPathId {
121121
pub(crate) id: ArtifactId,
122122
pub(crate) path: PathBuf,
123+
pub(crate) checksum: ArtifactChecksum,
123124
}
124125

125126
impl ArtifactPathId {
126-
pub(crate) fn new(artifact_id: ArtifactId, path: &Path) -> Self {
127-
Self { id: artifact_id, path: path.to_owned() }
127+
pub(crate) fn new(artifact_id: ArtifactId, path: &Path, checksum: ArtifactChecksum) -> Self {
128+
Self { id: artifact_id, path: path.to_owned(), checksum }
128129
}
129130
}
130131

@@ -135,6 +136,8 @@ pub enum ArtifactState {
135136
/// That means that the artifact should be accessible through the path obtained by the artifact
136137
/// id (unless, it was removed externally).
137138
Prepared {
139+
/// The checksum of the compiled artifact.
140+
checksum: ArtifactChecksum,
138141
/// The path of the compiled artifact.
139142
path: PathBuf,
140143
/// The time when the artifact was last needed.
@@ -212,6 +215,21 @@ impl Artifacts {
212215
self.inner.keys().cloned().collect()
213216
}
214217

218+
#[cfg(feature = "test-utils")]
219+
pub fn replace_artifact_checksum(
220+
&mut self,
221+
checksum: ArtifactChecksum,
222+
new_checksum: ArtifactChecksum,
223+
) {
224+
for artifact in self.inner.values_mut() {
225+
if let ArtifactState::Prepared { checksum: c, .. } = artifact {
226+
if *c == checksum {
227+
*c = new_checksum;
228+
}
229+
}
230+
}
231+
}
232+
215233
/// Create an empty table and the cache directory on-disk if it doesn't exist.
216234
pub async fn new(cache_path: &Path) -> Self {
217235
// Make sure that the cache path directory and all its parents are created.
@@ -265,13 +283,14 @@ impl Artifacts {
265283
&mut self,
266284
artifact_id: ArtifactId,
267285
path: PathBuf,
286+
checksum: ArtifactChecksum,
268287
last_time_needed: SystemTime,
269288
size: u64,
270289
) {
271290
// See the precondition.
272291
always!(self
273292
.inner
274-
.insert(artifact_id, ArtifactState::Prepared { path, last_time_needed, size })
293+
.insert(artifact_id, ArtifactState::Prepared { path, checksum, last_time_needed, size })
275294
.is_none());
276295
}
277296

@@ -376,18 +395,21 @@ mod tests {
376395
artifacts.insert_prepared(
377396
artifact_id1.clone(),
378397
path1.clone(),
398+
Default::default(),
379399
mock_now - Duration::from_secs(5),
380400
1024,
381401
);
382402
artifacts.insert_prepared(
383403
artifact_id2.clone(),
384404
path2.clone(),
405+
Default::default(),
385406
mock_now - Duration::from_secs(10),
386407
1024,
387408
);
388409
artifacts.insert_prepared(
389410
artifact_id3.clone(),
390411
path3.clone(),
412+
Default::default(),
391413
mock_now - Duration::from_secs(15),
392414
1024,
393415
);
@@ -421,18 +443,21 @@ mod tests {
421443
artifacts.insert_prepared(
422444
artifact_id1.clone(),
423445
path1.clone(),
446+
Default::default(),
424447
mock_now - Duration::from_secs(5),
425448
1024,
426449
);
427450
artifacts.insert_prepared(
428451
artifact_id2.clone(),
429452
path2.clone(),
453+
Default::default(),
430454
mock_now - Duration::from_secs(10),
431455
1024,
432456
);
433457
artifacts.insert_prepared(
434458
artifact_id3.clone(),
435459
path3.clone(),
460+
Default::default(),
436461
mock_now - Duration::from_secs(15),
437462
1024,
438463
);

polkadot/node/core/pvf/src/error.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ pub enum PossiblyInvalidError {
9898
/// Possibly related to local issues or dirty node update. May be retried with re-preparation.
9999
#[error("possibly invalid: runtime construction: {0}")]
100100
RuntimeConstruction(String),
101+
/// The artifact is corrupted, re-prepare the artifact and try again.
102+
#[error("possibly invalid: artifact is corrupted")]
103+
CorruptedArtifact,
101104
}
102105

103106
impl From<PrepareError> for ValidationError {

0 commit comments

Comments
 (0)