Skip to content

Commit dcd20e2

Browse files
authored
CasObject validation with updated tests (#21)
* CasObject validation with updated tests; also added the Copy trait to the HexMerkleHash struct
* Pulled fence-post out of loop
1 parent 6950115 commit dcd20e2

File tree

3 files changed

+113
-33
lines changed

3 files changed

+113
-33
lines changed

cas_object/src/cas_chunk_format.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use anyhow::anyhow;
99
use crate::CompressionScheme;
1010
use lz4_flex::frame::{FrameDecoder, FrameEncoder};
1111

12-
pub const CAS_CHUNK_HEADER_LENGTH: u8 = 8;
12+
pub const CAS_CHUNK_HEADER_LENGTH: usize = size_of::<CASChunkHeader>();
1313
const CURRENT_VERSION: u8 = 0;
1414

1515
#[repr(C, packed)]
@@ -131,10 +131,10 @@ pub fn deserialize_chunk_header<R: Read>(reader: &mut R) -> Result<CASChunkHeade
131131
Ok(result)
132132
}
133133

134-
pub fn deserialize_chunk<R: Read>(reader: &mut R) -> Result<Vec<u8>, CasObjectError> {
134+
pub fn deserialize_chunk<R: Read>(reader: &mut R) -> Result<(Vec<u8>, usize), CasObjectError> {
135135
let mut buf = Vec::new();
136-
let _ = deserialize_chunk_to_writer(reader, &mut buf)?;
137-
Ok(buf)
136+
let bytes_read = deserialize_chunk_to_writer(reader, &mut buf)?;
137+
Ok((buf, bytes_read))
138138
}
139139

140140
pub fn deserialize_chunk_to_writer<R: Read, W: Write>(
@@ -153,7 +153,7 @@ pub fn deserialize_chunk_to_writer<R: Read, W: Write>(
153153
}
154154
};
155155

156-
Ok(header.get_uncompressed_length() as usize)
156+
Ok(header.get_compressed_length() as usize + CAS_CHUNK_HEADER_LENGTH)
157157
}
158158

159159
pub fn deserialize_chunks<R: Read>(reader: &mut R) -> Result<Vec<u8>, CasObjectError> {
@@ -240,7 +240,7 @@ mod tests {
240240
write_chunk_header(&mut buf, &header).unwrap();
241241
buf.extend_from_slice(data);
242242

243-
let data_copy = deserialize_chunk(&mut Cursor::new(buf)).unwrap();
243+
let (data_copy, _) = deserialize_chunk(&mut Cursor::new(buf)).unwrap();
244244
assert_eq!(data_copy.as_slice(), data);
245245
}
246246

cas_object/src/cas_object_format.rs

Lines changed: 106 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use bytes::Buf;
22
use merkledb::{prelude::MerkleDBHighLevelMethodsV1, Chunk, MerkleMemDB};
33
use merklehash::{DataHash, MerkleHash};
4+
use tracing::warn;
45
use std::{
56
cmp::min,
67
io::{Cursor, Error, Read, Seek, Write},
@@ -403,7 +404,7 @@ impl CasObject {
403404
let mut res = Vec::<u8>::new();
404405

405406
while reader.has_remaining() {
406-
let data = deserialize_chunk(&mut reader)?;
407+
let (data, _) = deserialize_chunk(&mut reader)?;
407408
res.extend_from_slice(&data);
408409
}
409410
Ok(res)
@@ -538,6 +539,82 @@ impl CasObject {
538539
*ret.hash() == *hash
539540
}
540541

542+
/// Validate CasObject.
543+
/// Verifies each chunk is valid and correctly represented in CasObjectInfo, along with
544+
/// recomputing the hash and validating it matches CasObjectInfo.
545+
///
546+
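/// Returns Ok(true) only if every chunk check passes and the recomputed hash matches both the hash passed in and the hash stored in CasObjectInfo; returns Ok(false) if any check fails, and Err if the xorb has no chunks or cannot be read.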
547+
pub fn validate_cas_object<R: Read + Seek>(reader: &mut R, hash: &MerkleHash) -> Result<bool, CasObjectError> {
548+
549+
// 1. deserialize to get Info
550+
let cas = CasObject::deserialize(reader)?;
551+
552+
// 2. walk chunks from Info (skip the final dummy chunk)
553+
let mut hash_chunks: Vec<Chunk> = Vec::new();
554+
let mut cumulative_uncompressed_length: u32 = 0;
555+
let mut cumulative_compressed_length: u32 = 0;
556+
557+
if let Some(c) = cas.info.chunk_size_info.first() {
558+
if c.start_byte_index != 0 {
559+
// for 1st chunk verify that its start_byte_index is 0
560+
warn!("XORB Validation: Byte 0 does not contain 1st chunk.");
561+
return Ok(false);
562+
}
563+
} else {
564+
return Err(CasObjectError::FormatError(anyhow!("Invalid Xorb, no chunks")));
565+
}
566+
567+
for (idx, c) in cas.info.chunk_size_info[..cas.info.chunk_size_info.len() - 1].iter().enumerate() {
568+
569+
// 3. verify on each chunk:
570+
reader.seek(std::io::SeekFrom::Start(c.start_byte_index as u64))?;
571+
let (data, compressed_chunk_length) = deserialize_chunk(reader)?;
572+
let chunk_uncompressed_length = data.len();
573+
574+
// 3a. compute hash
575+
hash_chunks.push(Chunk {hash: merklehash::compute_data_hash(&data), length: chunk_uncompressed_length});
576+
577+
cumulative_uncompressed_length += data.len() as u32;
578+
cumulative_compressed_length += compressed_chunk_length as u32;
579+
580+
// 3b. verify cumulative uncompressed length through this chunk matches the Info object
581+
if cumulative_uncompressed_length != c.cumulative_uncompressed_len {
582+
warn!("XORB Validation: Chunk length does not match Info object.");
583+
return Ok(false);
584+
}
585+
586+
// 3c. verify start byte index of next chunk matches current byte index + compressed length
587+
if cas.info.chunk_size_info[idx+1].start_byte_index != (c.start_byte_index + compressed_chunk_length as u32) {
588+
warn!("XORB Validation: Chunk start byte index does not match Info object.");
589+
return Ok(false);
590+
}
591+
}
592+
593+
// validate that Info/footer begins immediately after the final content chunk.
594+
// end of for loop completes the content chunks, now should be able to deserialize an Info directly
595+
let cur_position = reader.stream_position()? as u32;
596+
let expected_position = cumulative_compressed_length;
597+
let expected_from_end_position = reader.seek(std::io::SeekFrom::End(0))? as u32 - cas.info_length - size_of::<u32>() as u32;
598+
if cur_position != expected_position || cur_position != expected_from_end_position {
599+
warn!("XORB Validation: Content bytes after known chunks in Info object.");
600+
return Ok(false);
601+
}
602+
603+
// 4. combine hashes to get full xorb hash, compare to provided
604+
let mut db = MerkleMemDB::default();
605+
let mut staging = db.start_insertion_staging();
606+
db.add_file(&mut staging, &hash_chunks);
607+
let ret = db.finalize(staging);
608+
609+
if *ret.hash() != *hash || *ret.hash() != cas.info.cashash {
610+
warn!("XORB Validation: Computed hash does not match provided hash or Info hash.");
611+
return Ok(false);
612+
}
613+
614+
Ok(true)
615+
616+
}
617+
541618
}
542619

543620
#[cfg(test)]
@@ -767,43 +844,38 @@ mod tests {
767844
fn test_basic_serialization_mem() {
768845
// Arrange
769846
let (c, _cas_data, raw_data) = build_cas_object(3, 100, false, false);
770-
let mut writer: Cursor<Vec<u8>> = Cursor::new(Vec::new());
847+
let mut buf: Cursor<Vec<u8>> = Cursor::new(Vec::new());
771848
// Act & Assert
772849
assert!(CasObject::serialize(
773-
&mut writer,
850+
&mut buf,
774851
&c.info.cashash,
775852
&raw_data,
776853
&c.get_chunk_boundaries(),
777854
CompressionScheme::None
778855
)
779856
.is_ok());
780857

781-
let mut reader = writer.clone();
782-
reader.set_position(0);
783-
let res = CasObject::deserialize(&mut reader);
784-
assert!(res.is_ok());
785-
let c2 = res.unwrap();
786-
assert_eq!(c, c2);
787-
assert_eq!(c.info.cashash, c2.info.cashash);
788-
assert_eq!(c.info.num_chunks, c2.info.num_chunks);
858+
assert!(CasObject::validate_cas_object(&mut buf, &c.info.cashash).unwrap());
789859
}
790860

791861
#[test]
792862
fn test_serialization_deserialization_mem_medium() {
793863
// Arrange
794864
let (c, _cas_data, raw_data) = build_cas_object(32, 16384, false, false);
795-
let mut writer: Cursor<Vec<u8>> = Cursor::new(Vec::new());
865+
let mut buf: Cursor<Vec<u8>> = Cursor::new(Vec::new());
796866
// Act & Assert
797867
assert!(CasObject::serialize(
798-
&mut writer,
868+
&mut buf,
799869
&c.info.cashash,
800870
&raw_data,
801871
&c.get_chunk_boundaries(),
802872
CompressionScheme::None
803873
)
804874
.is_ok());
805875

806-
let mut reader = writer.clone();
876+
assert!(CasObject::validate_cas_object(&mut buf, &c.info.cashash).unwrap());
877+
878+
let mut reader = buf.clone();
807879
reader.set_position(0);
808880
let res = CasObject::deserialize(&mut reader);
809881
assert!(res.is_ok());
@@ -820,18 +892,20 @@ mod tests {
820892
fn test_serialization_deserialization_mem_large_random() {
821893
// Arrange
822894
let (c, _cas_data, raw_data) = build_cas_object(32, 65536, true, false);
823-
let mut writer: Cursor<Vec<u8>> = Cursor::new(Vec::new());
895+
let mut buf: Cursor<Vec<u8>> = Cursor::new(Vec::new());
824896
// Act & Assert
825897
assert!(CasObject::serialize(
826-
&mut writer,
898+
&mut buf,
827899
&c.info.cashash,
828900
&raw_data,
829901
&c.get_chunk_boundaries(),
830902
CompressionScheme::None
831903
)
832904
.is_ok());
833905

834-
let mut reader = writer.clone();
906+
assert!(CasObject::validate_cas_object(&mut buf, &c.info.cashash).unwrap());
907+
908+
let mut reader = buf.clone();
835909
reader.set_position(0);
836910
let res = CasObject::deserialize(&mut reader);
837911
assert!(res.is_ok());
@@ -847,18 +921,20 @@ mod tests {
847921
fn test_serialization_deserialization_file_large_random() {
848922
// Arrange
849923
let (c, _cas_data, raw_data) = build_cas_object(256, 65536, true, false);
850-
let mut writer: Cursor<Vec<u8>> = Cursor::new(Vec::new());
924+
let mut buf: Cursor<Vec<u8>> = Cursor::new(Vec::new());
851925
// Act & Assert
852926
assert!(CasObject::serialize(
853-
&mut writer,
927+
&mut buf,
854928
&c.info.cashash,
855929
&raw_data,
856930
&c.get_chunk_boundaries(),
857931
CompressionScheme::None
858932
)
859933
.is_ok());
860934

861-
let mut reader = writer.clone();
935+
assert!(CasObject::validate_cas_object(&mut buf, &c.info.cashash).unwrap());
936+
937+
let mut reader = buf.clone();
862938
reader.set_position(0);
863939
let res = CasObject::deserialize(&mut reader);
864940
assert!(res.is_ok());
@@ -902,18 +978,20 @@ mod tests {
902978
fn test_serialization_deserialization_mem_medium_lz4() {
903979
// Arrange
904980
let (c, _cas_data, raw_data) = build_cas_object(32, 16384, false, true);
905-
let mut writer: Cursor<Vec<u8>> = Cursor::new(Vec::new());
981+
let mut buf: Cursor<Vec<u8>> = Cursor::new(Vec::new());
906982
// Act & Assert
907983
assert!(CasObject::serialize(
908-
&mut writer,
984+
&mut buf,
909985
&c.info.cashash,
910986
&raw_data,
911987
&c.get_chunk_boundaries(),
912988
CompressionScheme::LZ4
913989
)
914990
.is_ok());
915991

916-
let mut reader = writer.clone();
992+
assert!(CasObject::validate_cas_object(&mut buf, &c.info.cashash).unwrap());
993+
994+
let mut reader = buf.clone();
917995
reader.set_position(0);
918996
let res = CasObject::deserialize(&mut reader);
919997
assert!(res.is_ok());
@@ -930,18 +1008,20 @@ mod tests {
9301008
fn test_serialization_deserialization_mem_large_random_lz4() {
9311009
// Arrange
9321010
let (c, _cas_data, raw_data) = build_cas_object(32, 65536, true, true);
933-
let mut writer: Cursor<Vec<u8>> = Cursor::new(Vec::new());
1011+
let mut buf: Cursor<Vec<u8>> = Cursor::new(Vec::new());
9341012
// Act & Assert
9351013
assert!(CasObject::serialize(
936-
&mut writer,
1014+
&mut buf,
9371015
&c.info.cashash,
9381016
&raw_data,
9391017
&c.get_chunk_boundaries(),
9401018
CompressionScheme::LZ4
9411019
)
9421020
.is_ok());
9431021

944-
let mut reader = writer.clone();
1022+
assert!(CasObject::validate_cas_object(&mut buf, &c.info.cashash).unwrap());
1023+
1024+
let mut reader = buf.clone();
9451025
reader.set_position(0);
9461026
let res = CasObject::deserialize(&mut reader);
9471027
assert!(res.is_ok());

cas_types/src/key.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ mod hex {
5858
}
5959
}
6060

61-
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
61+
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
6262
pub struct HexMerkleHash(#[serde(with = "hex::serde")] pub MerkleHash);
6363

6464
impl From<MerkleHash> for HexMerkleHash {

0 commit comments

Comments
 (0)