diff --git a/crates/lib/src/core/db/merkle_node/merkle_node_db.rs b/crates/lib/src/core/db/merkle_node/merkle_node_db.rs index 1af44a3d1..e64187744 100644 --- a/crates/lib/src/core/db/merkle_node/merkle_node_db.rs +++ b/crates/lib/src/core/db/merkle_node/merkle_node_db.rs @@ -46,10 +46,6 @@ For example, data for a vnode of hash 1234 with two children: {dir data node} */ -use rmp_serde::Serializer; -use serde::Serialize; -use std::fmt::Debug; -use std::fmt::Display; use std::fs::File; use std::io::Read; use std::io::Seek; @@ -223,10 +219,14 @@ impl MerkleNodeDB { } pub fn node(&self) -> Result { - Self::to_node(self.dtype, &self.data()) + let node = Self::to_node(self.dtype, &self.data())?; + Ok(node) } - pub fn to_node(dtype: MerkleTreeNodeType, data: &[u8]) -> Result { + pub fn to_node( + dtype: MerkleTreeNodeType, + data: &[u8], + ) -> Result { match dtype { MerkleTreeNodeType::Commit => { Ok(EMerkleTreeNode::Commit(CommitNode::deserialize(data)?)) @@ -254,9 +254,9 @@ impl MerkleNodeDB { Self::open(path, true) } - pub fn open_read_write_if_not_exists( + pub fn open_read_write_if_not_exists( repo: &LocalRepository, - node: &impl TMerkleTreeNode, + node: &N, parent_id: Option, ) -> Result, OxenError> { if Self::exists(repo, &node.hash()) { @@ -271,9 +271,9 @@ impl MerkleNodeDB { } } - pub fn open_read_write( + pub fn open_read_write( repo: &LocalRepository, - node: &impl TMerkleTreeNode, + node: &N, parent_id: Option, ) -> Result { let path = node_db_path(repo, &node.hash()); @@ -322,10 +322,11 @@ impl MerkleNodeDB { (None, Some(node_file), Some(children_file)) }; - let dtype = lookup - .as_ref() - .map(|l| MerkleTreeNodeType::from_u8(l.data_type)) - .unwrap_or(MerkleTreeNodeType::Commit); + let dtype = match lookup.as_ref() { + Some(l) => MerkleTreeNodeType::from_u8(l.data_type)?, + None => MerkleTreeNodeType::Commit, + }; + let parent_id = lookup.as_ref().map(|l| l.parent_id); Ok(Self { read_only, @@ -364,7 +365,7 @@ impl MerkleNodeDB { } /// Write the base node info. - fn write_node( + fn write_node( &mut self, node: &N, parent_id: Option, @@ -393,8 +394,7 @@ impl MerkleNodeDB { } // Write data length - let mut buf = Vec::new(); - node.serialize(&mut Serializer::new(&mut buf)).unwrap(); + let buf = node.to_msgpack_bytes()?; let data_len = buf.len() as u32; node_file.write_all(&data_len.to_le_bytes())?; // log::debug!("write_node Wrote data length {}", data_len); @@ -425,9 +425,7 @@ impl MerkleNodeDB { return Err(OxenError::basic_str("Must call open() before writing")); }; - // TODO: Abstract and re-use in write_all - let mut buf = Vec::new(); - item.serialize(&mut Serializer::new(&mut buf)).unwrap(); + let buf = item.to_msgpack_bytes()?; let data_len = buf.len() as u64; // log::debug!("--add_child-- node_file {:?}", node_file); // log::debug!("--add_child-- dtype {:?}", item.dtype()); @@ -498,7 +496,7 @@ impl MerkleNodeDB { }; // Parse the node parent id - let data_type = MerkleTreeNodeType::from_u8(lookup.data_type); + let data_type = MerkleTreeNodeType::from_u8(lookup.data_type)?; let parent_id = MerkleTreeNode::deserialize_id(&lookup.data, data_type)?; let mut file_data = Vec::new(); @@ -517,7 +515,7 @@ impl MerkleNodeDB { cursor.seek(SeekFrom::Start(*offset))?; let mut data = vec![0; *len as usize]; cursor.read_exact(&mut data)?; - let dtype = MerkleTreeNodeType::from_u8(*dtype); + let dtype = MerkleTreeNodeType::from_u8(*dtype)?; let node = MerkleTreeNode { parent_id: Some(parent_id), hash: MerkleHash::new(*hash), diff --git a/crates/lib/src/error.rs b/crates/lib/src/error.rs index ca3de92a3..79133b41a 100644 --- a/crates/lib/src/error.rs +++ b/crates/lib/src/error.rs @@ -14,6 +14,7 @@ use crate::model::ParsedResource; use crate::model::RepoNew; use crate::model::Schema; use crate::model::Workspace; +use crate::model::merkle_tree::node_type::InvalidMerkleTreeNodeType; pub mod path_buf_error; pub mod string_error; @@ -143,6 +144,14 @@ pub enum OxenError { #[error("{0}")] CommitEntryNotFound(StringError), + // + // Merkle Tree Operations + // + /// A failure during serialization or deserialization of a merkle tree node: it has an unknown + /// u8 marker for its node type. + #[error("{0}")] + MerkleTreeError(#[from] InvalidMerkleTreeNodeType), + // // Schema (dataframes) // @@ -303,6 +312,10 @@ pub enum OxenError { #[error("Invalid integer: {0}")] ParseIntError(#[from] ParseIntError), + /// Wraps any error that we get from encoding message pack data. + #[error("Encode error: {0}")] + RmpEncodeError(#[from] rmp_serde::encode::Error), + /// Wraps any error that we get from decoding message pack data. #[error("Decode error: {0}")] RmpDecodeError(#[from] rmp_serde::decode::Error), diff --git a/crates/lib/src/model/merkle_tree/node/commit_node.rs b/crates/lib/src/model/merkle_tree/node/commit_node.rs index 8b0918e27..d9f18d66c 100644 --- a/crates/lib/src/model/merkle_tree/node/commit_node.rs +++ b/crates/lib/src/model/merkle_tree/node/commit_node.rs @@ -9,7 +9,7 @@ use crate::core::v_old::v0_19_0::model::merkle_tree::node::commit_node::CommitNo use crate::core::versions::MinOxenVersion; use crate::error::OxenError; use crate::model::{Commit, LocalRepository}; -use crate::model::{MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType, TMerkleTreeNode}; +use crate::model::{MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType}; pub trait TCommitNode { fn node_type(&self) -> &MerkleTreeNodeType; @@ -124,7 +124,7 @@ impl CommitNode { } } - pub fn deserialize(data: &[u8]) -> Result { + pub fn deserialize(data: &[u8]) -> Result { // In order to support versions that didn't have the enum, // if it fails we will fall back to the old struct, then populate the enum let commit: CommitNode = match rmp_serde::from_slice(data) { @@ -204,8 +204,6 @@ impl MerkleTreeNodeIdType for CommitNode { } } -impl TMerkleTreeNode for CommitNode {} - /// Debug is used for verbose multi-line output with println!("{:?}", node) impl fmt::Debug for CommitNode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { diff --git a/crates/lib/src/model/merkle_tree/node/dir_node.rs b/crates/lib/src/model/merkle_tree/node/dir_node.rs index 4f95b87f1..d026190ec 100644 --- a/crates/lib/src/model/merkle_tree/node/dir_node.rs +++ b/crates/lib/src/model/merkle_tree/node/dir_node.rs @@ -8,9 +8,7 @@ use crate::core::v_latest::model::merkle_tree::node::dir_node::DirNodeData as Di use crate::core::v_old::v0_19_0::model::merkle_tree::node::dir_node::DirNodeData as DirNodeDataV0_19_0; use crate::core::versions::MinOxenVersion; use crate::error::OxenError; -use crate::model::{ - LocalRepository, MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType, TMerkleTreeNode, -}; +use crate::model::{LocalRepository, MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType}; use crate::view::DataTypeCount; pub trait TDirNode { @@ -120,7 +118,7 @@ impl DirNode { } } - pub fn deserialize(data: &[u8]) -> Result { + pub fn deserialize(data: &[u8]) -> Result { // In order to support versions that didn't have the enum, // if it fails we will fall back to the old struct, then populate the enum let dir_node: DirNode = match rmp_serde::from_slice(data) { @@ -262,8 +260,6 @@ impl MerkleTreeNodeIdType for DirNode { } } -impl TMerkleTreeNode for DirNode {} - /// Debug is used for verbose multi-line output with println!("{:?}", node) impl fmt::Debug for DirNode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { diff --git a/crates/lib/src/model/merkle_tree/node/file_chunk_node.rs b/crates/lib/src/model/merkle_tree/node/file_chunk_node.rs index 6fd0c6c6c..b0cd92a4b 100644 --- a/crates/lib/src/model/merkle_tree/node/file_chunk_node.rs +++ b/crates/lib/src/model/merkle_tree/node/file_chunk_node.rs @@ -4,8 +4,7 @@ use serde::{Deserialize, Serialize}; -use crate::error::OxenError; -use crate::model::{MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType, TMerkleTreeNode}; +use crate::model::{MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType}; use std::fmt; @@ -17,9 +16,8 @@ pub struct FileChunkNode { } impl FileChunkNode { - pub fn deserialize(data: &[u8]) -> Result { + pub fn deserialize(data: &[u8]) -> Result { rmp_serde::from_slice(data) - .map_err(|e| OxenError::basic_str(format!("Error deserializing file chunk node: {e}"))) } } @@ -43,8 +41,6 @@ impl MerkleTreeNodeIdType for FileChunkNode { } } -impl TMerkleTreeNode for FileChunkNode {} - /// Debug is used for verbose multi-line output with println!("{:?}", node) impl fmt::Debug for FileChunkNode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { diff --git a/crates/lib/src/model/merkle_tree/node/file_node.rs b/crates/lib/src/model/merkle_tree/node/file_node.rs index 1f484f920..5c4aa01ba 100644 --- a/crates/lib/src/model/merkle_tree/node/file_node.rs +++ b/crates/lib/src/model/merkle_tree/node/file_node.rs @@ -7,7 +7,6 @@ use crate::model::merkle_tree::node::file_node_types::{FileChunkType, FileStorag use crate::model::metadata::generic_metadata::GenericMetadata; use crate::model::{ EntryDataType, LocalRepository, MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType, - TMerkleTreeNode, }; use serde::{Deserialize, Serialize}; use std::fmt; @@ -95,7 +94,7 @@ impl FileNode { } } - pub fn deserialize(data: &[u8]) -> Result { + pub fn deserialize(data: &[u8]) -> Result { let file_node: FileNode = match rmp_serde::from_slice(data) { Ok(file_node) => file_node, Err(_) => { @@ -263,8 +262,6 @@ impl Hash for FileNode { } } -impl TMerkleTreeNode for FileNode {} - /// Debug is used for verbose multi-line output with println!("{:?}", node) impl fmt::Debug for FileNode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { diff --git a/crates/lib/src/model/merkle_tree/node/merkle_tree_node.rs b/crates/lib/src/model/merkle_tree/node/merkle_tree_node.rs index d036064a4..57f65a70f 100644 --- a/crates/lib/src/model/merkle_tree/node/merkle_tree_node.rs +++ b/crates/lib/src/model/merkle_tree/node/merkle_tree_node.rs @@ -684,7 +684,10 @@ impl MerkleTreeNode { self.node.clone() } - pub fn deserialize_id(data: &[u8], dtype: MerkleTreeNodeType) -> Result { + pub fn deserialize_id( + data: &[u8], + dtype: MerkleTreeNodeType, + ) -> Result { match dtype { MerkleTreeNodeType::Commit => { CommitNode::deserialize(data).map(|commit| *commit.hash()) diff --git a/crates/lib/src/model/merkle_tree/node/vnode.rs b/crates/lib/src/model/merkle_tree/node/vnode.rs index ca2f12193..300286b08 100644 --- a/crates/lib/src/model/merkle_tree/node/vnode.rs +++ b/crates/lib/src/model/merkle_tree/node/vnode.rs @@ -9,9 +9,7 @@ use crate::core::v_latest::model::merkle_tree::node::vnode::VNodeData as VNodeIm use crate::core::v_old::v0_19_0::model::merkle_tree::node::vnode::VNodeData as VNodeImplV0_19_0; use crate::core::versions::MinOxenVersion; use crate::error::OxenError; -use crate::model::{ - LocalRepository, MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType, TMerkleTreeNode, -}; +use crate::model::{LocalRepository, MerkleHash, MerkleTreeNodeIdType, MerkleTreeNodeType}; pub trait TVNode { fn node_type(&self) -> &MerkleTreeNodeType; @@ -57,7 +55,7 @@ impl VNode { } } - pub fn deserialize(data: &[u8]) -> Result { + pub fn deserialize(data: &[u8]) -> Result { // In order to support versions that didn't have the enum, // if it fails we will fall back to the old struct, then populate the enum let vnode: VNode = match rmp_serde::from_slice(data) { @@ -153,5 +151,3 @@ impl fmt::Display for VNode { write!(f, "") } } - -impl TMerkleTreeNode for VNode {} diff --git a/crates/lib/src/model/merkle_tree/node_type.rs b/crates/lib/src/model/merkle_tree/node_type.rs index a2269b0af..6dc35163a 100644 --- a/crates/lib/src/model/merkle_tree/node_type.rs +++ b/crates/lib/src/model/merkle_tree/node_type.rs @@ -14,13 +14,28 @@ use std::{ fmt::{Debug, Display}, hash::{Hash, Hasher}, }; +use thiserror::Error; +/// The type of node that we are storing in the merkle tree. #[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize, Copy)] pub enum MerkleTreeNodeType { + /// A commit in the repository. From this commit node, we can traverse the tree to recover the + /// entire repository state at the commit. Commit, + + /// A file in the repository. File, + + /// A directory in the repository. Dir, + + /// Directories can be very large, so we split their contents into (potentially) multiple + /// virtual directory nodes: a `VNode`. A `VNode` is like a directory: it contains some + /// number of files and directories. It is always a subset of a real directory. If a + /// directory has multiple `VNode` children, these children form a strict paritioning. VNode, + + /// A chunk of a file in the repository. (TODO: unused at this point, but planned) FileChunk, } @@ -30,7 +45,40 @@ impl Hash for MerkleTreeNodeType { } } +#[derive(Debug, Error)] +#[error("Deserialization failure: Invalid MerkleTreeNodeType: {0}")] +/// Failed to deserialize a `MerkleTreeNodeType` due to unknown `u8` value. +pub struct InvalidMerkleTreeNodeType(u8); + impl MerkleTreeNodeType { + /* + ********************************************************************************************** + + NOTE: The mapping of u8 -> node type is important! DO NOT MODIFY THIS!!!! + + It is stored on disk / in a database as a `u8` and this value is used to deserialize nodes into + their correct type. Changing this will **BREAK REPOSITORIES**. + + IF it is absolutely necessary to change it, then a migration **MUST** be implemented and + operations **MUST** be gated behind a breaking version update to force this migration on + outdated repositories. + + That being said, it's difficult to imagine any valid reason why this mapping would need to be + changed for existing `MerkleTreeNode` variants. + + **NEW** variants can be added without a migration. New variants must have an incremented `u8` + value to not conflcit with any existing variants. Older repositories will still be able to be + read by newer variants: they will simply not contain nodes of the new variant. + + If there is some **OTHER** backwards-incompatible change with how the repository data is stored + that requires a new variant, then a migration **MUST** be implemented and operations **MUST** + be gated behind a breaking version update to force this migration on outdated repositories. + + ********************************************************************************************** + */ + + /// Serialize the node type into a stable `u8` value. + /// This function is 1:1 with `from_u8`. pub fn to_u8(&self) -> u8 { match self { MerkleTreeNodeType::Commit => 0u8, @@ -41,21 +89,75 @@ impl MerkleTreeNodeType { } } - pub fn from_u8(val: u8) -> MerkleTreeNodeType { + /// Deserialize a `u8` value into a `MerkleTreeNodeType`. + /// This function is 1:1 with `to_u8`: all outputs from `to_u8` result in an `Ok`. + pub fn from_u8(val: u8) -> Result { match val { - 0u8 => MerkleTreeNodeType::Commit, - 1u8 => MerkleTreeNodeType::Dir, - 2u8 => MerkleTreeNodeType::VNode, - 3u8 => MerkleTreeNodeType::File, - 4u8 => MerkleTreeNodeType::FileChunk, - _ => panic!("Invalid MerkleTreeNodeType: {val}"), + 0u8 => Ok(MerkleTreeNodeType::Commit), + 1u8 => Ok(MerkleTreeNodeType::Dir), + 2u8 => Ok(MerkleTreeNodeType::VNode), + 3u8 => Ok(MerkleTreeNodeType::File), + 4u8 => Ok(MerkleTreeNodeType::FileChunk), + _ => Err(InvalidMerkleTreeNodeType(val)), } } } +/// Allows for types to identify themselves as a specific kind of merkle tree node via the +/// `MerkleTreeNodeType` enum. This is critical for defining the node's stable on-disk +/// byte representation. pub trait MerkleTreeNodeIdType { + /// The type of merkle tree node this node can serialize & deserialize into. fn node_type(&self) -> MerkleTreeNodeType; + + /// The stable merkle hash of this node. fn hash(&self) -> MerkleHash; } -pub trait TMerkleTreeNode: MerkleTreeNodeIdType + Serialize + Debug + Display {} +/// Trait for things that are Merkle tree nodes. +/// +/// Critical functionality is to be able to serialize node, compute a stable merkle hash, and +/// identify the node as a `MerkleTreeNodeType`. The Debug & Display constraints are for +/// convenience. +/// +/// Since this trait is used as a parameter for generic functions, it must be object-safe. This +/// means that it cannot extend `Serialize` since that has a `` parameter on the +/// `serialize` method, which causes the type to not be implementable as a vtable lookup. +/// +/// This is why the trait contains a manual serialization (`to_msgpack_bytes`) method. For types +/// that do implement `Serialize`, there's a blanket implementation that delegates the `serialize` +/// call to the `to_msgpack_bytes` method. +pub trait TMerkleTreeNode: MerkleTreeNodeIdType + Debug + Display { + /// Serialize this node to a MsgPack byte array. + fn to_msgpack_bytes(&self) -> Result, rmp_serde::encode::Error>; +} + +/// Blanket implementation for Merkle tree nodes that implement serde's `Serialize` trait. +impl TMerkleTreeNode for T { + /// Uses serde to serialize this node. + fn to_msgpack_bytes(&self) -> Result, rmp_serde::encode::Error> { + let mut buf = Vec::new(); + let x = self.serialize(&mut rmp_serde::Serializer::new(&mut buf)); + x.map(|_| buf) + } +} + +#[cfg(test)] +mod tests { + use crate::model::merkle_tree::node::{CommitNode, DirNode, FileChunkNode, FileNode, VNode}; + + use super::*; + + #[test] + fn test_nodes_implement_trait() { + /// this only exists so we can check that all node types implement `TMerkleTreeNode` + /// it will be a compile-time error if a node type does not implement the trait + fn is_tmerkletreenode(_: T) {} + + is_tmerkletreenode(CommitNode::default()); + is_tmerkletreenode(DirNode::default()); + is_tmerkletreenode(VNode::default()); + is_tmerkletreenode(FileNode::default()); + is_tmerkletreenode(FileChunkNode::default()); + } +} diff --git a/crates/lib/src/repositories/tree.rs b/crates/lib/src/repositories/tree.rs index 6a1e4480a..c0498abd7 100644 --- a/crates/lib/src/repositories/tree.rs +++ b/crates/lib/src/repositories/tree.rs @@ -1077,10 +1077,16 @@ pub fn write_tree(repo: &LocalRepository, node: &MerkleTreeNode) -> Result<(), O Ok(()) } -fn p_write_tree( +/// Write the entire merkle tree, starting from the node (`node_impl`) to the local repository. +/// +/// Recursively writes the node and all its children to disk. To write a full tree, the node +/// (`node_impl`) **MUST** be the root of the tree -- i.e. a `Commit` node. +/// +/// [1] https://github.com/rust-lang/rust/issues/20041) +fn p_write_tree( repo: &LocalRepository, node: &MerkleTreeNode, - node_impl: &impl TMerkleTreeNode, + node_impl: &N, ) -> Result<(), OxenError> { let parent_id = node.parent_id;