From 4fffe77da22f7d735092175b5305929ed8f30d21 Mon Sep 17 00:00:00 2001 From: hammadb Date: Thu, 27 Jun 2024 23:39:08 -0700 Subject: [PATCH 01/10] [BUG][WIP] Ensure arrow sizing is correct / 64 byte aligned --- Cargo.lock | 80 +++--- rust/worker/Cargo.toml | 2 +- .../src/blockstore/arrow/block/delta.rs | 10 +- .../blockstore/arrow/block/delta_storage.rs | 1 + .../src/blockstore/arrow/block/types.rs | 256 ++++++++++++++---- rust/worker/test.arrow | Bin 0 -> 67710 bytes 6 files changed, 264 insertions(+), 85 deletions(-) create mode 100644 rust/worker/test.arrow diff --git a/Cargo.lock b/Cargo.lock index 4e882da95e7..f43f8872fb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,9 +75,9 @@ checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" [[package]] name = "arrow" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa285343fba4d829d49985bdc541e3789cf6000ed0e84be7c039438df4a4e78c" +checksum = "7ae9728f104939be6d8d9b368a354b4929b0569160ea1641f0721b55a861ce38" dependencies = [ "arrow-arith", "arrow-array", @@ -96,9 +96,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "753abd0a5290c1bcade7c6623a556f7d1659c5f4148b140b5b63ce7bd1a45705" +checksum = "a7029a5b3efbeafbf4a12d12dc16b8f9e9bff20a410b8c25c5d28acc089e1043" dependencies = [ "arrow-array", "arrow-buffer", @@ -111,9 +111,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d390feeb7f21b78ec997a4081a025baef1e2e0d6069e181939b61864c9779609" +checksum = "d33238427c60271710695f17742f45b1a5dc5bcfc5c15331c25ddfe7abf70d97" dependencies = [ "ahash", "arrow-buffer", @@ -127,9 +127,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69615b061701bcdffbc62756bc7e85c827d5290b472b580c972ebbbf690f5aa4" +checksum = "fe9b95e825ae838efaf77e366c00d3fc8cca78134c9db497d6bda425f2e7b7c1" dependencies = [ "bytes", "half 2.4.1", @@ -138,27 +138,29 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e448e5dd2f4113bf5b74a1f26531708f5edcacc77335b7066f9398f4bcf4cdef" +checksum = "87cf8385a9d5b5fcde771661dd07652b79b9139fea66193eda6a88664400ccab" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "base64 0.21.7", + "atoi", + "base64 0.22.1", "chrono", "half 2.4.1", "lexical-core", "num", + "ryu", ] [[package]] name = "arrow-csv" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46af72211f0712612f5b18325530b9ad1bfbdc87290d5fbfd32a7da128983781" +checksum = "cea5068bef430a86690059665e40034625ec323ffa4dd21972048eebb0127adc" dependencies = [ "arrow-array", "arrow-buffer", @@ -175,9 +177,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67d644b91a162f3ad3135ce1184d0a31c28b816a581e08f29e8e9277a574c64e" +checksum = "cb29be98f987bcf217b070512bb7afba2f65180858bca462edf4a39d84a23e10" dependencies = [ "arrow-buffer", "arrow-schema", @@ -187,9 +189,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03dea5e79b48de6c2e04f03f62b0afea7105be7b77d134f6c5414868feefb80d" +checksum = "ffc68f6523970aa6f7ce1dc9a33a7d9284cfb9af77d4ad3e617dbe5d79cc6ec8" dependencies = [ "arrow-array", "arrow-buffer", @@ -201,9 +203,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8950719280397a47d37ac01492e3506a8a724b3fb81001900b866637a829ee0f" +checksum = "2041380f94bd6437ab648e6c2085a045e45a0c44f91a1b9a4fe3fed3d379bfb1" dependencies = [ "arrow-array", "arrow-buffer", @@ -221,9 +223,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ed9630979034077982d8e74a942b7ac228f33dd93a93b615b4d02ad60c260be" +checksum = "fcb56ed1547004e12203652f12fe12e824161ff9d1e5cf2a7dc4ff02ba94f413" dependencies = [ "arrow-array", "arrow-buffer", @@ -236,9 +238,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "007035e17ae09c4e8993e4cb8b5b96edf0afb927cd38e2dff27189b274d83dcf" +checksum = "575b42f1fc588f2da6977b94a5ca565459f5ab07b60545e17243fb9a7ed6d43e" dependencies = [ "ahash", "arrow-array", @@ -251,15 +253,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ff3e9c01f7cd169379d269f926892d0e622a704960350d09d331be3ec9e0029" +checksum = "32aae6a60458a2389c0da89c9de0b7932427776127da1a738e2efc21d32f3393" [[package]] name = "arrow-select" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce20973c1912de6514348e064829e50947e35977bb9d7fb637dc99ea9ffd78c" +checksum = "de36abaef8767b4220d7b4a8c2fe5ffc78b47db81b03d77e2136091c3ba39102" dependencies = [ "ahash", "arrow-array", @@ -271,15 +273,16 @@ dependencies = [ [[package]] name = "arrow-string" -version = "50.0.0" +version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00f3b37f2aeece31a2636d1b037dabb69ef590e03bdc7eb68519b51ec86932a7" +checksum = "e435ada8409bcafc910bc3e0077f532a4daa20e99060a496685c0e3e53cc2597" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", + "memchr", "num", "regex", "regex-syntax 0.8.2", @@ -324,6 +327,15 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic" version = "0.6.0" @@ -807,6 +819,12 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64-simd" version = "0.8.0" @@ -1366,9 +1384,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "23.5.26" +version = "24.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" dependencies = [ "bitflags 1.3.2", "rustc_version", diff --git a/rust/worker/Cargo.toml b/rust/worker/Cargo.toml index f1862a1e7ff..4dc60a5ed7c 100644 --- a/rust/worker/Cargo.toml +++ b/rust/worker/Cargo.toml @@ -41,7 +41,7 @@ parking_lot = "0.12.1" aws-sdk-s3 = "1.5.0" aws-smithy-types = "1.1.0" aws-config = { version = "1.1.2", features = ["behavior-version-latest"] } -arrow = "50.0.0" +arrow = "52.0.0" roaring = "0.10.3" tantivy = "0.21.1" tracing = "0.1" diff --git a/rust/worker/src/blockstore/arrow/block/delta.rs b/rust/worker/src/blockstore/arrow/block/delta.rs index 84ab5d9e43f..42080ca7747 100644 --- a/rust/worker/src/blockstore/arrow/block/delta.rs +++ b/rust/worker/src/blockstore/arrow/block/delta.rs @@ -228,6 +228,7 @@ mod test { #[tokio::test] async fn test_sizing_string_val() { let tmp_dir = tempfile::tempdir().unwrap(); + let path = tmp_dir.path().to_str().unwrap(); let storage = Storage::Local(LocalStorage::new(tmp_dir.path().to_str().unwrap())); let block_manager = BlockManager::new(storage); let delta = block_manager.create::<&str, &str>(); @@ -243,6 +244,9 @@ mod test { let size = delta.get_size::<&str, &str>(); block_manager.commit::<&str, &str>(&delta); let block = block_manager.get(&delta_id).await.unwrap(); + // TODO: uncomment this when sizing is fixed + println!("==== COMPUTING DUMPED BLOCK SIZE ==== "); + let dumped_block_size = block.get_size(); assert_eq!(size, block.get_size()); for i in 0..n { let key = format!("key{}", i); @@ -255,7 +259,11 @@ mod test { let loaded = Block::load("test.arrow", delta_id).unwrap(); assert_eq!(loaded.id, delta_id); // TODO: make this sizing work - // assert_eq!(block.get_size(), loaded.get_size()); + println!("==== COMPUTING ORIGINAL BLOCK SIZE ==== "); + let original_size = block.get_size(); + println!("==== COMPUTING LOADED BLOCK SIZE ==== "); + let loaded_size = loaded.get_size(); + assert_eq!(block.get_size(), loaded.get_size()); for i in 0..n { let key = format!("key{}", i); let read = loaded.get::<&str, &str>("prefix", &key); diff --git a/rust/worker/src/blockstore/arrow/block/delta_storage.rs b/rust/worker/src/blockstore/arrow/block/delta_storage.rs index 7d5e77e1f04..965e3681af0 100644 --- a/rust/worker/src/blockstore/arrow/block/delta_storage.rs +++ b/rust/worker/src/blockstore/arrow/block/delta_storage.rs @@ -9,6 +9,7 @@ use arrow::{ Int32Array, Int32Builder, ListBuilder, RecordBatch, StringBuilder, StructArray, UInt32Builder, }, + buffer::{Buffer, MutableBuffer}, datatypes::{Field, Fields}, util::bit_util, }; diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index 702f1525220..ec95bde83a2 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -1,10 +1,20 @@ +use std::io::{Read, Seek, SeekFrom}; +use std::sync::Arc; + use super::delta::BlockDelta; use crate::blockstore::arrow::types::{ArrowReadableKey, ArrowReadableValue}; use crate::errors::ChromaError; +use arrow::buffer::Buffer; +use arrow::ipc::convert::fb_to_schema; +use arrow::ipc::reader::{read_footer_length, FileDecoder}; +use arrow::ipc::{root_as_footer, root_as_message, MessageHeader}; +use arrow::util::bit_util; use arrow::{ array::{Array, StringArray}, record_batch::RecordBatch, }; +use rand::rngs::mock; +use tantivy::HasLen; use uuid::Uuid; /// A block in a blockfile. A block is a sorted collection of data that is immutable once it has been committed. @@ -206,10 +216,55 @@ impl Block { /// Returns the size of the block in bytes pub(crate) fn get_size(&self) -> usize { let mut total_size = 0; + let mut mock_size = 0; + let mut column_index = 0; for column in self.data.columns() { + let column_size = column.get_buffer_memory_size(); + println!( + "[ORIGINAL] Column {} column size: {}", + column_index, column_size + ); total_size += column.get_buffer_memory_size(); + let array_data = column.to_data(); + let array_data_size = array_data.get_slice_memory_size().unwrap(); + println!( + "[MAYBE] Column {} array data size: {}", + column_index, array_data_size + ); + let mut buffer_index = 0; + let mut total_for_buffers = 0; + for buffer in array_data.buffers() { + // let slice = buffer.slice_with_length(offset, len); + // let slice_len = slice.as_slice().len(); + // mock_size += slice_len; + // SYSTEM ASSUMPTION: ALL BUFFERS ARE PADDED TO 64 bytes + // We maintain this invariant in two places + // 1. In the to_arrow methods of delta storage, we allocate + // padded buffers + // 2. In block load() we validate that the buffers are of size 64 + // Why do we do this instead of using get_buffer_memory_size() + // or using the buffers capacity? TODO: answer + let size = bit_util::round_upto_multiple_of_64(buffer.len()); + println!( + "[NEW] Column {} buffer {} size: {}", + column_index, buffer_index, size + ); + mock_size += size; + total_for_buffers += size; + buffer_index += 1; + } + println!( + "[NEW] Total for buffers: {} for column {}", + total_for_buffers, column_index + ); + column_index += 1; } - total_size + // total_size + println!( + "Size via total_size: {}, size via mock_size: {}", + total_size, mock_size + ); + return mock_size; } /// Returns the number of items in the block @@ -278,15 +333,108 @@ impl Block { } }; let mut reader = std::io::BufReader::new(file); - let reader = arrow::ipc::reader::FileReader::try_new(&mut reader, None); - let mut reader = match reader { - Ok(reader) => reader, - Err(e) => { - // TODO: Return a proper error - panic!("Error creating reader: {:?}", e) - } + + // Read IPC File - https://docs.rs/arrow-ipc/52.0.0/arrow_ipc/reader/struct.FileDecoder.html + // Space for ARROW_MAGIC (6 bytes) and length (4 bytes) + let mut buffer = [0; 10]; + reader + .seek(SeekFrom::End(-10)) + .expect("TODO: change to error"); + reader + .read_exact(&mut buffer) + .expect("TODO: change to error"); + + let footer_len = read_footer_length(buffer).expect("TODO: change to error"); + + // read footer + let mut footer_data = vec![0; footer_len]; + reader + .seek(SeekFrom::End(-10 - footer_len as i64)) + .expect("TODO: change to error"); + reader + .read_exact(&mut footer_data) + .expect("TODO: change to error"); + + let footer = root_as_footer(&footer_data).expect("TODO: change to error"); + let schema = footer.schema().expect("TODO: change to error"); + let arrow_schema = Arc::new(fb_to_schema(schema)); + // Create a file decoder, requiring alignment of 64 bytes + let decoder = FileDecoder::new(arrow_schema, footer.version()); + let decoder = decoder.with_require_alignment(true); + + // Read the record batch + let record_batch_definitions = footer.recordBatches().expect("TODO: change to error"); + let record_batch = record_batch_definitions.get(0); + let block_len = record_batch.bodyLength() as usize + record_batch.metaDataLength() as usize; + + println!("BODY LENGTH: {}", record_batch.bodyLength()); + println!("RECORD BATCH OFFSET: {}", record_batch.offset()); + let mut file_buffer = vec![0; block_len]; + reader + .seek(SeekFrom::Start(record_batch.offset() as u64)) + .expect("TODO: change to error"); + reader + .read_exact(&mut file_buffer) + .expect("TODO: change to error"); + + let buffer = Buffer::from(file_buffer); + + // This is borrowed from arrow-ipc parse_message.rs + // https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format + let buf = match buffer[..4] == [0xff; 4] { + true => &buffer[8..], + false => &buffer[4..], }; - return Self::load_with_reader(reader, id); + let message = root_as_message(buf).expect("TODO: change to error"); + match message.header_type() { + MessageHeader::RecordBatch => { + let rb = message + .header_as_record_batch() + .expect("TODO: change to error"); + // Loop over offsets and ensure the lengths of each buffer are 64 byte aligned + let blocks = rb.buffers().expect("TODO: change to error"); + let mut prev_offset = blocks.get(0).offset(); + for block in blocks.iter().skip(1) { + let curr_offset = block.offset(); + let len = curr_offset - prev_offset; + println!("CURRENT OFFSET: {}", curr_offset); + println!("BUFFER LENGTH IS: {}", len); + let remainder = len % 64; + if remainder != 0 { + panic!("Buffer length is not 64 byte aligned"); + } + prev_offset = curr_offset; + } + // We have to add the last buffer length based on the body length + let last_buffer_len = record_batch.bodyLength() as usize - prev_offset as usize; + let remainder = last_buffer_len % 64; + println!("LAST BUFFER LENGTH IS: {}", last_buffer_len); + if remainder != 0 { + panic!("Buffer length is not 64 byte aligned"); + } + } + _ => { + panic!("Unexpected message type"); + } + } + + let read = decoder + .read_record_batch(record_batch, &buffer) + .unwrap() + .unwrap(); + Ok(Self::from_record_batch(id, read)) + + // // TODO: require_alignment + // let reader = arrow::ipc::reader::FileReader::try_new(&mut reader, None); + // let reader = match reader { + // Ok(reader) => reader, + // Err(e) => { + // // TODO: Return a proper error + // panic!("Error creating reader: {:?}", e) + // } + // }; + + // return Self::load_with_reader(reader, id); } fn load_with_reader( @@ -299,7 +447,11 @@ impl Block { let batch = reader.next().unwrap(); // TODO: how to store / hydrate id? match batch { - Ok(batch) => Ok(Self::from_record_batch(id, batch)), + Ok(batch) => { + println!("Loaded batch with {} rows", batch.num_rows()); + println!("Batch size is {}", batch.get_array_memory_size()); + Ok(Self::from_record_batch(id, batch)) + } Err(e) => { panic!("Error reading batch: {:?}", e); } @@ -321,52 +473,52 @@ impl Block { // } // } -// // #[cfg(test)] -// // mod test { -// // use super::*; -// // use crate::blockstore::types::Key; -// // use arrow::array::Int32Array; +// #[cfg(test)] +// mod test { +// use super::*; +// use crate::blockstore::types::Key; +// use arrow::array::Int32Array; -// // #[test] -// // fn test_block_builder_can_add() { -// // let num_entries = 1000; - -// // let mut keys = Vec::new(); -// // let mut key_bytes = 0; -// // for i in 0..num_entries { -// // keys.push(Key::String(format!("{:04}", i))); -// // key_bytes += i.to_string().len(); -// // } +// #[test] +// fn test_block_builder_can_add() { +// let num_entries = 1000; -// // let prefix = "key".to_string(); -// // let prefix_bytes = prefix.len() * num_entries; -// // let mut block_builder = BlockDataBuilder::new( -// // KeyType::String, -// // ValueType::Int32Array, -// // Some(BlockBuilderOptions::new( -// // num_entries, -// // prefix_bytes, -// // key_bytes, -// // num_entries, // 2 int32s per entry -// // num_entries * 2 * 4, // 2 int32s per entry -// // )), -// // ); +// let mut keys = Vec::new(); +// let mut key_bytes = 0; +// for i in 0..num_entries { +// keys.push(Key::String(format!("{:04}", i))); +// key_bytes += i.to_string().len(); +// } -// // for i in 0..num_entries { -// // block_builder -// // .add( -// // BlockfileKey::new(prefix.clone(), keys[i].clone()), -// // Value::Int32ArrayValue(Int32Array::from(vec![i as i32, (i + 1) as i32])), -// // ) -// // .unwrap(); -// // } +// let prefix = "key".to_string(); +// let prefix_bytes = prefix.len() * num_entries; +// let mut block_builder = BlockDataBuilder::new( +// KeyType::String, +// ValueType::Int32Array, +// Some(BlockBuilderOptions::new( +// num_entries, +// prefix_bytes, +// key_bytes, +// num_entries, // 2 int32s per entry +// num_entries * 2 * 4, // 2 int32s per entry +// )), +// ); -// // // Basic sanity check -// // let block_data = block_builder.build().unwrap(); -// // assert_eq!(block_data.data.column(0).len(), num_entries); -// // assert_eq!(block_data.data.column(1).len(), num_entries); -// // assert_eq!(block_data.data.column(2).len(), num_entries); -// // } +// for i in 0..num_entries { +// block_builder +// .add( +// BlockfileKey::new(prefix.clone(), keys[i].clone()), +// Value::Int32ArrayValue(Int32Array::from(vec![i as i32, (i + 1) as i32])), +// ) +// .unwrap(); +// } + +// // Basic sanity check +// let block_data = block_builder.build().unwrap(); +// assert_eq!(block_data.data.column(0).len(), num_entries); +// assert_eq!(block_data.data.column(1).len(), num_entries); +// assert_eq!(block_data.data.column(2).len(), num_entries); +// } // // #[test] // // fn test_out_of_order_key_fails() { diff --git a/rust/worker/test.arrow b/rust/worker/test.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0181f2bf47b4bc7b2206f76b4681267ab4c089ad GIT binary patch literal 67710 zcmeF)akvd-|L^ggP(4W=BuSEx5PQv(#$Jw6eWv@Lmd+s$epL@^VU(bA>D_gW^ zc5@=X^ zK1%Ys2>4UTf6ST6@W z&*`Pl+XR05DgV>B^7>-dGi-~ZTa5`pT9u{H=R$vX*V>7m6H}>Hmj^H>7kg3yA z6eUpx-6#$qC- zU^-@D9u{H=R$vX*V>7m6H}>Hmj^H>7aF042MNtxEP#%?09RXt0MFTWObF@NRbVOJ5 zKp*tSAPmI_jK+9O!c@$_Y|O_ZEX7K!#RhD_4(!2x9KunYKtX=k_!o+y6w0Cks-Ol! z)IvQpL=&_?YqUcrbVD!n#Q+S(FpR_)Ou%GJ!%WP<0xZUItin2M!Zz&0UL3$-90NZr z4Lq$|YE+W)MeKbN-v_u$~!3wOw zdThpa?8ZJE#1R}v0e)aP9Ys+RWl$cKQ5^wd)I|d{Msu`6TXaNM^gti<#~=*F2#m&f zOu|&mz--LNA}qy9ti=Xw!4B-fejLJ4oIt@dx&A1IQYecGsDc^@Q496Z5KYhmt4KpzZ3$Pf=u?p+33EQv}dvO4VaSZOpUl>JD9HmhX6;Tzr zh)^5#(Fje^5^c}{ozWe=(GLSL1j8{3V=)m^Fdefn4-2sbE3gLZu^HR38~bn&M{pbk z&f@x`C`zIX%A+!>BS4J0Xn@9Oj#g-kj_8UW=!5jH6jK?HQ#SF~Gd@RCJti)Puz!vPl9_+^<9K{I~ z{5RJh#ZU@mQ2|v@10iam9vY$vTA(%Bp%c2H7y4oV24fgTVhko=GNxfB=3oI9V>wn~ z9X4Sbc499M;4qHC{m@?+MNk~2Q4SSR6}gB|8}-o$P0>oy2J5jI+p!z_a1cju90f{n{ZSMpQ3mBv8PyRWMqM;OV>CxAv_(gB zMGy2re+T=#73Dh#?q`Q5cJfn1bn;g?U(rC0Kzq zSdY!vj@{UYgE)fYC{T*)kD@4vGANJAsEzY@P}qd8ikEjpqrdY}*bV-SX71V&># zCSfXOU^eDs5td>l)?x#;UCg%Ax|Qpaw$JLOnD@6SP2Uv_mI!Lof8j01U=3jKmmB zz+_CrOw7RoEXH!I!a8iiHtfV+9Kc~5L(chJe-uG+ltwvJL{;P>LT%JXBQ!-zv_S`S zMtAf^KMceW496&p#Y9ZObj-p$EW{G5z#6Q_W^Bi9?88AE!EqEQ%k@W5ltdYnM`ct; zfEabr0FBWctjJ6*Dj!^RWm^u@Y;s0b8&Gd$1pea1SMJ^)LMtw9wQ?x`IbU0~EWrw_!Fp`QcI?JJ9K;bEM*(Wdr=uuJq72HTGO8m$jJj}jzVp%{VD7>`MqiW!)V`B;RdSc$dRfGya8J=l*!IEoV}Sf1;TVkm{OsDLV{ zfe^J&4-L@-Ezlb6&>oy z2J5jI+p!z_a1cju90e+H{ZSMpQ3mBv8PyRWMqM;OV>CxAv_(gBMGy2re+HQ~ z3Ej{OeK7!oF$^Oy1`{wD(=ZcrumFp(9ILPno3IT#u@?t$7{}lW{e@8k#ZemNP!Uy; ziwLz*AC1ryEzt%Y&>7v)8~rd4LoghpFcuRr1=BGL^RN(0umWqa9-FZpyRi=kaRkRv zpc2;~MNtxEP#%?09RXt0MFTWObF@NRbVOJ5Kp*tSAPmI_jK+9O!c@$_Y|O_ZEX7K! z#RhD_4(!2x9KunYK*7pfe-uM0ltl$pK@Ehcg?ea+CTM}yXopVdhF<840T_&77>O~M zfXSGKnV5qGSd8UZg>~43ZP-dGi;0+m>6nFiScoN9fi+l<&Df6J*oT8Sg5xMqh3k)^D2XyCkIJZy z05R&K0UDz@TA?jEqAPl!5Bg&ehGGOpV>~8dDrR6d=3^0-VkOpM1GZoX_Fz8_;V4d^ z;H6xD6hkSLMFmtr4TPwLdT59yXo1#fhfe5*Ug(Pf7>r>Ui7}Xf$(V+jn1cmajOAE` zb=ZV$*onP3fWtTj_m6yqQ3Sxl~El5V$?+gG)8l@LR)l1 zSM)$1^v56!#R!bXcuc}n%)o5S$097nO02~OY{3rf!G0XVQJg@*>Rf*mLn)L+1yn%| zgs6pjXox0gf!1h;PUwbS=!*dujA0mwF_?hKn1-2{g9TWOT=#73Dh#?q`Q5cJfn1bn;g?U(rC0Kzq zSdY!vj@{UYgE)fYC{TmzkD@4vGANJAsEzY@P}qd8ikEjpqrdY}*bV-SX71V&># zCSfXOU^eDs5td>l)?x#;!2b6PoXP*zAN_AX^8e011Nmnl{|w}xf&4R&e+Kf;K>iuX zKLhz^ApZ>HpMm@{kbeg9&p`ee$Ug)5XCVI!iuXKLhz^ApZ>HpMm@{@c;G<{O|rp z@&AAJ-g8I&UuUKuFHgr=pmhXY9J7yQ=I|I%PEACpg9f+`EpQ9kqciS7A3TJ?coL&9 z4wLX2-ozZdk0tmNYq1gAunYU}8~(%z6gq|HLQn#(*WN{_f|`g>2Uo%M*}DO)(H@;~ z4_uGEhcFmV;u(y^OK|=5-ozZdk0tmNuD9MsY{gFegoF4KuCHF90=y@bKp9+wOW=Cy zrBEAJp)qcN>!;TqopBHDLw`I9*GumijKxcsiZ|i<=)I4RumYcB1Gd8T(EAAo@h488 z(5XD@gJL)ru76%7R7VoEa3x&tyk=;Hwzvb`a4%fnyhkt;Ph$*Tg!RmOo7eO4A(mq` zT)(_;umeBh01o4CxL$c@pcu|YIaGq{lb1v-)WbDsikski#-T%!S%)a1&8rB3KZf!!S%#D7v)e9)es2!H0r?hxVsK5a0@!13tWG@`!N8I;Yp0bIJn++uVE(UVj(_;>udJ~ zHewrgVIO{j>uGlqh506R7E0oLTm;w8t|lVXK?7U|*URn}bU+u}gFbi&u8-Z57=>|| zgxBDD*v-ZJSb|Tn78~LE*X_bS{Dz}AiGpV^_waAH-gOtEGA=`i%i;RgU5n;uh1<{x zcf<9pdk};01V-X{SiibAcs(2MVKF{|>s9v^w%~j0#R2>Q*Qf5(fAOC1Z=}WF?bPE@CIC8y7#acpI{Bv;~Ti1bU)$%{=jjZ zdM57)|3+z)LnTy460R5Bm1u-!XocHgedzAxbw50Up?Dgu2i=R9f;TW5^YI~E|GCvz zk8iL8Kf?8%`vb>uN)hHBisBrU#l>)a=Mso<1sdXd+z8im?sjxVPxQsZaQ)_giIG%;;EB7*{;VsO=2XH;*KEoH-gm1AMKg0Et`wKZmc~2;cb8tRfFS$#R zi!|!uYPddfH=+%0M;G+K{ct_x9>Z`vi}83Fu7BKHn2Uw@7%TAwT<^GVu^T_*5dOjm z6g)d8=S;Z1ap$2tE2dhT$1lueeuvJp=Dx0X~B3 z6ZbhbU@LZFFMfsV5%&)Y(tGVp6vugR{oyV_4TQKH^>Hm+Z@8P$4tJtE?t}G(dxF;^ z@jNEtRk)sT?_dEw!V0XxS8)B{zQyD2~#&5S4KmTpzg0Q6JZ$Ic|pa zfV-R5z40Ih;R(3@Z_i^QUd0T|#(QwR-#)<_e1$Fe9(;To2dp?N)TeUFd}e;Cj71juChcFJLlWhwJn9E*4=KR^dyy9&g{_2kgi1 zIEGXB;n($dI~%2N0V<*z5^%lUu0TUvkCwO^Y8(d;xo9uZC_(Me!zbGj-xn%Lg(;vFiM~dE@7;d+iyF#Y>oq8F&Y- z@7hOLfzPo4Tj6@H{e*+~1An6cz0A%)F`Nt6Z>TDu8taR<8LUbsGM zk6 z*n?ki2uE=eu3y?&D2ek?0hhw{N=u_I8sIv#z%6in(z>7r?#BQ;2J4YFj@Og$8fIcH zTz|BWu@YZk6SiR&TyM1Ba1Tu-#?&;qxh1G>Qap*_Uw z!FUp*Fb=L4+H07JxmbuL_!O=W+D2@{F6_f^a6QmYqVNUGJ(NHhTm;wutR^DVK?7U| z>wVUq*PU?>`rskBzGqKj6vkl^rs7Swo@ei42|mSIY=rA~whQ~<`knobzi<+T=p}Y0 z{*7~S0WQX+sEH7@P!A2!1TD}S?a&F`&@h%o( z307bY)?+iaV>kBUAdcW~oKl{jQ*jnb;5=N2N~nee(x{F4XoRL{i8knf&ghQb=!XG# z6i?u3Jck$XGG4=*cn9y{LoCHgti=Xw!4B-fejLJ4oWQ9Uac*%o&cXS(2$$e8B$2@t zXn@9Oj#g-kj_8UW=!53w!u1;5=N2N~nee z(x`*0&yo6Wr2HwWI z_y8Z{Q+$r~*o^JijeR(XBRGx%75NztMNtCh;X+hGH6)Nm9bAQL(G34d547jm{zbfk z*YOtS;(dICPp}$aViUGuC-&k14&xYdDsgU61jTVKF2KdO6g82;<+u{p;CkGM)@X-L z=!RbCivbvnVHk-qaDC8T!mD@#Z{uBjfRFJhKF3%12HUY4`*09Pa2y3Hb8b--B~b`dIJc;Y6fVb=xCYnbM%;q-=!93fH0;Zo;j&9bM2Jz0nT?F$BXg3S%)5 zQ!pKGVJ_asNB9J*@g=^-x7dZBZ~%vK3^|vQTNFWYltwvJL{-#83YX(bT!ZUzBW^)^ z+=;vKpY%j~hwb0PhggQs@C7zvE565%_yvb>6em!yD(4o(Pzq&H0aZ`~Nn~&ZuEuq^ z0XO3|+=08$6MfJhgD?~$FdE}A2~#lxv*G%py^oLZ30C7ve2s6h3qRpk{DEW0sm8fQ z5fn#hltV>SMJ^&-jw^8uuE&kI1?_Ps?#8`%00S@>!!QzKFaeV>4KpzZ3-BS9;WK=J zjo6Cs@gsi0@AwPzw3g6&6{D7bF8~(&UC~z6yzfcq6nFiScoN9fi+l<&G;6(@DqN;ANU)m1pMBBvrq!(p&TlrDsmB_ zHtM4hnxZA_{~wU=g8!{Q{C_s)BH?6Tn``Vi_vN*^M9Nb5ta51H%- zeYM|l!hfcHCA6=E_Lb1S653Zn`$}kE3GGW?7U+Ei+E?HgYF~l&6=+|9_7!Mff%X+> zU)nyYeI>Q8r1q8c>$I<=_LbDWlG;~N`$}qG+A!3V(lx|zGCew*1lrxE7rbZ?JL&4 zV*lmZm;YP)%4lC1?JJ{wWwfu1_Lb4TGTK*0`^seZlKo;OHK93>$o0#zJ4`72iQMcC z6S>(PCUUboOyp*Fn8?lUFp;Yr>Z|=O75+p*`%?H53V%Z3PbmBeg+HP2Clr4FWeR^n z;ZG?1i6DER357qQ@Fx`hgu5X^Lw_1+3*Jnf1vOO3V)#R2MT|n@N2RJ3V)#R2MT|n z@COQipzsF@f1vOO3V)#R`#}}{K;aJ*{y^am6#hWr4;21D;SUu4K;ifAT;UHC{y^am z6#hWr4;21D;SUu4K;aJ*et-B1f1vOO3V)#R2MT|n@COQipzsF@f1vRDbExnK3V)#R z2MT|n@COQipzsF@f1vOO3ctS)3V)#R2MT|n@COQipzsF@f1vOO3V)#R`>UDBhCfjF z1BE|O_ydJMQ1}CdKT!Asg+EaEvk_7vk_vxP;ZG|3NrgYD@Fx}iq{5$6_>&5MQsGyu zk_vxP;ZG|3NrgYD@Fx}iq{5$6_>&5MQsGxNlL~)Q;ZG|3NrgG793~Z^q^^Ea7cQxD zp4915z?1$^{k!x7*t+a@{-na6RQQt$e^TL3D*T!yNrgYD@Fx}iq{5$6_>&5MQsGZ3 z{7HpBsqp*J6#k^bpH%ph3V%}JPb&OLg+HnACl&sr!tdX*!k<+5lL~)Q;ZG|3NrgYD z@Fx}iq{5$6`2BG!{7HpBsqiNi{-na6RQQt$e^TL3D*Q=>-=92%KdJC175=2cpH%ph z3V%}JPb&OLg+HnA`zxUECl&sr!k<+5lL~)Q;ZG|3NrgYD@Fx}i?Df`F3>E%R;SUx5 zP~i_1{!rl$75-4+4;B7U;a3zwg+EmILxn$7_(O$1RQN-MKUDZbg+EmIm8VeQ4;B7U z;SUx5P~i_1{!rl$75-4+4;6leGF13Og+EmILxn$7rbERsRFXo4Ak_5>b?TMu(4RJc z#Qx3tacq<9k3FHnA1eHz!XGO9njWFTA1eHz!XGO9p~4?3{Gq}hD*U0sA1eHQ7==Gn z_(O$1RQN-MKUDZbg+EmILxn$7`2BlT_(O$1RQN-MKUDZbg+EmILxn$7_(O%?AF#q7 zD*U0sA1eHz!XGO9p~4?3{Gq}hD*XPeDg2?rA1eHz!XGO9p~4?3{Gq}hD*U0spS?Ib z^C^WtrSPW|{*=O>QutE}e@fv`Df}sgKc(>N@}(61l)|4<_)`jhO5sl_{3(S$rSPW| z{*=P645SqPl)|4<_)`jhO5sl_{3(S$rSPW|{*=P6Af*)kl)|4<_)`jhO5sl_{3(S$ zrSPW|{*=P6G^P~(l)|4<_)`jhO5sl_+9{Wg_JI%BAxP=z@I&T;Ql@P;cUz7 z&-^KcKc(=e6n@Q%l)|4<_)`jhO5sl_{3(S$rSPW|{*=O>QuzJ;75QutE}e@fv`Df}sgKYJ#1wjzZ;QurfVM+$$W@cSJr{E@;RDg2SbA1VBi!XGL8k-{G-{E@=% z-<`rADg2SbA1VBi!XGL8k-{G-{E@;RDg4>P))A%^{Usg+HzErxpIR!msG075=ospH}$O3V&MRPb>Usg+HzErxpIR!ms?L75=ospH}$O z3V&MRPb>Usg+HyJrt5%|IF>$26Jw8F2MkXHE93V&MR zPb>Usg+HzErxpIR!k&tnkMQf2{Dw3V*Ed#|nR}@GE$+!XGRAvBDoK{IS9xEBvv-A1nN^ z@*XRyN^k6A<`d#CuD>q+l=`#b55&I*`|@m&Q&S*T_+y1XR`_FuKUVl-g+EsKV}(Cf z_+y1XyF=|Mqwr@G{*1z(QTQ_oe@5ZYDEt|PKcnzx6n?#(jKZH$_%jNBM&ZvW{27Hm zqwr@G{*1z(QTTOC8HGQi@MjeMjKZH$_%jNBM&ZvW{27Hmqwwp*WEB34!khQAXjlk5dv@^fN6=GG&S0!JCeAV$)##a?TI}=pH zR|#Jgd=>CjzgPKQ)q551Rl8T|K0AkkY?@WKSJ_@wdll_fvscMp6?+xzRj<#^z91W6 z73)>2SE*i=dKKzbr&pO?ReBZav-38{=2jJY73fu;S9xC5c@^hXn^$RGmHF(953*rZ zSzc9n73EcvS4mzKc@^YUk5@T9`@JH_CRDX}mEu*2S0P??c$MK*g;xgXz?tBS57x@zbup{s(f0($oQcQTts zRnJvCSM6M-b5+h&I9J_VWph={v)}oX*)*zTu8O$|=Bk&gT&`-lish=6t5lx7XC$*} zRGnO9a#hJyBv*}GC302BRUlV=JbSlEX49zRxN74njjJ-Q!no?R~1}EaMi$70#^lG1#s2BRsL4>J9}qT z?OUa9RlZgDR^3}=Z&kfj^j6JVCGYG#RRwR=yH)O1wOhq*)w)&cR;62oZq>Q7cVkt# zRpeHUTP1E)xK-d*eOu*iRku~#&fdpW+75L%dQaN1U+W5|Dr&2yt&+AX+S#7~RL@p9 zTh(k8vsKGhDO;6n6|z;wRvA0{vqP9oqiWbHVXK0z0=DYcDqpL5t>U$6*V&&s!fYB< zxK`a-WouQfRkT*kS|w{$tW~hi{=5=q)2M2-iq)!Bt5mH@wF=d$Q>#p^Ds}cJoiLk5 zm8eyrR)JdeX_co{omO#LwP}^6-3iW4Xw{`vmR40-MQPQfRgzXkS_Nsj@pH+QU@maNJm7Z03R^d7O^PI}g zsyeIateUe*&Z;=8;H-MH%FU`aXMYk@ty!gJRhm_3R-IX8W>uM0WLAw?CFbnUkSZ{z zbUt)YdS}|b-^LY0RasVHS#{;?PoJtPtEjA+vP#OTD662Xda}yNswS(Loc*~ql})26 z$tonPj;u1Ws>muLtA?x+vMR{gpLkQ*G^%{8>amK)svWCztje(p$Eq8vY@GdBIh9SL zYQ`!Vt75EzvFgPt7pq#VVzFw)?#yK8qYA~U6RS+DDzS>hsu8P1tO~IT#HtTxe>zun zSjA!0hE*C?Wmtt_)rD0SR#jL<;q1@#DhaD1tb(xW!72x<8mwZlYQZW6s}h|3jX-r^ zm4Q_SRuNb=V3mMX0agK6^~B7*?W(k^ z%B~8#s_Uw(tE#Swx~l1_q?`SnDaxi%^<0&6Rn1i~SG8P~a#hJyAy;+W>~C99HjOIc zs)nl)t}3`H;HrMB@~x`3D&D$7%1$R$ZdJHd-Bx8=Rc%$YRn1lf!38&x${#aPv1Rf<(5R)tvAVO54z6;?&q?C-HE!KwnQ z0<7w8vQWZ;8E6r9VR4G-VRCQ97 zNmV6PkyJHOl}J?~Re>~H&ro?()ln5kRU1`lRFzQ`MpYM8SyWZgY$ZfBQI$kh5miA{ z^-z^VRSi`!RJBl*LbEj#6+%@9RT)%OP!&N{162uB6;KsGRsYOZUsU~6@l&-=l|EJZ zRN+%~PnA7Y^;FR_TenfkGu9>5b0n5kN( zN|~x;s*tHVX0|$|DyE8aO zshXuqma150D`Kiws$8jRrHYlRRjO2}N~H>ws#B^=nXRR%NU0j7N|dTlsz9mwq{@@3 zPO3Pm+GMs0r^=)Xld4OqEUBubijt~HswAn3qzaPR`kl&=sz$09sam8;k*Y+h5UDz( z%8;r;W-EWHL8=6)3Zx2>{VC2xzyHLo16M_n0$}vSs^TUctvkxB+c&2fE{aJdB}u2IKJxW?(KB z;S;RI*Z2;5@f(ie)KeG}=b#)eK`t`560WD^jcAKI(E|@)5L_?I(RdNBVkX|jVz?fb z>+lU+|H_|n82`Zat}KdkQ685fz~yLw>(L6XUu75c!h;x$r!WStPvtbcjRp7^pJ6>* zf6CqX1xIiar=QC6YAAz?Q4JyLpdnm8%GS6Yt{3Hf7=Xv&`cRI=%b1RLun^0z1{<** z)_d{@uTSFif{ckWxEQYIWD0fR`b{=RYupakYw|t}z~dN+ahQxZFb4~<3~R6v+wmg~ z;xD+ql4qhM%Ayi#AdPys7AT_+<~8P2(E8r zK|K7jjPcVH=_f(;$HNJ>j60e&%^b9d>ym#K9=G$e2Fb^eINJZ51hbhr}Hc+ z&O-%MLlU)dHJajPxIT|}p*LKQ$02wc&%^b1d>ymldOI$~YHYx_*n?m3CvwiQ*gZ(U&2((#JgCGPq7ZJ=i*NMjKlZ`h507ydM%!d^0*WM zE=L2n9*eEe9~9KE^7n$5!mdFF1meIQ=ZX7oZF-Ms%)fGKzr^RNh?U@gAJ4*Uey3-NEbK8RH0q z;x=?fPxQkhcmkvF0;b?i%)^IRfiJKb-{U77!ryRx4$nd20^ z507Fvo`dUY_!{2Ae7JswEAb^h^1IDkKqQ=IPwD30^t`V(G;6zanDCTxyd;QA8Y zjXro7L-7p8!Sy4219R{Jmg945g6l)L7r)^z6yS%DGf@(*|6pa*gzG(cB^u*Kw1Mk8 zcsKgM^&A|EXD}YGUoIsL{rrl+^%pFM%BTs~TkuLW#*Ju; zJJACVU=W7kSxmsIn2C3>7@xxR5&Q-_u@A0?;BmPAfkkmH%HvW5xE!u;U=!SgcIbj$ zco42%;8Pd_*DG)uX5l@!K7pTMJ+@*ue!&r(#Od_oD~9t>0o4$q4jQ5vTu;E;(G9)v z5QgAsxL$xSV>;f!LM+1?xE_GtVh?`BQJg}ry?>zu%Ayji?_V8WH$*eIo`1Ka8(hD? z0eBoEF%Bg;<6)*of`;5w5@AUpR$cdjCQRl!fc-R|9F(!?kFETjBcob;tb} zh$k=#FJKDZ#5}kjexG11zQzvxghOz>`wG%4?<|zUg}4N{a6S9#<2tm&ZRm`i=!Zw~ zBu3*!OvPJpJ^DVvO1S=fTd)g1!}aF-2ZiZ_=lb%M#zm-#Bx>VoT#r_0kFK~E{qY!F zAHL@?39bj<+gN~);rj1=g>Co&2k!}Zx4 zf#>0R?7fcJcppo#8XMqx>-~TO_!BwwusZ|A;d<&-#AR^(^y=aoG>7Y@*AaK44<5!) zJcIF=j5ja`A7D8?$0mG-y>NZ=j-dek>CQw+T!6}O{qka5iN?4QZP5v?PhMX{!S%np39k2D7r4H64`MKe z!}YtH2-oXw7T&`We1`Sdirx4bzvDOx(J!tj&P91#iU60R0j|eQXooK71=qW7FkIid zF>pQWror{Adk?Nx-Dhxp>bAo5sQU%3Kix^3UY?&ta2_h48bZ`TLo`Ec+>UOz4-a7o zp2k?bjOlm>3$YAquo2&44}Qf_oN^K03s3@OQ3*AWhU+QU2sfY&?m&0kkAZjsqwoS= z!3@mBB7B0i_!>L#6As~ToLYf<3C=+|T!LI=P#@R9^@O_(ccKR#z#zCjaHHXRz)gkg z|27}4_uC4r#n;$@pKu6&qu|AS|3N8Sh$_g1>+e<{uD4rD+=kBRiGFw#!!a5!Vk+Lk ze0+qJ_!3*N3;S^x|DbS1zW<;!E<#l#Q5#pIDO#aDy5e5+$72`)*N<%yTralSSb&eQ z3SYtXU;6>B_u8Lueb>%_>$z42uHRa9xL#{@;rgsK$1UiHyU_=(x7y=yebvTcGF(5k zIdHwymc#W?+XUA`?MJx&X@9}>PCFB>Z`uW@jGBm153W~Q3%EXMop2BO;t>qPvzUO% zaDCC<#(Qu*(LTi&aQ)D}!;knCf8qoRU&8!EDU?HH)Ifx~Xo#k`3Af=++=Kh^FdoO# zcpfjo^*Wo0d02$ySc47NhTU-e%?{%@POZYRqXf>!#c(~%lDHgK;X2%iHt2|MaDB}B zV=!C~v(cCU*S~BA-ht~~_Ax$#>s$5>zK82s_8b1fDVOs70>$8Zm6b;oEiM2IKHDUdP**kHuJlwb+F1*n=%aB4HT#f5-6I^ewJ8=*CU;u`|^#mJ(iE#bEW?~*(FR2lnD1jv^@1eaDBBlVLNui_0;+uf5Y|DIuj*uJ}yQzBylwZej+07EbWV=xh~ z;!VuO2Uv#H_zGLG3qRv`97BPSpQlh9Wl#ZC5ug_8qcNJ}X0%5a^uz;r1W(`@jKj-# z9kVbWi?IT0u?gF;2M2HjCr~KGIlwu%0F`hVQmBKgaXoHATXaHq^uYiO!3d1OL`=m@ z%*6*-hSm5ATd@m2<9GaxQzOm+ilYoFpeh2?LVYww3$(%QxC{5|K;wW;`Ti%glU+C`B;n< z_#7MYEq=f+_yhl-P%UzZb8rDFqB=snlLKEN`p z#(HeQPVB=W97BQ2$svlP3@V@sY9fs*a1EN_X0%5a^uz;r1Vb?rV=)QSFbngs7%Q+A zoA51uz%Td%|DaH9&H>KB1*n9}5TZ63pb1){Ejpn)`d|Qt;3+(Z7x60I#9VxUWmt`` zuoXM84~K9J1?q4PP#k4Y0aXzogDY_@Zon)JI0hVAT)?qVt zU@s2hPn^VQ^*9GO7Z>6Z)I=Ir;2JbVE3`vr^gv$>#88aHSWLoecnk01LwthIu@T?m z2mFFRa2y4%3e(k`|r%n{+-a@{*&Cik9jX_gWSMxnVYm1p}k1ii^yK2?L}-aGX90_ zFJU7j@|M^L2^%3{BP48ugpH7}5fU~+!bY&&1U5ooBLsP?Y=po@2yBGFMhI+#z(xpc z1pDfwjgYhvk~TszZyOsSX(J?Ugrtp-v=Ndvf_+hFBZM|WXd{F+LYTL)jS$)hp^XsQ z2%(K&+oo)Ul#P(G5mGin%0@`#eU*)nvJp}?Ldr(4O(Gj1vJoO1A+ix78zHh0qP#D+ z5h5EQvJq@e+D1s*2x%K3Z6l;@gtU#2wh_{K`>_$yHi9jPZG_lHh;4+}Mu=^M*hYwL zgxE%i^LA(>>3sBK~+*wm&EsI9UQY--aD)K=LDwi}z; zbOrUB*wm&os4cN$u&GUdP+MikU{jkep?;NpwM}h$h1wb$+@>}iLv58EgH3JvhT1AS z2AkS>2X0fF4x)Zro7!{{wIwz}Xd~Fvrkkj3VFyY5o~JHXVkBAn@ZPeD-S+c23&rw@t$6!;N&ZD-?Escm8-*wm(PsjadRY--cJ)K>XD`@Ps~-~nfCz%$PFB5xBv5Dz=sDnAg9Jlhg~ zC_MOVOKc$9pG|EZg7%x()aFrWTVf+v5P2rrR@n%4_IWtkud+9qcQ7`!1Do1`O>G{R zwlA_{u&K>+)3(Zv!KQZJSKHL)F>1f9-L-j`+LqV|cGu>qYFlL^*j<~)tNkkbBAePg zXKicj@N8=Hz_qQiW3Z{slh?M&j=`q3eJkTBY`=+3ZJx!pC3XyU*XE&YTV=;!cWs`| zwpDfvwuwz`p49f6*j=0FwQY$VgWa`xaNAbdG1y(3C%FA8TVqq3N4af{9fM76p6Rw# zb__PPdAQqF*)iDEw(sgZ=%t}S_B++O&Bcx>F3_7tfpfO>AoO)VVFOW3Z{s z@5ZJ!PpR7y8^NY_-WS=_=Am`J zt=+YGblsNNS+cu!(xx_#vD-E_f=%tbcVknVr`r9tHnn-S-Imx0Hnn-k-B#HMHnn-$ z-LJAuY-;n^yREV7U{jmt-))thC7aqj2yd(G7;I|ioj;q}JQwe`wYxS?$J-Jc!S32g zo7y}nZ`;@iHnr`0E>F$-O>AoO?7S_p5o~Jn5WTIkQ(#k@r|E5#ou#~wHiAuU9 z+SKL|ds|{7*wp4(ds}5A*wpr4V@cxCdu_?n_x8dM#54G|B=3v-Ks=0ZtNcJbl5b1= zAy`CtHs7zYsm&Amw!}uTsm-JMw#r7Zsm(L{w#r7Z-PqLT`F+2MO>LgywDqCYyn7^fAx|v@)<4ejpke*b;vzv^B6Leju6~*b+YwEe`w=+n-Hs zdL7sr8^P|{G(E6YHiF%?>3(3VYy`V&+f9c~2!0cr+Vn%PC3XyU*QPOot+HdVsZECj zTV=;!Uu{#HZV7%9o7(hDuqAd3HnmfB*QS4hZDS+YUEAgoy%hW=Hnr)fU`uQSo7%Ki zuvK;pHnr)oV5{sHY}>p`Zc{szcLHr{r}Ey8-L+G8*QNu5-?2??`Y`w<_9kp<(~!Z| z*fH4DrZa=BviD_En-&eW%8tRNcHY=FwdvU4x3#-AeH(0vjbL}}lud1VIM_Bef=%tb zgR-ejO9#KLO>G)G*b*DTrZycOY?X~*Q=3)~epTK_yV5qbY5QQC*a$YY>HlD>Yy_Ly zG=i{Ie$Rdjy9zXg&<6B{uor$HxTI!Vs~xY zN7xb@!S33TO>J69*futTO>Mg+&{o24VpE&u61K!fu&GU#30q~yU{jlR6Sm5Z!M@t2 zHmxW8CN{NcKw(Sl7;I`sHnnL+VcXaUHnlD3G^X&I*wm&yg)OlWY--b|!dBTa*wm(B zg{`t)4alOVpE$I7PiEW!KOC7ENqn>gUte(TKHAANnWUIYDamO$fkCbcffYn zj%;d2HnnMaVLP@FY;2p_w7>A1*wm&8hApvUu&Eu{)TSYZZDS+Y)Xp2yrZ)XC{I)i= z>5^efYy_Lyw9BwnHiAuU`eyi5c^~ac+tj9qhHYXa*wm((hOM#@Y--b0!&dn{`=RX| z(P2Xy&}742_&~U?+_B3=aYy_LywBxW+H~`< zC3XxpwbM4W>Fie3wQ2fctLzwTYSaD0R@pJwwl=lt z1mZWbsZBo+TVlsxQ#)-_o8BO{jg4Sa+hz`ZLi{E+wdodOOKb$2+O!R^Rdx(Ewdo&X ztLzwT?7Z08)K2Gxz@~OO@08lqPUpQro7!oskkezt4`oBz)TY^pt+8XUsZG}rTV=;! zQ=9f9wkmIjHbUM|O>An@)x?(A2sX88Z(^%#1e@CQIk8nX zg6+npHa$=LCN{O{d}2%N7;I|C_PaJ+P;46;!G71ai%CZmzllw4`l8qp8^M0pra_9W zvSYBRO{WxFWyfG|%ceG6Q~V}2wdtK=OY%0h5$tzu`l#48HiG@GZFd%Ws`yQ8YSUT8 zme>gP134{LY?U2@O>KIu*eW{)+r*|eeOLS@Hnr)#VoU58Y--1L*QOJTZDS+Y59Bss z=*r?Zv8hdO7F%K?*wm&;i>HVTDY5!s`{6I8; zu_b;WTEW;7f3j!@V@v!%w1x3YY=1Vj=?`OTYy`V&(q##Y%e*wm&kjjgg{ux)K>)1$_3VpE$=HMYc# z!KQY`?%H&%v2APwyK7tO=wRbFv8hcT8(U%{*j<~3Hnz%+!KOByZETetgN-^8Xiop5Z4jbKxomN>S`j=`ojy>V=n z9Yfwn8^NY_CNIb~wKI9Kuph`Xc_-PXb|&vF`>CzB>F0v|qqhBEo@>b-wB4?LW28{cinCrl+@G?|18`F`d2ra=%+Yhy6UI!?#xZ z32gs(ZU1&{|8z~SZ@-;?w|?r<@!K!=@75orpSJc-*7W|?W`54n|J#4=ck3rCUBLZv zzgs_B{r3>Mforv&s&oYRpZj6`Jf$zVU+&+npQLmL_sjje^_%$#N|$i0_S2J2;r?^~ zZvEU0{M4jlxL@yg>*uBa(S^?8TJ0w#-NXIoez$%`(nH)Y_wUwEM>>i7<^J9J(fm}j ze}<-~xHj|i(EbgY&fsj6^cVNb{cioIe*V#CT&w-;qu03q+@AzL_2@Y6m;1Nn z=N)~=wcI}Vli=r^{d+SV$o*!1w|>6ShukmsyY-WeZsghJ`YInU`&VZBM`rqxYa>6` z=uPfF*QrSQe}uAsUZzL6U+;IWL#I=@7W?}q-OByvesn+A=vnTU>pk1QDbu;!FZa9k zU+(7`9n7`b&q2DF`_KKm^>Z!hb3`|DzuxcG|NQIc5?#%;+TS7VpOBMz>-}#1-7)Fs z+W)BQj&UQaf+!4kFqRxxg1`aWK(gJhGiC)yldu91WQd3bu>=AZF#!u;2`pm%e)W|_ z!XUd{RreiN-+qVN?UK&f>2WIOuGg*QxVD4i^v-9Q_q%DHi*158yE`&Y`&=EYWw^G} z;#AOQ8Lp*-F5dO_7S~clR|l(%Yda!N8-12}XS?&^6w>AQI(v(2DWy+c8rOC%oMO7N zm(07_bkpU*;#%tI@?eFymWuk6aV;fvu}#3W^wj0Gg=?v*PktZV3AnZc;Pllk1B+{^ ztjmMNwY1jH{H%5#?AUjwzNxOyB3w&(T|8Uyf@|ro%g=L9gMH$8>O1RAiCr43b1gk~ zd9cp4RN3XhI@jE#sk2WR*HUN~&sL<;E(VKhJK0UGU0t^>!?je~r|euyxm^qv*IsZf z1$T9@&NUoL$$iSswU>Psxc2hCPPe$WGu)KjU9VfqaBau8X}r%eT-(`gO7HS>Kcx09 z2J2kg-->HHvrYMZmg8FL?_#h%LZ$&P4;I%P&S}9<**nbi;KlY9*HVR-*DbE44KHsK zgt)fT*;L|Z8Lp)iF9wTi>BY-04%mIrEw1fEHud33VuBA#Z2J1a!+Vt|e#kCab<#nqITuZ5b%D9$V zy?EDwb=Rj_uMXBD#@BYCh%{~dxeV8KKAJ+lJXl;yC;!Y()z0oF`e(*V5t_?>ex!mL|VCSX^_Hr_DcQTuYx{ z3>Mc?>6h0nuBFv4uUqB4BYa-lq2}9el5_2C+i%ih#jsxmoan0SB^T3pGE%yO2SX|44Kwh`FmJ@-zZk2H@M}jFk z*K#Eg>lW9pxRyJCI#^u8m0Sv@>^>m=&sT`;gdujOAL z)-A5(Vj%BX;#yt?@-|@~z~b5t5%V>e<+zr&ff%g6MCNcHKX?=Abw`IgIn3?gT)cC` z91p|;e&E{940AqE2R<*<&I@xum@=;Af*=NqYlm|!Hw1OCKCjuyToI;>YdIr`!TP+G zKZ5*h#ZCuvNs!mA%Wy5Xgel`%jtOGj;@aW!TFwdTV0~WmI_ID;Wn9ZeK@8UCwY(JM zb&G2`D#+{BGF;19Vam9cyMkD^xOVtEWG)NpV0~WmfBf7Qri^PjE{MT8*YaJE*DbE) zz96q#zYW)NVVJUWEhh%CZgK6vwHz7L!QvW@m>vCMn^#QxlD+`I@j`= zkk<~bpqviW9H#fKhI}RU!BO0=g;3%TkLzk_kSPG|I$e7 k=U;#O@uy#YPq(h0zq^0?&*SAmqTIiIef;$4`yamj0$#$~UjP6A literal 0 HcmV?d00001 From 0bdf0972665525697ac819723a2a9f69d681ca09 Mon Sep 17 00:00:00 2001 From: hammadb Date: Fri, 28 Jun 2024 18:19:52 -0700 Subject: [PATCH 02/10] wip cleanup --- Cargo.lock | 1 + rust/worker/Cargo.toml | 1 + .../src/blockstore/arrow/block/delta.rs | 10 +- .../src/blockstore/arrow/block/types.rs | 443 +++++++++++------- 4 files changed, 277 insertions(+), 178 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f43f8872fb5..7e3cde827c0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4677,6 +4677,7 @@ dependencies = [ "cc", "criterion", "figment", + "flatbuffers", "futures", "k8s-openapi", "kube", diff --git a/rust/worker/Cargo.toml b/rust/worker/Cargo.toml index 4dc60a5ed7c..ece7c9c1646 100644 --- a/rust/worker/Cargo.toml +++ b/rust/worker/Cargo.toml @@ -55,6 +55,7 @@ opentelemetry = { version = "0.19.0", default-features = false, features = [ opentelemetry-otlp = "0.12.0" shuttle = "0.7.1" regex = "1.10.5" +flatbuffers = "24.3.25" [dev-dependencies] proptest = "1.4.0" diff --git a/rust/worker/src/blockstore/arrow/block/delta.rs b/rust/worker/src/blockstore/arrow/block/delta.rs index 42080ca7747..5464f5a3594 100644 --- a/rust/worker/src/blockstore/arrow/block/delta.rs +++ b/rust/worker/src/blockstore/arrow/block/delta.rs @@ -244,9 +244,6 @@ mod test { let size = delta.get_size::<&str, &str>(); block_manager.commit::<&str, &str>(&delta); let block = block_manager.get(&delta_id).await.unwrap(); - // TODO: uncomment this when sizing is fixed - println!("==== COMPUTING DUMPED BLOCK SIZE ==== "); - let dumped_block_size = block.get_size(); assert_eq!(size, block.get_size()); for i in 0..n { let key = format!("key{}", i); @@ -256,13 +253,8 @@ mod test { // test save/load block.save("test.arrow").unwrap(); - let loaded = Block::load("test.arrow", delta_id).unwrap(); + let loaded = Block::load_with_validation("test.arrow", delta_id).unwrap(); assert_eq!(loaded.id, delta_id); - // TODO: make this sizing work - println!("==== COMPUTING ORIGINAL BLOCK SIZE ==== "); - let original_size = block.get_size(); - println!("==== COMPUTING LOADED BLOCK SIZE ==== "); - let loaded_size = loaded.get_size(); assert_eq!(block.get_size(), loaded.get_size()); for i in 0..n { let key = format!("key{}", i); diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index ec95bde83a2..017552448fd 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -1,20 +1,17 @@ -use std::io::{Read, Seek, SeekFrom}; -use std::sync::Arc; - use super::delta::BlockDelta; use crate::blockstore::arrow::types::{ArrowReadableKey, ArrowReadableValue}; -use crate::errors::ChromaError; +use crate::errors::{ChromaError, ErrorCodes}; +use arrow::array::ArrayData; use arrow::buffer::Buffer; -use arrow::ipc::convert::fb_to_schema; -use arrow::ipc::reader::{read_footer_length, FileDecoder}; -use arrow::ipc::{root_as_footer, root_as_message, MessageHeader}; +use arrow::ipc::reader::read_footer_length; +use arrow::ipc::{root_as_footer, root_as_message, MessageHeader, MetadataVersion}; use arrow::util::bit_util; use arrow::{ array::{Array, StringArray}, record_batch::RecordBatch, }; -use rand::rngs::mock; -use tantivy::HasLen; +use std::io::SeekFrom; +use thiserror::Error; use uuid::Uuid; /// A block in a blockfile. A block is a sorted collection of data that is immutable once it has been committed. @@ -216,55 +213,22 @@ impl Block { /// Returns the size of the block in bytes pub(crate) fn get_size(&self) -> usize { let mut total_size = 0; - let mut mock_size = 0; - let mut column_index = 0; + let mut alt_size = 0; for column in self.data.columns() { - let column_size = column.get_buffer_memory_size(); - println!( - "[ORIGINAL] Column {} column size: {}", - column_index, column_size - ); - total_size += column.get_buffer_memory_size(); let array_data = column.to_data(); - let array_data_size = array_data.get_slice_memory_size().unwrap(); - println!( - "[MAYBE] Column {} array data size: {}", - column_index, array_data_size - ); - let mut buffer_index = 0; - let mut total_for_buffers = 0; - for buffer in array_data.buffers() { - // let slice = buffer.slice_with_length(offset, len); - // let slice_len = slice.as_slice().len(); - // mock_size += slice_len; - // SYSTEM ASSUMPTION: ALL BUFFERS ARE PADDED TO 64 bytes - // We maintain this invariant in two places - // 1. In the to_arrow methods of delta storage, we allocate - // padded buffers - // 2. In block load() we validate that the buffers are of size 64 - // Why do we do this instead of using get_buffer_memory_size() - // or using the buffers capacity? TODO: answer - let size = bit_util::round_upto_multiple_of_64(buffer.len()); - println!( - "[NEW] Column {} buffer {} size: {}", - column_index, buffer_index, size - ); - mock_size += size; - total_for_buffers += size; - buffer_index += 1; - } + total_size += get_size_of_array_data(&array_data); + + let column_buffer_size = column.get_buffer_memory_size(); + let alt_column_size = get_size_of_array_data(&array_data); + alt_size += column_buffer_size; + let alt_column_size = get_size_of_array_data(&array_data); println!( - "[NEW] Total for buffers: {} for column {}", - total_for_buffers, column_index + "Column buffer size: {} vs {}", + column_buffer_size, alt_column_size ); - column_index += 1; } - // total_size - println!( - "Size via total_size: {}, size via mock_size: {}", - total_size, mock_size - ); - return mock_size; + println!("Total size: {} vs {}", total_size, alt_size); + return total_size; } /// Returns the number of items in the block @@ -273,8 +237,7 @@ impl Block { } pub fn save(&self, path: &str) -> Result<(), Box> { - let file = std::fs::File::create(path); - let mut file = match file { + let file = match std::fs::File::create(path) { Ok(file) => file, Err(e) => { // TODO: Return a proper error @@ -282,7 +245,19 @@ impl Block { } }; let mut writer = std::io::BufWriter::new(file); - let writer = arrow::ipc::writer::FileWriter::try_new(&mut writer, &self.data.schema()); + let options = + match arrow::ipc::writer::IpcWriteOptions::try_new(64, false, MetadataVersion::V5) { + Ok(options) => options, + Err(e) => { + panic!("Error creating options: {:?}", e); + } + }; + + let writer = arrow::ipc::writer::FileWriter::try_new_with_options( + &mut writer, + &self.data.schema(), + options, + ); let mut writer = match writer { Ok(writer) => writer, Err(e) => { @@ -317,13 +292,34 @@ impl Block { } pub fn from_bytes(bytes: &[u8], id: Uuid) -> Result> { + return Self::from_bytes_internal(bytes, id, false); + } + + pub fn from_bytes_with_validation( + bytes: &[u8], + id: Uuid, + ) -> Result> { + return Self::from_bytes_internal(bytes, id, true); + } + + fn from_bytes_internal( + bytes: &[u8], + id: Uuid, + validate: bool, + ) -> Result> { let cursor = std::io::Cursor::new(bytes); - let mut reader = - arrow::ipc::reader::FileReader::try_new(cursor, None).expect("Error creating reader"); - return Self::load_with_reader(reader, id); + return Self::load_with_reader(cursor, id, validate); + } + + pub fn load_with_validation(path: &str, id: Uuid) -> Result> { + return Self::load_internal(path, id, true); } pub fn load(path: &str, id: Uuid) -> Result> { + return Self::load_internal(path, id, false); + } + + fn load_internal(path: &str, id: Uuid, validate: bool) -> Result> { let file = std::fs::File::open(path); let file = match file { Ok(file) => file, @@ -332,126 +328,35 @@ impl Block { panic!("Error opening file: {:?}", e) } }; - let mut reader = std::io::BufReader::new(file); - - // Read IPC File - https://docs.rs/arrow-ipc/52.0.0/arrow_ipc/reader/struct.FileDecoder.html - // Space for ARROW_MAGIC (6 bytes) and length (4 bytes) - let mut buffer = [0; 10]; - reader - .seek(SeekFrom::End(-10)) - .expect("TODO: change to error"); - reader - .read_exact(&mut buffer) - .expect("TODO: change to error"); - - let footer_len = read_footer_length(buffer).expect("TODO: change to error"); - - // read footer - let mut footer_data = vec![0; footer_len]; - reader - .seek(SeekFrom::End(-10 - footer_len as i64)) - .expect("TODO: change to error"); - reader - .read_exact(&mut footer_data) - .expect("TODO: change to error"); - - let footer = root_as_footer(&footer_data).expect("TODO: change to error"); - let schema = footer.schema().expect("TODO: change to error"); - let arrow_schema = Arc::new(fb_to_schema(schema)); - // Create a file decoder, requiring alignment of 64 bytes - let decoder = FileDecoder::new(arrow_schema, footer.version()); - let decoder = decoder.with_require_alignment(true); - - // Read the record batch - let record_batch_definitions = footer.recordBatches().expect("TODO: change to error"); - let record_batch = record_batch_definitions.get(0); - let block_len = record_batch.bodyLength() as usize + record_batch.metaDataLength() as usize; - - println!("BODY LENGTH: {}", record_batch.bodyLength()); - println!("RECORD BATCH OFFSET: {}", record_batch.offset()); - let mut file_buffer = vec![0; block_len]; - reader - .seek(SeekFrom::Start(record_batch.offset() as u64)) - .expect("TODO: change to error"); - reader - .read_exact(&mut file_buffer) - .expect("TODO: change to error"); - - let buffer = Buffer::from(file_buffer); - - // This is borrowed from arrow-ipc parse_message.rs - // https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format - let buf = match buffer[..4] == [0xff; 4] { - true => &buffer[8..], - false => &buffer[4..], - }; - let message = root_as_message(buf).expect("TODO: change to error"); - match message.header_type() { - MessageHeader::RecordBatch => { - let rb = message - .header_as_record_batch() - .expect("TODO: change to error"); - // Loop over offsets and ensure the lengths of each buffer are 64 byte aligned - let blocks = rb.buffers().expect("TODO: change to error"); - let mut prev_offset = blocks.get(0).offset(); - for block in blocks.iter().skip(1) { - let curr_offset = block.offset(); - let len = curr_offset - prev_offset; - println!("CURRENT OFFSET: {}", curr_offset); - println!("BUFFER LENGTH IS: {}", len); - let remainder = len % 64; - if remainder != 0 { - panic!("Buffer length is not 64 byte aligned"); - } - prev_offset = curr_offset; - } - // We have to add the last buffer length based on the body length - let last_buffer_len = record_batch.bodyLength() as usize - prev_offset as usize; - let remainder = last_buffer_len % 64; - println!("LAST BUFFER LENGTH IS: {}", last_buffer_len); - if remainder != 0 { - panic!("Buffer length is not 64 byte aligned"); - } - } - _ => { - panic!("Unexpected message type"); - } - } - - let read = decoder - .read_record_batch(record_batch, &buffer) - .unwrap() - .unwrap(); - Ok(Self::from_record_batch(id, read)) - - // // TODO: require_alignment - // let reader = arrow::ipc::reader::FileReader::try_new(&mut reader, None); - // let reader = match reader { - // Ok(reader) => reader, - // Err(e) => { - // // TODO: Return a proper error - // panic!("Error creating reader: {:?}", e) - // } - // }; - - // return Self::load_with_reader(reader, id); + let reader = std::io::BufReader::new(file); + return Self::load_with_reader(reader, id, validate); } fn load_with_reader( - mut reader: arrow::ipc::reader::FileReader, + mut reader: R, id: Uuid, + validate: bool, ) -> Result> where R: std::io::Read + std::io::Seek, { - let batch = reader.next().unwrap(); + if validate { + let res = verify_buffers_layout(&mut reader); + match res { + Ok(_) => {} + Err(e) => { + return Err(Box::new(e)); + } + } + } + + let mut arrow_reader = arrow::ipc::reader::FileReader::try_new(&mut reader, None) + .expect("Error creating reader"); + + let batch = arrow_reader.next().unwrap(); // TODO: how to store / hydrate id? match batch { - Ok(batch) => { - println!("Loaded batch with {} rows", batch.num_rows()); - println!("Batch size is {}", batch.get_array_memory_size()); - Ok(Self::from_record_batch(id, batch)) - } + Ok(batch) => Ok(Self::from_record_batch(id, batch)), Err(e) => { panic!("Error reading batch: {:?}", e); } @@ -459,6 +364,206 @@ impl Block { } } +fn get_size_of_array_data(array_data: &ArrayData) -> usize { + let mut total_size = 0; + for buffer in array_data.buffers() { + // SYSTEM ASSUMPTION: ALL BUFFERS ARE PADDED TO 64 bytes + // We maintain this invariant in two places + // 1. In the to_arrow methods of delta storage, we allocate + // padded buffers + // 2. In block load() we validate that the buffers are of size 64 + // Why do we do this instead of using get_buffer_memory_size() + // or using the buffers capacity? TODO: answer + let size = bit_util::round_upto_multiple_of_64(buffer.len()); + total_size += size; + } + // List and Struct arrays have child arrays + for child in array_data.child_data() { + total_size += get_size_of_array_data(child); + } + return total_size; +} + +#[derive(Error, Debug)] +pub enum ArrowLayoutVerificationError { + #[error("Buffer length is not 64 byte aligned")] + BufferLengthNotAligned, + #[error(transparent)] + IOError(#[from] std::io::Error), + #[error(transparent)] + ArrowError(#[from] arrow::error::ArrowError), + #[error(transparent)] + InvalidFlatbuffer(#[from] flatbuffers::InvalidFlatbuffer), + #[error("No schema in footer")] + NoSchema, + #[error("No record batches in footer")] + NoRecordBatches, + #[error("More than one record batch in IPC file")] + MultipleRecordBatches, + #[error("Invalid message type")] + InvalidMessageType, + #[error("Error decoding record batch message as record batch")] + RecordBatchDecodeError, + #[error("Record batch has no buffer blocks")] + NoBufferBlocks, +} + +impl ChromaError for ArrowLayoutVerificationError { + fn code(&self) -> ErrorCodes { + match self { + ArrowLayoutVerificationError::BufferLengthNotAligned => ErrorCodes::Internal, + ArrowLayoutVerificationError::IOError(_) => ErrorCodes::Internal, + ArrowLayoutVerificationError::ArrowError(_) => ErrorCodes::Internal, + ArrowLayoutVerificationError::InvalidFlatbuffer(_) => ErrorCodes::Internal, + ArrowLayoutVerificationError::NoSchema => ErrorCodes::Internal, + ArrowLayoutVerificationError::NoRecordBatches => ErrorCodes::Internal, + ArrowLayoutVerificationError::MultipleRecordBatches => ErrorCodes::Internal, + ArrowLayoutVerificationError::InvalidMessageType => ErrorCodes::Internal, + ArrowLayoutVerificationError::RecordBatchDecodeError => ErrorCodes::Internal, + ArrowLayoutVerificationError::NoBufferBlocks => ErrorCodes::Internal, + } + } +} + +fn verify_buffers_layout(mut reader: R) -> Result<(), ArrowLayoutVerificationError> +where + R: std::io::Read + std::io::Seek, +{ + // Read the IPC file and verify that the buffers are 64 byte aligned + // by inspecting the offsets, this is required since our + // size calculation assumes that the buffers are 64 byte aligned + // Space for ARROW_MAGIC (6 bytes) and length (4 bytes) + let mut footer_buffer = [0; 10]; + match reader.seek(SeekFrom::End(-10)) { + Ok(_) => {} + Err(e) => { + return Err(ArrowLayoutVerificationError::IOError(e)); + } + } + + match reader.read_exact(&mut footer_buffer) { + Ok(_) => {} + Err(e) => { + return Err(ArrowLayoutVerificationError::IOError(e)); + } + } + + let footer_len = read_footer_length(footer_buffer); + let footer_len = match footer_len { + Ok(footer_len) => footer_len, + Err(e) => { + return Err(ArrowLayoutVerificationError::ArrowError(e)); + } + }; + + // read footer + let mut footer_data = vec![0; footer_len]; + match reader.seek(SeekFrom::End(-10 - footer_len as i64)) { + Ok(_) => {} + Err(e) => { + return Err(ArrowLayoutVerificationError::IOError(e)); + } + } + match reader.read_exact(&mut footer_data) { + Ok(_) => {} + Err(e) => { + return Err(ArrowLayoutVerificationError::IOError(e)); + } + } + + let footer = match root_as_footer(&footer_data) { + Ok(footer) => footer, + Err(e) => { + return Err(ArrowLayoutVerificationError::InvalidFlatbuffer(e)); + } + }; + + // Read the record batch + let record_batch_definitions = match footer.recordBatches() { + Some(record_batch_definitions) => record_batch_definitions, + None => { + return Err(ArrowLayoutVerificationError::NoRecordBatches); + } + }; + + // Ensure there is only ONE record batch, which is how we store data + if record_batch_definitions.len() != 1 { + return Err(ArrowLayoutVerificationError::MultipleRecordBatches); + } + + let record_batch_definition = record_batch_definitions.get(0); + let record_batch_len = record_batch_definition.bodyLength() as usize + + record_batch_definition.metaDataLength() as usize; + let record_batch_body_len = record_batch_definition.bodyLength() as usize; + + // Read the actual record batch + let mut file_buffer = vec![0; record_batch_len]; + match reader.seek(SeekFrom::Start(record_batch_definition.offset() as u64)) { + Ok(_) => {} + Err(e) => { + return Err(ArrowLayoutVerificationError::IOError(e)); + } + } + match reader.read_exact(&mut file_buffer) { + Ok(_) => {} + Err(e) => { + return Err(ArrowLayoutVerificationError::IOError(e)); + } + } + let buffer = Buffer::from(file_buffer); + + // This is borrowed from arrow-ipc parse_message.rs + // https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format + let buf = match buffer[..4] == [0xff; 4] { + true => &buffer[8..], + false => &buffer[4..], + }; + let message = match root_as_message(buf) { + Ok(message) => message, + Err(e) => { + return Err(ArrowLayoutVerificationError::InvalidFlatbuffer(e)); + } + }; + + match message.header_type() { + MessageHeader::RecordBatch => { + let record_batch = match message.header_as_record_batch() { + Some(record_batch) => record_batch, + None => { + return Err(ArrowLayoutVerificationError::RecordBatchDecodeError); + } + }; + // Loop over offsets and ensure the lengths of each buffer are 64 byte aligned + let blocks = match record_batch.buffers() { + Some(blocks) => blocks, + None => { + return Err(ArrowLayoutVerificationError::RecordBatchDecodeError); + } + }; + + let mut prev_offset = blocks.get(0).offset(); + for block in blocks.iter().skip(1) { + let curr_offset = block.offset(); + let len = curr_offset - prev_offset; + if len % 64 != 0 { + return Err(ArrowLayoutVerificationError::BufferLengthNotAligned); + } + prev_offset = curr_offset; + } + // Check the remaining buffer length based on the body length + let last_buffer_len = record_batch_body_len - prev_offset as usize; + if last_buffer_len % 64 != 0 { + return Err(ArrowLayoutVerificationError::BufferLengthNotAligned); + } + } + _ => { + return Err(ArrowLayoutVerificationError::InvalidMessageType); + } + } + + Ok(()) +} + // #[derive(Error, Debug)] // pub enum FinishError { // #[error("Arrow error")] From d495e70d2d6bbb9ece9f821d3f5f099f118dcd1c Mon Sep 17 00:00:00 2001 From: hammadb Date: Mon, 1 Jul 2024 13:05:52 -0700 Subject: [PATCH 03/10] Factor in null size --- rust/worker/src/blockstore/arrow/block/types.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index 017552448fd..5a0f57cd3e3 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -381,6 +381,11 @@ fn get_size_of_array_data(array_data: &ArrayData) -> usize { for child in array_data.child_data() { total_size += get_size_of_array_data(child); } + // Some data types have null buffers + if let Some(buffer) = array_data.nulls() { + let size = bit_util::round_upto_multiple_of_64(buffer.len()); + total_size += size; + } return total_size; } From bf4bbb52bf9b9c408d511ea8369f74b6bfa40175 Mon Sep 17 00:00:00 2001 From: hammadb Date: Mon, 1 Jul 2024 13:06:13 -0700 Subject: [PATCH 04/10] cln --- rust/worker/src/blockstore/arrow/block/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index 5a0f57cd3e3..d6f9437a59f 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -381,7 +381,7 @@ fn get_size_of_array_data(array_data: &ArrayData) -> usize { for child in array_data.child_data() { total_size += get_size_of_array_data(child); } - // Some data types have null buffers + // Some data types (like our data record) have null buffers if let Some(buffer) = array_data.nulls() { let size = bit_util::round_upto_multiple_of_64(buffer.len()); total_size += size; From 84ff80eba0f0907f0e01528aec6aea32e3b0dcb4 Mon Sep 17 00:00:00 2001 From: hammadb Date: Mon, 1 Jul 2024 13:07:16 -0700 Subject: [PATCH 05/10] cln 2 --- rust/worker/src/blockstore/arrow/block/types.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index d6f9437a59f..ee349c581b0 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -217,17 +217,7 @@ impl Block { for column in self.data.columns() { let array_data = column.to_data(); total_size += get_size_of_array_data(&array_data); - - let column_buffer_size = column.get_buffer_memory_size(); - let alt_column_size = get_size_of_array_data(&array_data); - alt_size += column_buffer_size; - let alt_column_size = get_size_of_array_data(&array_data); - println!( - "Column buffer size: {} vs {}", - column_buffer_size, alt_column_size - ); } - println!("Total size: {} vs {}", total_size, alt_size); return total_size; } From 6fb55303bb80343c79f69a48824e4a1e4c46ea9f Mon Sep 17 00:00:00 2001 From: hammadb Date: Mon, 1 Jul 2024 14:14:49 -0700 Subject: [PATCH 06/10] Tests --- .../src/blockstore/arrow/block/delta.rs | 85 +++++-- .../src/blockstore/arrow/block/types.rs | 232 +++++++++++++----- rust/worker/src/blockstore/arrow/provider.rs | 16 +- rust/worker/test.arrow | Bin 67710 -> 0 bytes 4 files changed, 244 insertions(+), 89 deletions(-) delete mode 100644 rust/worker/test.arrow diff --git a/rust/worker/src/blockstore/arrow/block/delta.rs b/rust/worker/src/blockstore/arrow/block/delta.rs index 5464f5a3594..3d96fd90254 100644 --- a/rust/worker/src/blockstore/arrow/block/delta.rs +++ b/rust/worker/src/blockstore/arrow/block/delta.rs @@ -198,10 +198,26 @@ mod test { use roaring::RoaringBitmap; use std::collections::HashMap; + /// Saves a block to a random file under the given path, then loads the block + /// and validates that the loaded block has the same size as the original block. + /// ### Returns + /// - The loaded block + /// ### Notes + /// - Assumes that path will be cleaned up by the caller + fn test_save_load_size(path: &str, block: &Block) -> Block { + let save_path = format!("{}/{}", path, random::()); + block.save(&save_path).unwrap(); + let loaded = Block::load_with_validation(&save_path, block.id).unwrap(); + assert_eq!(loaded.id, block.id); + assert_eq!(block.get_size(), loaded.get_size()); + loaded + } + #[tokio::test] async fn test_sizing_int_arr_val() { let tmp_dir = tempfile::tempdir().unwrap(); - let storage = Storage::Local(LocalStorage::new(tmp_dir.path().to_str().unwrap())); + let path = tmp_dir.path().to_str().unwrap(); + let storage = Storage::Local(LocalStorage::new(path)); let block_manager = BlockManager::new(storage); let delta = block_manager.create::<&str, &Int32Array>(); @@ -222,7 +238,10 @@ mod test { // Semantically, that makes sense, since a delta is unsuable after commit block_manager.commit::<&str, &Int32Array>(&delta); let block = block_manager.get(&delta.id).await.unwrap(); + // Ensure the deltas estimated size matches the actual size of the block assert_eq!(size, block.get_size()); + + test_save_load_size(path, &block); } #[tokio::test] @@ -252,10 +271,7 @@ mod test { } // test save/load - block.save("test.arrow").unwrap(); - let loaded = Block::load_with_validation("test.arrow", delta_id).unwrap(); - assert_eq!(loaded.id, delta_id); - assert_eq!(block.get_size(), loaded.get_size()); + let loaded = test_save_load_size(path, &block); for i in 0..n { let key = format!("key{}", i); let read = loaded.get::<&str, &str>("prefix", &key); @@ -277,7 +293,8 @@ mod test { #[tokio::test] async fn test_sizing_float_key() { let tmp_dir = tempfile::tempdir().unwrap(); - let storage = Storage::Local(LocalStorage::new(tmp_dir.path().to_str().unwrap())); + let path = tmp_dir.path().to_str().unwrap(); + let storage = Storage::Local(LocalStorage::new(path)); let block_manager = BlockManager::new(storage); let delta = block_manager.create::(); @@ -293,12 +310,16 @@ mod test { block_manager.commit::(&delta); let block = block_manager.get(&delta.id).await.unwrap(); assert_eq!(size, block.get_size()); + + // test save/load + test_save_load_size(path, &block); } #[tokio::test] async fn test_sizing_roaring_bitmap_val() { let tmp_dir = tempfile::tempdir().unwrap(); - let storage = Storage::Local(LocalStorage::new(tmp_dir.path().to_str().unwrap())); + let path = tmp_dir.path().to_str().unwrap(); + let storage = Storage::Local(LocalStorage::new(path)); let block_manager = BlockManager::new(storage); let delta = block_manager.create::<&str, &RoaringBitmap>(); @@ -321,12 +342,16 @@ mod test { let expected = RoaringBitmap::from_iter((0..i).map(|x| x as u32)); assert_eq!(read, Some(expected)); } + + // test save/load + test_save_load_size(path, &block); } #[tokio::test] async fn test_data_record() { let tmp_dir = tempfile::tempdir().unwrap(); - let storage = Storage::Local(LocalStorage::new(tmp_dir.path().to_str().unwrap())); + let path = tmp_dir.path().to_str().unwrap(); + let storage = Storage::Local(LocalStorage::new(path)); let block_manager = BlockManager::new(storage); let ids = vec!["embedding_id_2", "embedding_id_0", "embedding_id_1"]; let embeddings = vec![ @@ -378,23 +403,33 @@ mod test { assert_eq!(read.document, documents[i]); } assert_eq!(size, block.get_size()); + + // test save/load + test_save_load_size(path, &block); } - // #[test] - // fn test_sizing_uint_key_val() { - // let block_provider = ArrowBlockProvider::new(); - // let block = block_provider.create_block(KeyType::Uint, ValueType::Uint); - // let delta = BlockDelta::from(block.clone()); - - // let n = 2000; - // for i in 0..n { - // let key = BlockfileKey::new("prefix".to_string(), Key::Uint(i as u32)); - // let value = Value::UintValue(i as u32); - // delta.add(key, value); - // } - - // let size = delta.get_size(); - // let block_data = BlockData::try_from(&delta).unwrap(); - // assert_eq!(size, block_data.get_size()); - // } + #[tokio::test] + async fn test_sizing_uint_key_val() { + let tmp_dir = tempfile::tempdir().unwrap(); + let path = tmp_dir.path().to_str().unwrap(); + let storage = Storage::Local(LocalStorage::new(path)); + let block_manager = BlockManager::new(storage); + let delta = block_manager.create::(); + + let n = 2000; + for i in 0..n { + let prefix = "prefix"; + let key = i as u32; + let value = format!("value{}", i); + delta.add(prefix, key, value.as_str()); + } + + let size = delta.get_size::(); + block_manager.commit::(&delta); + let block = block_manager.get(&delta.id).await.unwrap(); + assert_eq!(size, block.get_size()); + + // test save/load + test_save_load_size(path, &block); + } } diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index ee349c581b0..54d5c7082c5 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -34,10 +34,12 @@ pub struct Block { } impl Block { + /// Create a concrete block from an id and the underlying record batch of data pub fn from_record_batch(id: Uuid, data: RecordBatch) -> Self { Self { id, data } } + /// Converts the block to a block delta for writing to a new block pub fn to_block_delta<'me, K: ArrowReadableKey<'me>, V: ArrowReadableValue<'me>>( &'me self, mut delta: BlockDelta, @@ -58,6 +60,13 @@ impl Block { delta } + /* + ===== Block Queries ===== + */ + + /// Get the value for a given key in the block + /// ### Panics + /// - If the underlying data types are not the same as the types specified in the function signature pub fn get<'me, K: ArrowReadableKey<'me>, V: ArrowReadableValue<'me>>( &'me self, prefix: &str, @@ -79,6 +88,9 @@ impl Block { None } + /// Get all the values for a given prefix in the block + /// ### Panics + /// - If the underlying data types are not the same as the types specified in the function signature pub fn get_prefix<'me, K: ArrowReadableKey<'me>, V: ArrowReadableValue<'me>>( &'me self, prefix: &str, @@ -103,6 +115,9 @@ impl Block { return Some(res); } + /// Get all the values for a given prefix in the block where the key is greater than the given key + /// ### Panics + /// - If the underlying data types are not the same as the types specified in the function signature pub fn get_gt<'me, K: ArrowReadableKey<'me>, V: ArrowReadableValue<'me>>( &'me self, prefix: &str, @@ -125,6 +140,9 @@ impl Block { return Some(res); } + /// Get all the values for a given prefix in the block where the key is less than the given key + /// ### Panics + /// - If the underlying data types are not the same as the types specified in the function signature pub fn get_lt<'me, K: ArrowReadableKey<'me>, V: ArrowReadableValue<'me>>( &'me self, prefix: &str, @@ -147,6 +165,9 @@ impl Block { return Some(res); } + /// Get all the values for a given prefix in the block where the key is less than or equal to the given key + /// ### Panics + /// - If the underlying data types are not the same as the types specified in the function signature pub fn get_lte<'me, K: ArrowReadableKey<'me>, V: ArrowReadableValue<'me>>( &'me self, prefix: &str, @@ -169,6 +190,9 @@ impl Block { return Some(res); } + /// Get all the values for a given prefix in the block where the key is greater than or equal to the given key + /// ### Panics + /// - If the underlying data types are not the same as the types specified in the function signature pub fn get_gte<'me, K: ArrowReadableKey<'me>, V: ArrowReadableValue<'me>>( &'me self, prefix: &str, @@ -191,6 +215,12 @@ impl Block { return Some(res); } + /// Get all the values for a given prefix in the block where the key is between the given keys + /// ### Notes + /// - Returns a tuple of (prefix, key, value) + /// - Returns None if the requested index is out of bounds + /// ### Panics + /// - If the underlying data types are not the same as the types specified in the function signature pub fn get_at_index<'me, K: ArrowReadableKey<'me>, V: ArrowReadableValue<'me>>( &'me self, index: usize, @@ -210,10 +240,13 @@ impl Block { Some((prefix, key, value)) } + /* + ===== Block Metadata ===== + */ + /// Returns the size of the block in bytes pub(crate) fn get_size(&self) -> usize { let mut total_size = 0; - let mut alt_size = 0; for column in self.data.columns() { let array_data = column.to_data(); total_size += get_size_of_array_data(&array_data); @@ -226,20 +259,27 @@ impl Block { self.data.num_rows() } - pub fn save(&self, path: &str) -> Result<(), Box> { + /* + ===== Block Serialization ===== + */ + + /// Save the block in Arrow IPC format to the given path + pub fn save(&self, path: &str) -> Result<(), BlockSaveError> { let file = match std::fs::File::create(path) { Ok(file) => file, Err(e) => { - // TODO: Return a proper error - panic!("Error creating file: {:?}", e) + return Err(BlockSaveError::IOError(e)); } }; + + // We force the block to be written with 64 byte alignment + // this is the default, but we are just being defensive let mut writer = std::io::BufWriter::new(file); let options = match arrow::ipc::writer::IpcWriteOptions::try_new(64, false, MetadataVersion::V5) { Ok(options) => options, Err(e) => { - panic!("Error creating options: {:?}", e); + return Err(BlockSaveError::ArrowError(e)); } }; @@ -251,82 +291,84 @@ impl Block { let mut writer = match writer { Ok(writer) => writer, Err(e) => { - // TODO: Return a proper error - panic!("Error creating writer: {:?}", e) + return Err(BlockSaveError::ArrowError(e)); } }; match writer.write(&self.data) { Ok(_) => match writer.finish() { Ok(_) => return Ok(()), Err(e) => { - panic!("Error finishing writer: {:?}", e); + return Err(BlockSaveError::ArrowError(e)); } }, Err(e) => { - panic!("Error writing data: {:?}", e); + return Err(BlockSaveError::ArrowError(e)); } } } - pub fn to_bytes(&self) -> Vec { + /// Convert the block to bytes in Arrow IPC format + pub fn to_bytes(&self) -> Result, BlockToBytesError> { let mut bytes = Vec::new(); // Scope the writer so that it is dropped before we return the bytes { let mut writer = - arrow::ipc::writer::FileWriter::try_new(&mut bytes, &self.data.schema()) - .expect("Error creating writer"); - writer.write(&self.data).expect("Error writing data"); - writer.finish().expect("Error finishing writer"); + match arrow::ipc::writer::FileWriter::try_new(&mut bytes, &self.data.schema()) { + Ok(writer) => writer, + Err(e) => { + return Err(BlockToBytesError::ArrowError(e)); + } + }; + match writer.write(&self.data) { + Ok(_) => {} + Err(e) => { + return Err(BlockToBytesError::ArrowError(e)); + } + } + match writer.finish() { + Ok(_) => {} + Err(e) => { + return Err(BlockToBytesError::ArrowError(e)); + } + } } - bytes + Ok(bytes) } - pub fn from_bytes(bytes: &[u8], id: Uuid) -> Result> { + pub fn from_bytes(bytes: &[u8], id: Uuid) -> Result { return Self::from_bytes_internal(bytes, id, false); } - pub fn from_bytes_with_validation( - bytes: &[u8], - id: Uuid, - ) -> Result> { + pub fn from_bytes_with_validation(bytes: &[u8], id: Uuid) -> Result { return Self::from_bytes_internal(bytes, id, true); } - fn from_bytes_internal( - bytes: &[u8], - id: Uuid, - validate: bool, - ) -> Result> { + fn from_bytes_internal(bytes: &[u8], id: Uuid, validate: bool) -> Result { let cursor = std::io::Cursor::new(bytes); return Self::load_with_reader(cursor, id, validate); } - pub fn load_with_validation(path: &str, id: Uuid) -> Result> { + pub fn load_with_validation(path: &str, id: Uuid) -> Result { return Self::load_internal(path, id, true); } - pub fn load(path: &str, id: Uuid) -> Result> { + pub fn load(path: &str, id: Uuid) -> Result { return Self::load_internal(path, id, false); } - fn load_internal(path: &str, id: Uuid, validate: bool) -> Result> { + fn load_internal(path: &str, id: Uuid, validate: bool) -> Result { let file = std::fs::File::open(path); let file = match file { Ok(file) => file, Err(e) => { - // TODO: Return a proper error - panic!("Error opening file: {:?}", e) + return Err(BlockLoadError::IOError(e)); } }; let reader = std::io::BufReader::new(file); return Self::load_with_reader(reader, id, validate); } - fn load_with_reader( - mut reader: R, - id: Uuid, - validate: bool, - ) -> Result> + fn load_with_reader(mut reader: R, id: Uuid, validate: bool) -> Result where R: std::io::Read + std::io::Seek, { @@ -335,22 +377,30 @@ impl Block { match res { Ok(_) => {} Err(e) => { - return Err(Box::new(e)); + return Err(BlockLoadError::ArrowLayoutVerificationError(e)); } } } - let mut arrow_reader = arrow::ipc::reader::FileReader::try_new(&mut reader, None) - .expect("Error creating reader"); - - let batch = arrow_reader.next().unwrap(); - // TODO: how to store / hydrate id? - match batch { - Ok(batch) => Ok(Self::from_record_batch(id, batch)), + let mut arrow_reader = match arrow::ipc::reader::FileReader::try_new(&mut reader, None) { + Ok(arrow_reader) => arrow_reader, Err(e) => { - panic!("Error reading batch: {:?}", e); + return Err(BlockLoadError::ArrowError(e)); } - } + }; + + let batch = match arrow_reader.next() { + Some(Ok(batch)) => batch, + Some(Err(e)) => { + return Err(BlockLoadError::ArrowError(e)); + } + None => { + return Err(BlockLoadError::NoRecordBatches); + } + }; + + // TODO: how to store / hydrate id? + Ok(Self::from_record_batch(id, batch)) } } @@ -358,12 +408,27 @@ fn get_size_of_array_data(array_data: &ArrayData) -> usize { let mut total_size = 0; for buffer in array_data.buffers() { // SYSTEM ASSUMPTION: ALL BUFFERS ARE PADDED TO 64 bytes - // We maintain this invariant in two places + // We maintain this invariant in three places // 1. In the to_arrow methods of delta storage, we allocate // padded buffers - // 2. In block load() we validate that the buffers are of size 64 + // 2. In calls to load() in tests we validate that the buffers are of size 64 + // 3. In writing to the IPC block file we use an option ensure 64 byte alignment + // which makes the arrow writer add padding to the buffers // Why do we do this instead of using get_buffer_memory_size() - // or using the buffers capacity? TODO: answer + // or using the buffers capacity? + // The reason is that arrow can dramatically overreport the size of buffers + // if the underlying buffers are shared. If we use something like get_buffer_memory_size() + // or capacity. This is because the buffer may be shared with other arrays. + // In the case of Arrow IPC data, all the data is one buffer + // so get_buffer_memory_size() would overreport the size of the buffer + // by the number of columns and also by the number of validity, and offset buffers. + // This is why we use the buffer.len() method which gives us the actual size of the buffer + // however len() excludes the capacity of the buffer which is why we round up to the nearest + // multiple of 64 bytes. We ensure, both when we construct the buffer and when we write it to disk + // that the buffer is also block.len() + padding of 64 bytes exactly. + // (As an added note, arrow throws away explicit knowledge of this padding, + // see verify_buffers_layout() for how we infer the padding based on + // the offsets of each buffer) let size = bit_util::round_upto_multiple_of_64(buffer.len()); total_size += size; } @@ -379,6 +444,57 @@ fn get_size_of_array_data(array_data: &ArrayData) -> usize { return total_size; } +/* +===== ErrorTypes ===== +*/ + +#[derive(Error, Debug)] +pub enum BlockSaveError { + #[error(transparent)] + IOError(#[from] std::io::Error), + #[error(transparent)] + ArrowError(#[from] arrow::error::ArrowError), +} + +impl ChromaError for BlockSaveError { + fn code(&self) -> ErrorCodes { + match self { + BlockSaveError::IOError(_) => ErrorCodes::Internal, + BlockSaveError::ArrowError(_) => ErrorCodes::Internal, + } + } +} + +#[derive(Error, Debug)] +pub enum BlockToBytesError { + #[error(transparent)] + ArrowError(#[from] arrow::error::ArrowError), +} + +impl ChromaError for BlockToBytesError { + fn code(&self) -> ErrorCodes { + match self { + BlockToBytesError::ArrowError(_) => ErrorCodes::Internal, + } + } +} + +#[derive(Error, Debug)] +pub enum BlockLoadError { + #[error(transparent)] + IOError(#[from] std::io::Error), + #[error(transparent)] + ArrowError(#[from] arrow::error::ArrowError), + #[error(transparent)] + ArrowLayoutVerificationError(#[from] ArrowLayoutVerificationError), + #[error("No record batches in IPC file")] + NoRecordBatches, +} + +/* +===== Layout Verification ===== +*/ + #[derive(Error, Debug)] pub enum ArrowLayoutVerificationError { #[error("Buffer length is not 64 byte aligned")] @@ -389,8 +505,6 @@ pub enum ArrowLayoutVerificationError { ArrowError(#[from] arrow::error::ArrowError), #[error(transparent)] InvalidFlatbuffer(#[from] flatbuffers::InvalidFlatbuffer), - #[error("No schema in footer")] - NoSchema, #[error("No record batches in footer")] NoRecordBatches, #[error("More than one record batch in IPC file")] @@ -399,27 +513,21 @@ pub enum ArrowLayoutVerificationError { InvalidMessageType, #[error("Error decoding record batch message as record batch")] RecordBatchDecodeError, - #[error("Record batch has no buffer blocks")] - NoBufferBlocks, } impl ChromaError for ArrowLayoutVerificationError { fn code(&self) -> ErrorCodes { match self { - ArrowLayoutVerificationError::BufferLengthNotAligned => ErrorCodes::Internal, - ArrowLayoutVerificationError::IOError(_) => ErrorCodes::Internal, - ArrowLayoutVerificationError::ArrowError(_) => ErrorCodes::Internal, - ArrowLayoutVerificationError::InvalidFlatbuffer(_) => ErrorCodes::Internal, - ArrowLayoutVerificationError::NoSchema => ErrorCodes::Internal, - ArrowLayoutVerificationError::NoRecordBatches => ErrorCodes::Internal, - ArrowLayoutVerificationError::MultipleRecordBatches => ErrorCodes::Internal, - ArrowLayoutVerificationError::InvalidMessageType => ErrorCodes::Internal, - ArrowLayoutVerificationError::RecordBatchDecodeError => ErrorCodes::Internal, - ArrowLayoutVerificationError::NoBufferBlocks => ErrorCodes::Internal, + // All errors are internal for this error type + _ => ErrorCodes::Internal, } } } +/// Verifies that the buffers in the IPC file are 64 byte aligned +/// and stored in Arrow in the way we expect. +/// All non-benchmark test code should use this by loading the block +/// with verification enabled. fn verify_buffers_layout(mut reader: R) -> Result<(), ArrowLayoutVerificationError> where R: std::io::Read + std::io::Seek, diff --git a/rust/worker/src/blockstore/arrow/provider.rs b/rust/worker/src/blockstore/arrow/provider.rs index 2ee8e951688..373fd728eb6 100644 --- a/rust/worker/src/blockstore/arrow/provider.rs +++ b/rust/worker/src/blockstore/arrow/provider.rs @@ -200,7 +200,13 @@ impl BlockManager { match block { Some(block) => { - let bytes = block.to_bytes(); + let bytes = match block.to_bytes() { + Ok(bytes) => bytes, + Err(e) => { + return Err(Box::new(e)); + } + }; + let key = format!("block/{}", id); let res = self.storage.put_bytes(&key, bytes).await; match res { @@ -334,7 +340,13 @@ impl SparseIndexManager { let as_block = index.to_block::(); match as_block { Ok(block) => { - let bytes = block.to_bytes(); + let bytes = match block.to_bytes() { + Ok(bytes) => bytes, + Err(e) => { + return Err(Box::new(e)); + } + }; + let key = format!("sparse_index/{}", id); let res = self.storage.put_bytes(&key, bytes).await; match res { diff --git a/rust/worker/test.arrow b/rust/worker/test.arrow deleted file mode 100644 index 0181f2bf47b4bc7b2206f76b4681267ab4c089ad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 67710 zcmeF)akvd-|L^ggP(4W=BuSEx5PQv(#$Jw6eWv@Lmd+s$epL@^VU(bA>D_gW^ zc5@=X^ zK1%Ys2>4UTf6ST6@W z&*`Pl+XR05DgV>B^7>-dGi-~ZTa5`pT9u{H=R$vX*V>7m6H}>Hmj^H>7kg3yA z6eUpx-6#$qC- zU^-@D9u{H=R$vX*V>7m6H}>Hmj^H>7aF042MNtxEP#%?09RXt0MFTWObF@NRbVOJ5 zKp*tSAPmI_jK+9O!c@$_Y|O_ZEX7K!#RhD_4(!2x9KunYKtX=k_!o+y6w0Cks-Ol! z)IvQpL=&_?YqUcrbVD!n#Q+S(FpR_)Ou%GJ!%WP<0xZUItin2M!Zz&0UL3$-90NZr z4Lq$|YE+W)MeKbN-v_u$~!3wOw zdThpa?8ZJE#1R}v0e)aP9Ys+RWl$cKQ5^wd)I|d{Msu`6TXaNM^gti<#~=*F2#m&f zOu|&mz--LNA}qy9ti=Xw!4B-fejLJ4oIt@dx&A1IQYecGsDc^@Q496Z5KYhmt4KpzZ3$Pf=u?p+33EQv}dvO4VaSZOpUl>JD9HmhX6;Tzr zh)^5#(Fje^5^c}{ozWe=(GLSL1j8{3V=)m^Fdefn4-2sbE3gLZu^HR38~bn&M{pbk z&f@x`C`zIX%A+!>BS4J0Xn@9Oj#g-kj_8UW=!5jH6jK?HQ#SF~Gd@RCJti)Puz!vPl9_+^<9K{I~ z{5RJh#ZU@mQ2|v@10iam9vY$vTA(%Bp%c2H7y4oV24fgTVhko=GNxfB=3oI9V>wn~ z9X4Sbc499M;4qHC{m@?+MNk~2Q4SSR6}gB|8}-o$P0>oy2J5jI+p!z_a1cju90f{n{ZSMpQ3mBv8PyRWMqM;OV>CxAv_(gB zMGy2re+T=#73Dh#?q`Q5cJfn1bn;g?U(rC0Kzq zSdY!vj@{UYgE)fYC{T*)kD@4vGANJAsEzY@P}qd8ikEjpqrdY}*bV-SX71V&># zCSfXOU^eDs5td>l)?x#;UCg%Ax|Qpaw$JLOnD@6SP2Uv_mI!Lof8j01U=3jKmmB zz+_CrOw7RoEXH!I!a8iiHtfV+9Kc~5L(chJe-uG+ltwvJL{;P>LT%JXBQ!-zv_S`S zMtAf^KMceW496&p#Y9ZObj-p$EW{G5z#6Q_W^Bi9?88AE!EqEQ%k@W5ltdYnM`ct; zfEabr0FBWctjJ6*Dj!^RWm^u@Y;s0b8&Gd$1pea1SMJ^)LMtw9wQ?x`IbU0~EWrw_!Fp`QcI?JJ9K;bEM*(Wdr=uuJq72HTGO8m$jJj}jzVp%{VD7>`MqiW!)V`B;RdSc$dRfGya8J=l*!IEoV}Sf1;TVkm{OsDLV{ zfe^J&4-L@-Ezlb6&>oy z2J5jI+p!z_a1cju90e+H{ZSMpQ3mBv8PyRWMqM;OV>CxAv_(gBMGy2re+HQ~ z3Ej{OeK7!oF$^Oy1`{wD(=ZcrumFp(9ILPno3IT#u@?t$7{}lW{e@8k#ZemNP!Uy; ziwLz*AC1ryEzt%Y&>7v)8~rd4LoghpFcuRr1=BGL^RN(0umWqa9-FZpyRi=kaRkRv zpc2;~MNtxEP#%?09RXt0MFTWObF@NRbVOJ5Kp*tSAPmI_jK+9O!c@$_Y|O_ZEX7K! z#RhD_4(!2x9KunYK*7pfe-uM0ltl$pK@Ehcg?ea+CTM}yXopVdhF<840T_&77>O~M zfXSGKnV5qGSd8UZg>~43ZP-dGi;0+m>6nFiScoN9fi+l<&Df6J*oT8Sg5xMqh3k)^D2XyCkIJZy z05R&K0UDz@TA?jEqAPl!5Bg&ehGGOpV>~8dDrR6d=3^0-VkOpM1GZoX_Fz8_;V4d^ z;H6xD6hkSLMFmtr4TPwLdT59yXo1#fhfe5*Ug(Pf7>r>Ui7}Xf$(V+jn1cmajOAE` zb=ZV$*onP3fWtTj_m6yqQ3Sxl~El5V$?+gG)8l@LR)l1 zSM)$1^v56!#R!bXcuc}n%)o5S$097nO02~OY{3rf!G0XVQJg@*>Rf*mLn)L+1yn%| zgs6pjXox0gf!1h;PUwbS=!*dujA0mwF_?hKn1-2{g9TWOT=#73Dh#?q`Q5cJfn1bn;g?U(rC0Kzq zSdY!vj@{UYgE)fYC{TmzkD@4vGANJAsEzY@P}qd8ikEjpqrdY}*bV-SX71V&># zCSfXOU^eDs5td>l)?x#;!2b6PoXP*zAN_AX^8e011Nmnl{|w}xf&4R&e+Kf;K>iuX zKLhz^ApZ>HpMm@{kbeg9&p`ee$Ug)5XCVI!iuXKLhz^ApZ>HpMm@{@c;G<{O|rp z@&AAJ-g8I&UuUKuFHgr=pmhXY9J7yQ=I|I%PEACpg9f+`EpQ9kqciS7A3TJ?coL&9 z4wLX2-ozZdk0tmNYq1gAunYU}8~(%z6gq|HLQn#(*WN{_f|`g>2Uo%M*}DO)(H@;~ z4_uGEhcFmV;u(y^OK|=5-ozZdk0tmNuD9MsY{gFegoF4KuCHF90=y@bKp9+wOW=Cy zrBEAJp)qcN>!;TqopBHDLw`I9*GumijKxcsiZ|i<=)I4RumYcB1Gd8T(EAAo@h488 z(5XD@gJL)ru76%7R7VoEa3x&tyk=;Hwzvb`a4%fnyhkt;Ph$*Tg!RmOo7eO4A(mq` zT)(_;umeBh01o4CxL$c@pcu|YIaGq{lb1v-)WbDsikski#-T%!S%)a1&8rB3KZf!!S%#D7v)e9)es2!H0r?hxVsK5a0@!13tWG@`!N8I;Yp0bIJn++uVE(UVj(_;>udJ~ zHewrgVIO{j>uGlqh506R7E0oLTm;w8t|lVXK?7U|*URn}bU+u}gFbi&u8-Z57=>|| zgxBDD*v-ZJSb|Tn78~LE*X_bS{Dz}AiGpV^_waAH-gOtEGA=`i%i;RgU5n;uh1<{x zcf<9pdk};01V-X{SiibAcs(2MVKF{|>s9v^w%~j0#R2>Q*Qf5(fAOC1Z=}WF?bPE@CIC8y7#acpI{Bv;~Ti1bU)$%{=jjZ zdM57)|3+z)LnTy460R5Bm1u-!XocHgedzAxbw50Up?Dgu2i=R9f;TW5^YI~E|GCvz zk8iL8Kf?8%`vb>uN)hHBisBrU#l>)a=Mso<1sdXd+z8im?sjxVPxQsZaQ)_giIG%;;EB7*{;VsO=2XH;*KEoH-gm1AMKg0Et`wKZmc~2;cb8tRfFS$#R zi!|!uYPddfH=+%0M;G+K{ct_x9>Z`vi}83Fu7BKHn2Uw@7%TAwT<^GVu^T_*5dOjm z6g)d8=S;Z1ap$2tE2dhT$1lueeuvJp=Dx0X~B3 z6ZbhbU@LZFFMfsV5%&)Y(tGVp6vugR{oyV_4TQKH^>Hm+Z@8P$4tJtE?t}G(dxF;^ z@jNEtRk)sT?_dEw!V0XxS8)B{zQyD2~#&5S4KmTpzg0Q6JZ$Ic|pa zfV-R5z40Ih;R(3@Z_i^QUd0T|#(QwR-#)<_e1$Fe9(;To2dp?N)TeUFd}e;Cj71juChcFJLlWhwJn9E*4=KR^dyy9&g{_2kgi1 zIEGXB;n($dI~%2N0V<*z5^%lUu0TUvkCwO^Y8(d;xo9uZC_(Me!zbGj-xn%Lg(;vFiM~dE@7;d+iyF#Y>oq8F&Y- z@7hOLfzPo4Tj6@H{e*+~1An6cz0A%)F`Nt6Z>TDu8taR<8LUbsGM zk6 z*n?ki2uE=eu3y?&D2ek?0hhw{N=u_I8sIv#z%6in(z>7r?#BQ;2J4YFj@Og$8fIcH zTz|BWu@YZk6SiR&TyM1Ba1Tu-#?&;qxh1G>Qap*_Uw z!FUp*Fb=L4+H07JxmbuL_!O=W+D2@{F6_f^a6QmYqVNUGJ(NHhTm;wutR^DVK?7U| z>wVUq*PU?>`rskBzGqKj6vkl^rs7Swo@ei42|mSIY=rA~whQ~<`knobzi<+T=p}Y0 z{*7~S0WQX+sEH7@P!A2!1TD}S?a&F`&@h%o( z307bY)?+iaV>kBUAdcW~oKl{jQ*jnb;5=N2N~nee(x{F4XoRL{i8knf&ghQb=!XG# z6i?u3Jck$XGG4=*cn9y{LoCHgti=Xw!4B-fejLJ4oWQ9Uac*%o&cXS(2$$e8B$2@t zXn@9Oj#g-kj_8UW=!53w!u1;5=N2N~nee z(x`*0&yo6Wr2HwWI z_y8Z{Q+$r~*o^JijeR(XBRGx%75NztMNtCh;X+hGH6)Nm9bAQL(G34d547jm{zbfk z*YOtS;(dICPp}$aViUGuC-&k14&xYdDsgU61jTVKF2KdO6g82;<+u{p;CkGM)@X-L z=!RbCivbvnVHk-qaDC8T!mD@#Z{uBjfRFJhKF3%12HUY4`*09Pa2y3Hb8b--B~b`dIJc;Y6fVb=xCYnbM%;q-=!93fH0;Zo;j&9bM2Jz0nT?F$BXg3S%)5 zQ!pKGVJ_asNB9J*@g=^-x7dZBZ~%vK3^|vQTNFWYltwvJL{-#83YX(bT!ZUzBW^)^ z+=;vKpY%j~hwb0PhggQs@C7zvE565%_yvb>6em!yD(4o(Pzq&H0aZ`~Nn~&ZuEuq^ z0XO3|+=08$6MfJhgD?~$FdE}A2~#lxv*G%py^oLZ30C7ve2s6h3qRpk{DEW0sm8fQ z5fn#hltV>SMJ^&-jw^8uuE&kI1?_Ps?#8`%00S@>!!QzKFaeV>4KpzZ3-BS9;WK=J zjo6Cs@gsi0@AwPzw3g6&6{D7bF8~(&UC~z6yzfcq6nFiScoN9fi+l<&G;6(@DqN;ANU)m1pMBBvrq!(p&TlrDsmB_ zHtM4hnxZA_{~wU=g8!{Q{C_s)BH?6Tn``Vi_vN*^M9Nb5ta51H%- zeYM|l!hfcHCA6=E_Lb1S653Zn`$}kE3GGW?7U+Ei+E?HgYF~l&6=+|9_7!Mff%X+> zU)nyYeI>Q8r1q8c>$I<=_LbDWlG;~N`$}qG+A!3V(lx|zGCew*1lrxE7rbZ?JL&4 zV*lmZm;YP)%4lC1?JJ{wWwfu1_Lb4TGTK*0`^seZlKo;OHK93>$o0#zJ4`72iQMcC z6S>(PCUUboOyp*Fn8?lUFp;Yr>Z|=O75+p*`%?H53V%Z3PbmBeg+HP2Clr4FWeR^n z;ZG?1i6DER357qQ@Fx`hgu5X^Lw_1+3*Jnf1vOO3V)#R2MT|n@N2RJ3V)#R2MT|n z@COQipzsF@f1vOO3V)#R`#}}{K;aJ*{y^am6#hWr4;21D;SUu4K;ifAT;UHC{y^am z6#hWr4;21D;SUu4K;aJ*et-B1f1vOO3V)#R2MT|n@COQipzsF@f1vRDbExnK3V)#R z2MT|n@COQipzsF@f1vOO3ctS)3V)#R2MT|n@COQipzsF@f1vOO3V)#R`>UDBhCfjF z1BE|O_ydJMQ1}CdKT!Asg+EaEvk_7vk_vxP;ZG|3NrgYD@Fx}iq{5$6_>&5MQsGyu zk_vxP;ZG|3NrgYD@Fx}iq{5$6_>&5MQsGxNlL~)Q;ZG|3NrgG793~Z^q^^Ea7cQxD zp4915z?1$^{k!x7*t+a@{-na6RQQt$e^TL3D*T!yNrgYD@Fx}iq{5$6_>&5MQsGZ3 z{7HpBsqp*J6#k^bpH%ph3V%}JPb&OLg+HnACl&sr!tdX*!k<+5lL~)Q;ZG|3NrgYD z@Fx}iq{5$6`2BG!{7HpBsqiNi{-na6RQQt$e^TL3D*Q=>-=92%KdJC175=2cpH%ph z3V%}JPb&OLg+HnA`zxUECl&sr!k<+5lL~)Q;ZG|3NrgYD@Fx}i?Df`F3>E%R;SUx5 zP~i_1{!rl$75-4+4;B7U;a3zwg+EmILxn$7_(O$1RQN-MKUDZbg+EmIm8VeQ4;B7U z;SUx5P~i_1{!rl$75-4+4;6leGF13Og+EmILxn$7rbERsRFXo4Ak_5>b?TMu(4RJc z#Qx3tacq<9k3FHnA1eHz!XGO9njWFTA1eHz!XGO9p~4?3{Gq}hD*U0sA1eHQ7==Gn z_(O$1RQN-MKUDZbg+EmILxn$7`2BlT_(O$1RQN-MKUDZbg+EmILxn$7_(O%?AF#q7 zD*U0sA1eHz!XGO9p~4?3{Gq}hD*XPeDg2?rA1eHz!XGO9p~4?3{Gq}hD*U0spS?Ib z^C^WtrSPW|{*=O>QutE}e@fv`Df}sgKc(>N@}(61l)|4<_)`jhO5sl_{3(S$rSPW| z{*=P645SqPl)|4<_)`jhO5sl_{3(S$rSPW|{*=P6Af*)kl)|4<_)`jhO5sl_{3(S$ zrSPW|{*=P6G^P~(l)|4<_)`jhO5sl_+9{Wg_JI%BAxP=z@I&T;Ql@P;cUz7 z&-^KcKc(=e6n@Q%l)|4<_)`jhO5sl_{3(S$rSPW|{*=O>QuzJ;75QutE}e@fv`Df}sgKYJ#1wjzZ;QurfVM+$$W@cSJr{E@;RDg2SbA1VBi!XGL8k-{G-{E@=% z-<`rADg2SbA1VBi!XGL8k-{G-{E@;RDg4>P))A%^{Usg+HzErxpIR!msG075=ospH}$O3V&MRPb>Usg+HzErxpIR!ms?L75=ospH}$O z3V&MRPb>Usg+HyJrt5%|IF>$26Jw8F2MkXHE93V&MR zPb>Usg+HzErxpIR!k&tnkMQf2{Dw3V*Ed#|nR}@GE$+!XGRAvBDoK{IS9xEBvv-A1nN^ z@*XRyN^k6A<`d#CuD>q+l=`#b55&I*`|@m&Q&S*T_+y1XR`_FuKUVl-g+EsKV}(Cf z_+y1XyF=|Mqwr@G{*1z(QTQ_oe@5ZYDEt|PKcnzx6n?#(jKZH$_%jNBM&ZvW{27Hm zqwr@G{*1z(QTTOC8HGQi@MjeMjKZH$_%jNBM&ZvW{27Hmqwwp*WEB34!khQAXjlk5dv@^fN6=GG&S0!JCeAV$)##a?TI}=pH zR|#Jgd=>CjzgPKQ)q551Rl8T|K0AkkY?@WKSJ_@wdll_fvscMp6?+xzRj<#^z91W6 z73)>2SE*i=dKKzbr&pO?ReBZav-38{=2jJY73fu;S9xC5c@^hXn^$RGmHF(953*rZ zSzc9n73EcvS4mzKc@^YUk5@T9`@JH_CRDX}mEu*2S0P??c$MK*g;xgXz?tBS57x@zbup{s(f0($oQcQTts zRnJvCSM6M-b5+h&I9J_VWph={v)}oX*)*zTu8O$|=Bk&gT&`-lish=6t5lx7XC$*} zRGnO9a#hJyBv*}GC302BRUlV=JbSlEX49zRxN74njjJ-Q!no?R~1}EaMi$70#^lG1#s2BRsL4>J9}qT z?OUa9RlZgDR^3}=Z&kfj^j6JVCGYG#RRwR=yH)O1wOhq*)w)&cR;62oZq>Q7cVkt# zRpeHUTP1E)xK-d*eOu*iRku~#&fdpW+75L%dQaN1U+W5|Dr&2yt&+AX+S#7~RL@p9 zTh(k8vsKGhDO;6n6|z;wRvA0{vqP9oqiWbHVXK0z0=DYcDqpL5t>U$6*V&&s!fYB< zxK`a-WouQfRkT*kS|w{$tW~hi{=5=q)2M2-iq)!Bt5mH@wF=d$Q>#p^Ds}cJoiLk5 zm8eyrR)JdeX_co{omO#LwP}^6-3iW4Xw{`vmR40-MQPQfRgzXkS_Nsj@pH+QU@maNJm7Z03R^d7O^PI}g zsyeIateUe*&Z;=8;H-MH%FU`aXMYk@ty!gJRhm_3R-IX8W>uM0WLAw?CFbnUkSZ{z zbUt)YdS}|b-^LY0RasVHS#{;?PoJtPtEjA+vP#OTD662Xda}yNswS(Loc*~ql})26 z$tonPj;u1Ws>muLtA?x+vMR{gpLkQ*G^%{8>amK)svWCztje(p$Eq8vY@GdBIh9SL zYQ`!Vt75EzvFgPt7pq#VVzFw)?#yK8qYA~U6RS+DDzS>hsu8P1tO~IT#HtTxe>zun zSjA!0hE*C?Wmtt_)rD0SR#jL<;q1@#DhaD1tb(xW!72x<8mwZlYQZW6s}h|3jX-r^ zm4Q_SRuNb=V3mMX0agK6^~B7*?W(k^ z%B~8#s_Uw(tE#Swx~l1_q?`SnDaxi%^<0&6Rn1i~SG8P~a#hJyAy;+W>~C99HjOIc zs)nl)t}3`H;HrMB@~x`3D&D$7%1$R$ZdJHd-Bx8=Rc%$YRn1lf!38&x${#aPv1Rf<(5R)tvAVO54z6;?&q?C-HE!KwnQ z0<7w8vQWZ;8E6r9VR4G-VRCQ97 zNmV6PkyJHOl}J?~Re>~H&ro?()ln5kRU1`lRFzQ`MpYM8SyWZgY$ZfBQI$kh5miA{ z^-z^VRSi`!RJBl*LbEj#6+%@9RT)%OP!&N{162uB6;KsGRsYOZUsU~6@l&-=l|EJZ zRN+%~PnA7Y^;FR_TenfkGu9>5b0n5kN( zN|~x;s*tHVX0|$|DyE8aO zshXuqma150D`Kiws$8jRrHYlRRjO2}N~H>ws#B^=nXRR%NU0j7N|dTlsz9mwq{@@3 zPO3Pm+GMs0r^=)Xld4OqEUBubijt~HswAn3qzaPR`kl&=sz$09sam8;k*Y+h5UDz( z%8;r;W-EWHL8=6)3Zx2>{VC2xzyHLo16M_n0$}vSs^TUctvkxB+c&2fE{aJdB}u2IKJxW?(KB z;S;RI*Z2;5@f(ie)KeG}=b#)eK`t`560WD^jcAKI(E|@)5L_?I(RdNBVkX|jVz?fb z>+lU+|H_|n82`Zat}KdkQ685fz~yLw>(L6XUu75c!h;x$r!WStPvtbcjRp7^pJ6>* zf6CqX1xIiar=QC6YAAz?Q4JyLpdnm8%GS6Yt{3Hf7=Xv&`cRI=%b1RLun^0z1{<** z)_d{@uTSFif{ckWxEQYIWD0fR`b{=RYupakYw|t}z~dN+ahQxZFb4~<3~R6v+wmg~ z;xD+ql4qhM%Ayi#AdPys7AT_+<~8P2(E8r zK|K7jjPcVH=_f(;$HNJ>j60e&%^b9d>ym#K9=G$e2Fb^eINJZ51hbhr}Hc+ z&O-%MLlU)dHJajPxIT|}p*LKQ$02wc&%^b1d>ymldOI$~YHYx_*n?m3CvwiQ*gZ(U&2((#JgCGPq7ZJ=i*NMjKlZ`h507ydM%!d^0*WM zE=L2n9*eEe9~9KE^7n$5!mdFF1meIQ=ZX7oZF-Ms%)fGKzr^RNh?U@gAJ4*Uey3-NEbK8RH0q z;x=?fPxQkhcmkvF0;b?i%)^IRfiJKb-{U77!ryRx4$nd20^ z507Fvo`dUY_!{2Ae7JswEAb^h^1IDkKqQ=IPwD30^t`V(G;6zanDCTxyd;QA8Y zjXro7L-7p8!Sy4219R{Jmg945g6l)L7r)^z6yS%DGf@(*|6pa*gzG(cB^u*Kw1Mk8 zcsKgM^&A|EXD}YGUoIsL{rrl+^%pFM%BTs~TkuLW#*Ju; zJJACVU=W7kSxmsIn2C3>7@xxR5&Q-_u@A0?;BmPAfkkmH%HvW5xE!u;U=!SgcIbj$ zco42%;8Pd_*DG)uX5l@!K7pTMJ+@*ue!&r(#Od_oD~9t>0o4$q4jQ5vTu;E;(G9)v z5QgAsxL$xSV>;f!LM+1?xE_GtVh?`BQJg}ry?>zu%Ayji?_V8WH$*eIo`1Ka8(hD? z0eBoEF%Bg;<6)*of`;5w5@AUpR$cdjCQRl!fc-R|9F(!?kFETjBcob;tb} zh$k=#FJKDZ#5}kjexG11zQzvxghOz>`wG%4?<|zUg}4N{a6S9#<2tm&ZRm`i=!Zw~ zBu3*!OvPJpJ^DVvO1S=fTd)g1!}aF-2ZiZ_=lb%M#zm-#Bx>VoT#r_0kFK~E{qY!F zAHL@?39bj<+gN~);rj1=g>Co&2k!}Zx4 zf#>0R?7fcJcppo#8XMqx>-~TO_!BwwusZ|A;d<&-#AR^(^y=aoG>7Y@*AaK44<5!) zJcIF=j5ja`A7D8?$0mG-y>NZ=j-dek>CQw+T!6}O{qka5iN?4QZP5v?PhMX{!S%np39k2D7r4H64`MKe z!}YtH2-oXw7T&`We1`Sdirx4bzvDOx(J!tj&P91#iU60R0j|eQXooK71=qW7FkIid zF>pQWror{Adk?Nx-Dhxp>bAo5sQU%3Kix^3UY?&ta2_h48bZ`TLo`Ec+>UOz4-a7o zp2k?bjOlm>3$YAquo2&44}Qf_oN^K03s3@OQ3*AWhU+QU2sfY&?m&0kkAZjsqwoS= z!3@mBB7B0i_!>L#6As~ToLYf<3C=+|T!LI=P#@R9^@O_(ccKR#z#zCjaHHXRz)gkg z|27}4_uC4r#n;$@pKu6&qu|AS|3N8Sh$_g1>+e<{uD4rD+=kBRiGFw#!!a5!Vk+Lk ze0+qJ_!3*N3;S^x|DbS1zW<;!E<#l#Q5#pIDO#aDy5e5+$72`)*N<%yTralSSb&eQ z3SYtXU;6>B_u8Lueb>%_>$z42uHRa9xL#{@;rgsK$1UiHyU_=(x7y=yebvTcGF(5k zIdHwymc#W?+XUA`?MJx&X@9}>PCFB>Z`uW@jGBm153W~Q3%EXMop2BO;t>qPvzUO% zaDCC<#(Qu*(LTi&aQ)D}!;knCf8qoRU&8!EDU?HH)Ifx~Xo#k`3Af=++=Kh^FdoO# zcpfjo^*Wo0d02$ySc47NhTU-e%?{%@POZYRqXf>!#c(~%lDHgK;X2%iHt2|MaDB}B zV=!C~v(cCU*S~BA-ht~~_Ax$#>s$5>zK82s_8b1fDVOs70>$8Zm6b;oEiM2IKHDUdP**kHuJlwb+F1*n=%aB4HT#f5-6I^ewJ8=*CU;u`|^#mJ(iE#bEW?~*(FR2lnD1jv^@1eaDBBlVLNui_0;+uf5Y|DIuj*uJ}yQzBylwZej+07EbWV=xh~ z;!VuO2Uv#H_zGLG3qRv`97BPSpQlh9Wl#ZC5ug_8qcNJ}X0%5a^uz;r1W(`@jKj-# z9kVbWi?IT0u?gF;2M2HjCr~KGIlwu%0F`hVQmBKgaXoHATXaHq^uYiO!3d1OL`=m@ z%*6*-hSm5ATd@m2<9GaxQzOm+ilYoFpeh2?LVYww3$(%QxC{5|K;wW;`Ti%glU+C`B;n< z_#7MYEq=f+_yhl-P%UzZb8rDFqB=snlLKEN`p z#(HeQPVB=W97BQ2$svlP3@V@sY9fs*a1EN_X0%5a^uz;r1Vb?rV=)QSFbngs7%Q+A zoA51uz%Td%|DaH9&H>KB1*n9}5TZ63pb1){Ejpn)`d|Qt;3+(Z7x60I#9VxUWmt`` zuoXM84~K9J1?q4PP#k4Y0aXzogDY_@Zon)JI0hVAT)?qVt zU@s2hPn^VQ^*9GO7Z>6Z)I=Ir;2JbVE3`vr^gv$>#88aHSWLoecnk01LwthIu@T?m z2mFFRa2y4%3e(k`|r%n{+-a@{*&Cik9jX_gWSMxnVYm1p}k1ii^yK2?L}-aGX90_ zFJU7j@|M^L2^%3{BP48ugpH7}5fU~+!bY&&1U5ooBLsP?Y=po@2yBGFMhI+#z(xpc z1pDfwjgYhvk~TszZyOsSX(J?Ugrtp-v=Ndvf_+hFBZM|WXd{F+LYTL)jS$)hp^XsQ z2%(K&+oo)Ul#P(G5mGin%0@`#eU*)nvJp}?Ldr(4O(Gj1vJoO1A+ix78zHh0qP#D+ z5h5EQvJq@e+D1s*2x%K3Z6l;@gtU#2wh_{K`>_$yHi9jPZG_lHh;4+}Mu=^M*hYwL zgxE%i^LA(>>3sBK~+*wm&EsI9UQY--aD)K=LDwi}z; zbOrUB*wm&os4cN$u&GUdP+MikU{jkep?;NpwM}h$h1wb$+@>}iLv58EgH3JvhT1AS z2AkS>2X0fF4x)Zro7!{{wIwz}Xd~Fvrkkj3VFyY5o~JHXVkBAn@ZPeD-S+c23&rw@t$6!;N&ZD-?Escm8-*wm(PsjadRY--cJ)K>XD`@Ps~-~nfCz%$PFB5xBv5Dz=sDnAg9Jlhg~ zC_MOVOKc$9pG|EZg7%x()aFrWTVf+v5P2rrR@n%4_IWtkud+9qcQ7`!1Do1`O>G{R zwlA_{u&K>+)3(Zv!KQZJSKHL)F>1f9-L-j`+LqV|cGu>qYFlL^*j<~)tNkkbBAePg zXKicj@N8=Hz_qQiW3Z{slh?M&j=`q3eJkTBY`=+3ZJx!pC3XyU*XE&YTV=;!cWs`| zwpDfvwuwz`p49f6*j=0FwQY$VgWa`xaNAbdG1y(3C%FA8TVqq3N4af{9fM76p6Rw# zb__PPdAQqF*)iDEw(sgZ=%t}S_B++O&Bcx>F3_7tfpfO>AoO)VVFOW3Z{s z@5ZJ!PpR7y8^NY_-WS=_=Am`J zt=+YGblsNNS+cu!(xx_#vD-E_f=%tbcVknVr`r9tHnn-S-Imx0Hnn-k-B#HMHnn-$ z-LJAuY-;n^yREV7U{jmt-))thC7aqj2yd(G7;I|ioj;q}JQwe`wYxS?$J-Jc!S32g zo7y}nZ`;@iHnr`0E>F$-O>AoO?7S_p5o~Jn5WTIkQ(#k@r|E5#ou#~wHiAuU9 z+SKL|ds|{7*wp4(ds}5A*wpr4V@cxCdu_?n_x8dM#54G|B=3v-Ks=0ZtNcJbl5b1= zAy`CtHs7zYsm&Amw!}uTsm-JMw#r7Zsm(L{w#r7Z-PqLT`F+2MO>LgywDqCYyn7^fAx|v@)<4ejpke*b;vzv^B6Leju6~*b+YwEe`w=+n-Hs zdL7sr8^P|{G(E6YHiF%?>3(3VYy`V&+f9c~2!0cr+Vn%PC3XyU*QPOot+HdVsZECj zTV=;!Uu{#HZV7%9o7(hDuqAd3HnmfB*QS4hZDS+YUEAgoy%hW=Hnr)fU`uQSo7%Ki zuvK;pHnr)oV5{sHY}>p`Zc{szcLHr{r}Ey8-L+G8*QNu5-?2??`Y`w<_9kp<(~!Z| z*fH4DrZa=BviD_En-&eW%8tRNcHY=FwdvU4x3#-AeH(0vjbL}}lud1VIM_Bef=%tb zgR-ejO9#KLO>G)G*b*DTrZycOY?X~*Q=3)~epTK_yV5qbY5QQC*a$YY>HlD>Yy_Ly zG=i{Ie$Rdjy9zXg&<6B{uor$HxTI!Vs~xY zN7xb@!S33TO>J69*futTO>Mg+&{o24VpE&u61K!fu&GU#30q~yU{jlR6Sm5Z!M@t2 zHmxW8CN{NcKw(Sl7;I`sHnnL+VcXaUHnlD3G^X&I*wm&yg)OlWY--b|!dBTa*wm(B zg{`t)4alOVpE$I7PiEW!KOC7ENqn>gUte(TKHAANnWUIYDamO$fkCbcffYn zj%;d2HnnMaVLP@FY;2p_w7>A1*wm&8hApvUu&Eu{)TSYZZDS+Y)Xp2yrZ)XC{I)i= z>5^efYy_Lyw9BwnHiAuU`eyi5c^~ac+tj9qhHYXa*wm((hOM#@Y--b0!&dn{`=RX| z(P2Xy&}742_&~U?+_B3=aYy_LywBxW+H~`< zC3XxpwbM4W>Fie3wQ2fctLzwTYSaD0R@pJwwl=lt z1mZWbsZBo+TVlsxQ#)-_o8BO{jg4Sa+hz`ZLi{E+wdodOOKb$2+O!R^Rdx(Ewdo&X ztLzwT?7Z08)K2Gxz@~OO@08lqPUpQro7!oskkezt4`oBz)TY^pt+8XUsZG}rTV=;! zQ=9f9wkmIjHbUM|O>An@)x?(A2sX88Z(^%#1e@CQIk8nX zg6+npHa$=LCN{O{d}2%N7;I|C_PaJ+P;46;!G71ai%CZmzllw4`l8qp8^M0pra_9W zvSYBRO{WxFWyfG|%ceG6Q~V}2wdtK=OY%0h5$tzu`l#48HiG@GZFd%Ws`yQ8YSUT8 zme>gP134{LY?U2@O>KIu*eW{)+r*|eeOLS@Hnr)#VoU58Y--1L*QOJTZDS+Y59Bss z=*r?Zv8hdO7F%K?*wm&;i>HVTDY5!s`{6I8; zu_b;WTEW;7f3j!@V@v!%w1x3YY=1Vj=?`OTYy`V&(q##Y%e*wm&kjjgg{ux)K>)1$_3VpE$=HMYc# z!KQY`?%H&%v2APwyK7tO=wRbFv8hcT8(U%{*j<~3Hnz%+!KOByZETetgN-^8Xiop5Z4jbKxomN>S`j=`ojy>V=n z9Yfwn8^NY_CNIb~wKI9Kuph`Xc_-PXb|&vF`>CzB>F0v|qqhBEo@>b-wB4?LW28{cinCrl+@G?|18`F`d2ra=%+Yhy6UI!?#xZ z32gs(ZU1&{|8z~SZ@-;?w|?r<@!K!=@75orpSJc-*7W|?W`54n|J#4=ck3rCUBLZv zzgs_B{r3>Mforv&s&oYRpZj6`Jf$zVU+&+npQLmL_sjje^_%$#N|$i0_S2J2;r?^~ zZvEU0{M4jlxL@yg>*uBa(S^?8TJ0w#-NXIoez$%`(nH)Y_wUwEM>>i7<^J9J(fm}j ze}<-~xHj|i(EbgY&fsj6^cVNb{cioIe*V#CT&w-;qu03q+@AzL_2@Y6m;1Nn z=N)~=wcI}Vli=r^{d+SV$o*!1w|>6ShukmsyY-WeZsghJ`YInU`&VZBM`rqxYa>6` z=uPfF*QrSQe}uAsUZzL6U+;IWL#I=@7W?}q-OByvesn+A=vnTU>pk1QDbu;!FZa9k zU+(7`9n7`b&q2DF`_KKm^>Z!hb3`|DzuxcG|NQIc5?#%;+TS7VpOBMz>-}#1-7)Fs z+W)BQj&UQaf+!4kFqRxxg1`aWK(gJhGiC)yldu91WQd3bu>=AZF#!u;2`pm%e)W|_ z!XUd{RreiN-+qVN?UK&f>2WIOuGg*QxVD4i^v-9Q_q%DHi*158yE`&Y`&=EYWw^G} z;#AOQ8Lp*-F5dO_7S~clR|l(%Yda!N8-12}XS?&^6w>AQI(v(2DWy+c8rOC%oMO7N zm(07_bkpU*;#%tI@?eFymWuk6aV;fvu}#3W^wj0Gg=?v*PktZV3AnZc;Pllk1B+{^ ztjmMNwY1jH{H%5#?AUjwzNxOyB3w&(T|8Uyf@|ro%g=L9gMH$8>O1RAiCr43b1gk~ zd9cp4RN3XhI@jE#sk2WR*HUN~&sL<;E(VKhJK0UGU0t^>!?je~r|euyxm^qv*IsZf z1$T9@&NUoL$$iSswU>Psxc2hCPPe$WGu)KjU9VfqaBau8X}r%eT-(`gO7HS>Kcx09 z2J2kg-->HHvrYMZmg8FL?_#h%LZ$&P4;I%P&S}9<**nbi;KlY9*HVR-*DbE44KHsK zgt)fT*;L|Z8Lp)iF9wTi>BY-04%mIrEw1fEHud33VuBA#Z2J1a!+Vt|e#kCab<#nqITuZ5b%D9$V zy?EDwb=Rj_uMXBD#@BYCh%{~dxeV8KKAJ+lJXl;yC;!Y()z0oF`e(*V5t_?>ex!mL|VCSX^_Hr_DcQTuYx{ z3>Mc?>6h0nuBFv4uUqB4BYa-lq2}9el5_2C+i%ih#jsxmoan0SB^T3pGE%yO2SX|44Kwh`FmJ@-zZk2H@M}jFk z*K#Eg>lW9pxRyJCI#^u8m0Sv@>^>m=&sT`;gdujOAL z)-A5(Vj%BX;#yt?@-|@~z~b5t5%V>e<+zr&ff%g6MCNcHKX?=Abw`IgIn3?gT)cC` z91p|;e&E{940AqE2R<*<&I@xum@=;Af*=NqYlm|!Hw1OCKCjuyToI;>YdIr`!TP+G zKZ5*h#ZCuvNs!mA%Wy5Xgel`%jtOGj;@aW!TFwdTV0~WmI_ID;Wn9ZeK@8UCwY(JM zb&G2`D#+{BGF;19Vam9cyMkD^xOVtEWG)NpV0~WmfBf7Qri^PjE{MT8*YaJE*DbE) zz96q#zYW)NVVJUWEhh%CZgK6vwHz7L!QvW@m>vCMn^#QxlD+`I@j`= zkk<~bpqviW9H#fKhI}RU!BO0=g;3%TkLzk_kSPG|I$e7 k=U;#O@uy#YPq(h0zq^0?&*SAmqTIiIef;$4`yamj0$#$~UjP6A From 107cc42ab008860d2b32e1f74c1ff0dda79340c6 Mon Sep 17 00:00:00 2001 From: hammadb Date: Mon, 1 Jul 2024 14:20:13 -0700 Subject: [PATCH 07/10] cleanup --- rust/worker/src/blockstore/arrow/block/delta_storage.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/worker/src/blockstore/arrow/block/delta_storage.rs b/rust/worker/src/blockstore/arrow/block/delta_storage.rs index 965e3681af0..7d5e77e1f04 100644 --- a/rust/worker/src/blockstore/arrow/block/delta_storage.rs +++ b/rust/worker/src/blockstore/arrow/block/delta_storage.rs @@ -9,7 +9,6 @@ use arrow::{ Int32Array, Int32Builder, ListBuilder, RecordBatch, StringBuilder, StructArray, UInt32Builder, }, - buffer::{Buffer, MutableBuffer}, datatypes::{Field, Fields}, util::bit_util, }; From e03cb300b0c0e5d5543fe48f1006e1292da4b29c Mon Sep 17 00:00:00 2001 From: hammadb Date: Mon, 1 Jul 2024 14:24:34 -0700 Subject: [PATCH 08/10] Remove tests redudndant with those in delta.rs --- .../src/blockstore/arrow/block/types.rs | 90 ------------------- 1 file changed, 90 deletions(-) diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index 54d5c7082c5..a7c909524f6 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -666,93 +666,3 @@ where Ok(()) } - -// #[derive(Error, Debug)] -// pub enum FinishError { -// #[error("Arrow error")] -// ArrowError(#[from] arrow::error::ArrowError), -// } - -// impl ChromaError for FinishError { -// fn code(&self) -> ErrorCodes { -// match self { -// FinishError::ArrowError(_) => ErrorCodes::Internal, -// } -// } -// } - -// #[cfg(test)] -// mod test { -// use super::*; -// use crate::blockstore::types::Key; -// use arrow::array::Int32Array; - -// #[test] -// fn test_block_builder_can_add() { -// let num_entries = 1000; - -// let mut keys = Vec::new(); -// let mut key_bytes = 0; -// for i in 0..num_entries { -// keys.push(Key::String(format!("{:04}", i))); -// key_bytes += i.to_string().len(); -// } - -// let prefix = "key".to_string(); -// let prefix_bytes = prefix.len() * num_entries; -// let mut block_builder = BlockDataBuilder::new( -// KeyType::String, -// ValueType::Int32Array, -// Some(BlockBuilderOptions::new( -// num_entries, -// prefix_bytes, -// key_bytes, -// num_entries, // 2 int32s per entry -// num_entries * 2 * 4, // 2 int32s per entry -// )), -// ); - -// for i in 0..num_entries { -// block_builder -// .add( -// BlockfileKey::new(prefix.clone(), keys[i].clone()), -// Value::Int32ArrayValue(Int32Array::from(vec![i as i32, (i + 1) as i32])), -// ) -// .unwrap(); -// } - -// // Basic sanity check -// let block_data = block_builder.build().unwrap(); -// assert_eq!(block_data.data.column(0).len(), num_entries); -// assert_eq!(block_data.data.column(1).len(), num_entries); -// assert_eq!(block_data.data.column(2).len(), num_entries); -// } - -// // #[test] -// // fn test_out_of_order_key_fails() { -// // let mut block_builder = BlockDataBuilder::new( -// // KeyType::String, -// // ValueType::Int32Array, -// // Some(BlockBuilderOptions::default()), -// // ); - -// // block_builder -// // .add( -// // BlockfileKey::new("key".to_string(), Key::String("b".to_string())), -// // Value::Int32ArrayValue(Int32Array::from(vec![1, 2])), -// // ) -// // .unwrap(); - -// // let result = block_builder.add( -// // BlockfileKey::new("key".to_string(), Key::String("a".to_string())), -// // Value::Int32ArrayValue(Int32Array::from(vec![1, 2])), -// // ); - -// // match result { -// // Ok(_) => panic!("Expected error"), -// // Err(e) => { -// // assert_eq!(e.code(), ErrorCodes::InvalidArgument); -// // } -// // } -// // } -// // } From bd967758360065e910064a9827be482c0e63fb67 Mon Sep 17 00:00:00 2001 From: hammadb Date: Mon, 1 Jul 2024 14:28:11 -0700 Subject: [PATCH 09/10] Cleanup --- rust/worker/src/blockstore/arrow/block/types.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index a7c909524f6..ad16946fa6f 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -335,10 +335,15 @@ impl Block { Ok(bytes) } + /// Load a block from bytes in Arrow IPC format with the given id pub fn from_bytes(bytes: &[u8], id: Uuid) -> Result { return Self::from_bytes_internal(bytes, id, false); } + /// Load a block from bytes in Arrow IPC format with the given id and validate the layout + /// ### Notes + /// - This method should be used in tests to ensure that the layout of the IPC file is as expected + /// - The validation is not performant and should not be used in production code pub fn from_bytes_with_validation(bytes: &[u8], id: Uuid) -> Result { return Self::from_bytes_internal(bytes, id, true); } @@ -348,10 +353,15 @@ impl Block { return Self::load_with_reader(cursor, id, validate); } + /// Load a block from the given path with the given id and validate the layout + /// ### Notes + /// - This method should be used in tests to ensure that the layout of the IPC file is as expected + /// - The validation is not performant and should not be used in production code pub fn load_with_validation(path: &str, id: Uuid) -> Result { return Self::load_internal(path, id, true); } + /// Load a block from the given path with the given id pub fn load(path: &str, id: Uuid) -> Result { return Self::load_internal(path, id, false); } From 81e3ab8b70e7c3a71fb581a3448ea2030959c63b Mon Sep 17 00:00:00 2001 From: hammadb Date: Mon, 1 Jul 2024 14:30:16 -0700 Subject: [PATCH 10/10] cleanup --- .../src/blockstore/arrow/block/types.rs | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/rust/worker/src/blockstore/arrow/block/types.rs b/rust/worker/src/blockstore/arrow/block/types.rs index ad16946fa6f..c87b34471a4 100644 --- a/rust/worker/src/blockstore/arrow/block/types.rs +++ b/rust/worker/src/blockstore/arrow/block/types.rs @@ -14,6 +14,8 @@ use std::io::SeekFrom; use thiserror::Error; use uuid::Uuid; +const ARROW_ALIGNMENT: usize = 64; + /// A block in a blockfile. A block is a sorted collection of data that is immutable once it has been committed. /// Blocks are the fundamental unit of storage in the blockstore and are used to store data in the form of (key, value) pairs. /// These pairs are stored in an Arrow record batch with the schema (prefix, key, value). @@ -275,13 +277,16 @@ impl Block { // We force the block to be written with 64 byte alignment // this is the default, but we are just being defensive let mut writer = std::io::BufWriter::new(file); - let options = - match arrow::ipc::writer::IpcWriteOptions::try_new(64, false, MetadataVersion::V5) { - Ok(options) => options, - Err(e) => { - return Err(BlockSaveError::ArrowError(e)); - } - }; + let options = match arrow::ipc::writer::IpcWriteOptions::try_new( + ARROW_ALIGNMENT, + false, + MetadataVersion::V5, + ) { + Ok(options) => options, + Err(e) => { + return Err(BlockSaveError::ArrowError(e)); + } + }; let writer = arrow::ipc::writer::FileWriter::try_new_with_options( &mut writer, @@ -657,15 +662,15 @@ where let mut prev_offset = blocks.get(0).offset(); for block in blocks.iter().skip(1) { let curr_offset = block.offset(); - let len = curr_offset - prev_offset; - if len % 64 != 0 { + let len = (curr_offset - prev_offset) as usize; + if len % ARROW_ALIGNMENT != 0 { return Err(ArrowLayoutVerificationError::BufferLengthNotAligned); } prev_offset = curr_offset; } // Check the remaining buffer length based on the body length let last_buffer_len = record_batch_body_len - prev_offset as usize; - if last_buffer_len % 64 != 0 { + if last_buffer_len % ARROW_ALIGNMENT != 0 { return Err(ArrowLayoutVerificationError::BufferLengthNotAligned); } }