Commit 6624f31
ArrayNBytes includes size of arrays metadata (#1549)
1 parent 9146a45 commit 6624f31

8 files changed: +14 -33 lines changed

docs/quickstart.rst

Lines changed: 3 additions & 3 deletions
@@ -35,7 +35,7 @@ Vortex array:
 >>> parquet = pq.read_table("_static/example.parquet")
 >>> vtx = vortex.array(parquet)
 >>> vtx.nbytes
-141024
+141070

 Compress
 ^^^^^^^^
@@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the

 >>> cvtx = vortex.compress(vtx)
 >>> cvtx.nbytes
-15243
+17780
 >>> cvtx.nbytes / vtx.nbytes
-0.108...
+0.126...

 Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
 cache and RAM.
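With metadata now counted, the compressed-to-uncompressed ratio works out to 17780 / 141070 ≈ 0.126, versus 15243 / 141024 ≈ 0.108 before this change.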

pyvortex/src/array.rs

Lines changed: 2 additions & 2 deletions
@@ -520,10 +520,10 @@ impl PyArray {
     ///
     /// >>> arr = vortex.array([1, 2, None, 3])
     /// >>> print(arr.tree_display())
-    /// root: vortex.primitive(0x03)(i64?, len=4) nbytes=33 B (100.00%)
+    /// root: vortex.primitive(0x03)(i64?, len=4) nbytes=36 B (100.00%)
     ///   metadata: PrimitiveMetadata { validity: Array }
     ///   buffer: 32 B
-    ///   validity: vortex.bool(0x02)(bool, len=4) nbytes=1 B (3.03%)
+    ///   validity: vortex.bool(0x02)(bool, len=4) nbytes=3 B (8.33%)
     ///     metadata: BoolMetadata { validity: NonNullable, first_byte_bit_offset: 0 }
     ///     buffer: 1 B
     /// <BLANKLINE>
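The new figures follow the same accounting: the root's 36 B is its 32 B buffer plus the 3 B validity child plus what appears to be 1 B of PrimitiveMetadata, and the validity child's 3 B is its 1 B buffer plus what appears to be 2 B of BoolMetadata.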

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

vortex-array/src/metadata.rs

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@ use crate::encoding::Encoding;
 /// Note that this allows us to restrict the ('static + Send + Sync) requirement to just the
 /// metadata trait, and not the entire array trait. We require 'static so that we can downcast
 /// use the Any trait.
-/// TODO(ngates): add Display
 pub trait ArrayMetadata:
     'static + Send + Sync + Debug + TrySerializeArrayMetadata + Display
 {

vortex-array/src/nbytes.rs

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ impl ArrayData {
         self.encoding()
             .accept(self.as_ref(), &mut visitor)
             .vortex_expect("Failed to get nbytes from Array");
-        visitor.0
+        visitor.0 + size_of_val(self.array_metadata())
     }
 }
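To make the new rule concrete, here is a minimal standalone sketch. It does not use Vortex's actual types: FakeBoolMetadata and the free-standing nbytes function are invented for illustration. The rule is simply that an array's byte count is the sum of its buffer lengths plus the in-memory size (size_of_val) of its metadata value.

// Sketch only: a stand-in metadata struct, not Vortex's real BoolMetadata.
use std::mem::size_of_val;

#[derive(Debug)]
#[allow(dead_code)]
struct FakeBoolMetadata {
    validity: u8,
    first_byte_bit_offset: u8,
}

// An array's nbytes = sum of its buffer lengths + in-memory size of its metadata value.
fn nbytes(buffers: &[&[u8]], metadata: &FakeBoolMetadata) -> usize {
    let buffer_bytes: usize = buffers.iter().map(|b| b.len()).sum();
    buffer_bytes + size_of_val(metadata)
}

fn main() {
    // Four boolean validity values pack into a single byte of buffer.
    let validity_buffer = [0b0000_1011u8];
    let meta = FakeBoolMetadata { validity: 0, first_byte_bit_offset: 0 };
    // 1 B of buffer + 2 B of metadata = 3 B, matching the updated tree_display doctest.
    assert_eq!(nbytes(&[&validity_buffer], &meta), 3);
}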

vortex-sampling-compressor/src/compressors/mod.rs

Lines changed: 0 additions & 7 deletions
@@ -183,13 +183,6 @@ impl<'a> CompressionTree<'a> {
         std::mem::take(&mut self.metadata)
     }

-    pub fn num_descendants(&self) -> usize {
-        self.children
-            .iter()
-            .filter_map(|child| child.as_ref().map(|c| c.num_descendants() + 1))
-            .sum::<usize>()
-    }
-
     #[allow(clippy::type_complexity)]
     pub fn into_parts(
         self,

vortex-sampling-compressor/src/lib.rs

Lines changed: 2 additions & 13 deletions
@@ -9,7 +9,7 @@ use compressors::roaring_bool::RoaringBoolCompressor;
 use compressors::roaring_int::RoaringIntCompressor;
 use compressors::struct_::StructCompressor;
 use compressors::varbin::VarBinCompressor;
-use compressors::{CompressedArray, CompressionTree, CompressorRef};
+use compressors::{CompressedArray, CompressorRef};
 use vortex_alp::{ALPEncoding, ALPRDEncoding};
 use vortex_array::array::{
     PrimitiveEncoding, SparseEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
@@ -126,16 +126,8 @@ impl Objective {
         base_size_bytes: usize,
         config: &CompressConfig,
     ) -> f64 {
-        let num_descendants = array
-            .path()
-            .as_ref()
-            .map(CompressionTree::num_descendants)
-            .unwrap_or(0) as u64;
-        let overhead_bytes = num_descendants * config.overhead_bytes_per_array;
-        let size_in_bytes = array.nbytes() as u64 + overhead_bytes;
-
         match &config.objective {
-            Objective::MinSize => (size_in_bytes as f64) / (base_size_bytes as f64),
+            Objective::MinSize => (array.nbytes() as f64) / (base_size_bytes as f64),
         }
     }
 }
@@ -153,8 +145,6 @@ pub struct CompressConfig {
     max_cost: u8,
     // Are we minimizing size or maximizing performance?
     objective: Objective,
-    /// Penalty in bytes per compression level
-    overhead_bytes_per_array: u64,

     // Target chunk size in bytes
     target_block_bytesize: usize,
@@ -172,7 +162,6 @@ impl Default for CompressConfig {
             sample_count: 16,
             max_cost: constants::DEFAULT_MAX_COST,
             objective: Objective::MinSize,
-            overhead_bytes_per_array: 64,
             target_block_bytesize: 16 * mib,
             target_block_size: 64 * kib,
             rng_seed: 0,
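Because nbytes now charges every array for its own metadata, the sampling compressor no longer needs the overhead_bytes_per_array heuristic (or the num_descendants helper that fed it), and the MinSize objective reduces to the plain ratio array.nbytes() / base_size_bytes.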

vortex-sampling-compressor/tests/smoketest.rs

Lines changed: 5 additions & 5 deletions
@@ -125,7 +125,7 @@ mod tests {
         assert_eq!(chunk.encoding().id(), FoREncoding::ID);
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some(Scalar::from((chunk.len() * 8) as u64))
+            Some(Scalar::from((chunk.len() * 8) as u64 + 1))
         );
     }

@@ -138,7 +138,7 @@ mod tests {
         assert_eq!(chunk.encoding().id(), BoolEncoding::ID);
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some(Scalar::from(chunk.len().div_ceil(8) as u64))
+            Some(Scalar::from(chunk.len().div_ceil(8) as u64 + 2))
         );
     }

@@ -154,7 +154,7 @@ mod tests {
         );
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some(Scalar::from(1392640u64))
+            Some(Scalar::from(1392677u64))
         );
     }

@@ -167,7 +167,7 @@ mod tests {
         assert_eq!(chunk.encoding().id(), VarBinEncoding::ID);
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some(Scalar::from(134357000u64))
+            Some(Scalar::from(134357018u64))
         );
     }

@@ -180,7 +180,7 @@ mod tests {
         assert_eq!(chunk.encoding().id(), DateTimePartsEncoding::ID);
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some((chunk.len() * 8).into())
+            Some((chunk.len() * 8 + 4).into())
         )
     }
 }
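The expected UncompressedSizeInBytes values each grow by a small constant, which appears to be the metadata size of that chunk's encoding (for example +1 for the FoR chunk and +2 for the Bool chunk).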
