Commit 6624f31
ArrayNBytes includes size of arrays metadata (#1549)
1 parent 9146a45 commit 6624f31

8 files changed: +14 -33 lines changed

docs/quickstart.rst

Lines changed: 3 additions & 3 deletions
@@ -35,7 +35,7 @@ Vortex array:
 >>> parquet = pq.read_table("_static/example.parquet")
 >>> vtx = vortex.array(parquet)
 >>> vtx.nbytes
-141024
+141070

 Compress
 ^^^^^^^^
@@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the

 >>> cvtx = vortex.compress(vtx)
 >>> cvtx.nbytes
-15243
+17780
 >>> cvtx.nbytes / vtx.nbytes
-0.108...
+0.126...

 Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
 cache and RAM.
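With metadata now counted, the compressed-to-uncompressed ratio works out to 17780 / 141070 ≈ 0.126, versus 15243 / 141024 ≈ 0.108 before this change.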

pyvortex/src/array.rs

Lines changed: 2 additions & 2 deletions
@@ -520,10 +520,10 @@ impl PyArray {
     ///
     /// >>> arr = vortex.array([1, 2, None, 3])
     /// >>> print(arr.tree_display())
-    /// root: vortex.primitive(0x03)(i64?, len=4) nbytes=33 B (100.00%)
+    /// root: vortex.primitive(0x03)(i64?, len=4) nbytes=36 B (100.00%)
     ///   metadata: PrimitiveMetadata { validity: Array }
     ///   buffer: 32 B
-    ///   validity: vortex.bool(0x02)(bool, len=4) nbytes=1 B (3.03%)
+    ///   validity: vortex.bool(0x02)(bool, len=4) nbytes=3 B (8.33%)
     ///     metadata: BoolMetadata { validity: NonNullable, first_byte_bit_offset: 0 }
     ///     buffer: 1 B
     /// <BLANKLINE>
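The new figures follow the same accounting: the root's 36 B is its 32 B buffer plus the 3 B validity child plus what appears to be 1 B of PrimitiveMetadata, and the validity child's 3 B is its 1 B buffer plus what appears to be 2 B of BoolMetadata.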

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

vortex-array/src/metadata.rs

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@ use crate::encoding::Encoding;
 /// Note that this allows us to restrict the ('static + Send + Sync) requirement to just the
 /// metadata trait, and not the entire array trait. We require 'static so that we can downcast
 /// use the Any trait.
-/// TODO(ngates): add Display
 pub trait ArrayMetadata:
     'static + Send + Sync + Debug + TrySerializeArrayMetadata + Display
 {

vortex-array/src/nbytes.rs

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ impl ArrayData {
         self.encoding()
             .accept(self.as_ref(), &mut visitor)
             .vortex_expect("Failed to get nbytes from Array");
-        visitor.0
+        visitor.0 + size_of_val(self.array_metadata())
     }
 }
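To make the new rule concrete, here is a minimal standalone sketch. It does not use Vortex's actual types: FakeBoolMetadata and the free-standing nbytes function are invented for illustration. The rule is simply that an array's byte count is the sum of its buffer lengths plus the in-memory size (size_of_val) of its metadata value.

// Sketch only: a stand-in metadata struct, not Vortex's real BoolMetadata.
use std::mem::size_of_val;

#[derive(Debug)]
#[allow(dead_code)]
struct FakeBoolMetadata {
    validity: u8,
    first_byte_bit_offset: u8,
}

// An array's nbytes = sum of its buffer lengths + in-memory size of its metadata value.
fn nbytes(buffers: &[&[u8]], metadata: &FakeBoolMetadata) -> usize {
    let buffer_bytes: usize = buffers.iter().map(|b| b.len()).sum();
    buffer_bytes + size_of_val(metadata)
}

fn main() {
    // Four boolean validity values pack into a single byte of buffer.
    let validity_buffer = [0b0000_1011u8];
    let meta = FakeBoolMetadata { validity: 0, first_byte_bit_offset: 0 };
    // 1 B of buffer + 2 B of metadata = 3 B, matching the updated tree_display doctest.
    assert_eq!(nbytes(&[&validity_buffer], &meta), 3);
}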

vortex-sampling-compressor/src/compressors/mod.rs

Lines changed: 0 additions & 7 deletions
@@ -183,13 +183,6 @@ impl<'a> CompressionTree<'a> {
         std::mem::take(&mut self.metadata)
     }

-    pub fn num_descendants(&self) -> usize {
-        self.children
-            .iter()
-            .filter_map(|child| child.as_ref().map(|c| c.num_descendants() + 1))
-            .sum::<usize>()
-    }
-
     #[allow(clippy::type_complexity)]
     pub fn into_parts(
         self,

vortex-sampling-compressor/src/lib.rs

Lines changed: 2 additions & 13 deletions
@@ -9,7 +9,7 @@ use compressors::roaring_bool::RoaringBoolCompressor;
 use compressors::roaring_int::RoaringIntCompressor;
 use compressors::struct_::StructCompressor;
 use compressors::varbin::VarBinCompressor;
-use compressors::{CompressedArray, CompressionTree, CompressorRef};
+use compressors::{CompressedArray, CompressorRef};
 use vortex_alp::{ALPEncoding, ALPRDEncoding};
 use vortex_array::array::{
     PrimitiveEncoding, SparseEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
@@ -126,16 +126,8 @@ impl Objective {
         base_size_bytes: usize,
         config: &CompressConfig,
     ) -> f64 {
-        let num_descendants = array
-            .path()
-            .as_ref()
-            .map(CompressionTree::num_descendants)
-            .unwrap_or(0) as u64;
-        let overhead_bytes = num_descendants * config.overhead_bytes_per_array;
-        let size_in_bytes = array.nbytes() as u64 + overhead_bytes;
-
         match &config.objective {
-            Objective::MinSize => (size_in_bytes as f64) / (base_size_bytes as f64),
+            Objective::MinSize => (array.nbytes() as f64) / (base_size_bytes as f64),
         }
     }
 }
@@ -153,8 +145,6 @@ pub struct CompressConfig {
     max_cost: u8,
     // Are we minimizing size or maximizing performance?
     objective: Objective,
-    /// Penalty in bytes per compression level
-    overhead_bytes_per_array: u64,

     // Target chunk size in bytes
     target_block_bytesize: usize,
@@ -172,7 +162,6 @@ impl Default for CompressConfig {
             sample_count: 16,
             max_cost: constants::DEFAULT_MAX_COST,
             objective: Objective::MinSize,
-            overhead_bytes_per_array: 64,
             target_block_bytesize: 16 * mib,
             target_block_size: 64 * kib,
             rng_seed: 0,
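Because nbytes now charges every array for its own metadata, the sampling compressor no longer needs the overhead_bytes_per_array heuristic (or the num_descendants helper that fed it), and the MinSize objective reduces to the plain ratio array.nbytes() / base_size_bytes.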

vortex-sampling-compressor/tests/smoketest.rs

Lines changed: 5 additions & 5 deletions
@@ -125,7 +125,7 @@ mod tests {
         assert_eq!(chunk.encoding().id(), FoREncoding::ID);
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some(Scalar::from((chunk.len() * 8) as u64))
+            Some(Scalar::from((chunk.len() * 8) as u64 + 1))
         );
     }

@@ -138,7 +138,7 @@ mod tests {
         assert_eq!(chunk.encoding().id(), BoolEncoding::ID);
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some(Scalar::from(chunk.len().div_ceil(8) as u64))
+            Some(Scalar::from(chunk.len().div_ceil(8) as u64 + 2))
         );
     }

@@ -154,7 +154,7 @@ mod tests {
         );
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some(Scalar::from(1392640u64))
+            Some(Scalar::from(1392677u64))
         );
     }

@@ -167,7 +167,7 @@ mod tests {
         assert_eq!(chunk.encoding().id(), VarBinEncoding::ID);
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some(Scalar::from(134357000u64))
+            Some(Scalar::from(134357018u64))
         );
     }

@@ -180,7 +180,7 @@ mod tests {
         assert_eq!(chunk.encoding().id(), DateTimePartsEncoding::ID);
         assert_eq!(
             chunk.statistics().get(Stat::UncompressedSizeInBytes),
-            Some((chunk.len() * 8).into())
+            Some((chunk.len() * 8 + 4).into())
         )
     }
 }
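The expected UncompressedSizeInBytes values each grow by a small constant, which appears to be the metadata size of that chunk's encoding (for example +1 for the FoR chunk and +2 for the Bool chunk).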
