Commit 76e846a

Prevent compression from converging into low-quality compressions (#3092)
File size before this change is about 21.18 GB; after it is 19.23 GB, roughly a 9% reduction. The issue we found is that for large files, we tend to converge into not compressing, or settling for mediocre compressions. With this change, if the encoding is canonical or otherwise unsatisfactory, we don't keep that state and instead try again from scratch on the next chunk.
1 parent 0478e7c commit 76e846a

File tree

1 file changed: vortex-file/src/strategy.rs (+14, −6)
@@ -147,12 +147,12 @@ impl LayoutWriter for BtrBlocksCompressedWriter {
                 let ratio = canonical_nbytes as f64 / encoded_chunk.nbytes() as f64;

                 // Make sure the ratio is within the expected drift, if it isn't we fall back to the compressor.
-                if ratio > prev_compression.ratio / COMPRESSION_DRIFT_THRESHOLD {
+                if ratio > (prev_compression.ratio / COMPRESSION_DRIFT_THRESHOLD) {
                     Some(encoded_chunk)
                 } else {
                     log::trace!(
                         "Compressed to a ratio of {ratio}, which is below the threshold of {}",
-                        prev_compression.ratio * COMPRESSION_DRIFT_THRESHOLD
+                        prev_compression.ratio / COMPRESSION_DRIFT_THRESHOLD
                     );
                     None
                 }
@@ -171,10 +171,18 @@ impl LayoutWriter for BtrBlocksCompressedWriter {
                 let canonical_chunk = chunk.to_canonical()?;
                 let canonical_size = canonical_chunk.as_ref().nbytes() as f64;
                 let compressed = BtrBlocksCompressor.compress_canonical(canonical_chunk)?;
-                self.previous_chunk = Some(PreviousCompression {
-                    chunk: compressed.clone(),
-                    ratio: canonical_size / compressed.nbytes() as f64,
-                });
+
+                if compressed.is_canonical()
+                    || ((canonical_size / compressed.nbytes() as f64) < COMPRESSION_DRIFT_THRESHOLD)
+                {
+                    self.previous_chunk = None;
+                } else {
+                    self.previous_chunk = Some(PreviousCompression {
+                        chunk: compressed.clone(),
+                        ratio: canonical_size / compressed.nbytes() as f64,
+                    });
+                }
+
                 compressed
             }
         };
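
Below is a minimal, self-contained sketch of the decision logic after this change, assuming COMPRESSION_DRIFT_THRESHOLD is a constant greater than 1.0 (its actual value is not shown in this diff). The Writer type and the within_drift/record helpers are hypothetical stand-ins for illustration, not the Vortex API; only the constant name and PreviousCompression mirror the diff.

// Hypothetical value; the real constant lives in vortex-file/src/strategy.rs.
const COMPRESSION_DRIFT_THRESHOLD: f64 = 1.2;

struct PreviousCompression {
    // canonical_size / compressed_size achieved when first compressed.
    ratio: f64,
}

// Hypothetical stand-in for the writer's reuse state.
struct Writer {
    previous_chunk: Option<PreviousCompression>,
}

impl Writer {
    // First hunk: keep reusing the previous encoding only while the new
    // chunk's ratio stays within the allowed drift of the original ratio.
    fn within_drift(&self, ratio: f64) -> bool {
        self.previous_chunk
            .as_ref()
            .is_some_and(|prev| ratio > prev.ratio / COMPRESSION_DRIFT_THRESHOLD)
    }

    // Second hunk: after compressing from scratch, remember the result only
    // if it is a compression worth converging on. A canonical (uncompressed)
    // or sub-threshold result is forgotten, so the next chunk retries.
    fn record(&mut self, is_canonical: bool, ratio: f64) {
        if is_canonical || ratio < COMPRESSION_DRIFT_THRESHOLD {
            self.previous_chunk = None;
        } else {
            self.previous_chunk = Some(PreviousCompression { ratio });
        }
    }
}

fn main() {
    let mut w = Writer { previous_chunk: None };

    // A 3.0x compression is kept; later chunks may reuse its encoding as
    // long as they still achieve more than 3.0 / 1.2 = 2.5x.
    w.record(false, 3.0);
    assert!(w.within_drift(2.6));
    assert!(!w.within_drift(2.4));

    // A canonical fallback drops the state, preventing the writer from
    // converging on not-compressing.
    w.record(true, 1.0);
    assert!(w.previous_chunk.is_none());
}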
