@@ -255,6 +255,13 @@ fn decompress_batch(
     unpack_10(bitpacked, bitpacked_output);
     for_decompress(cast_u32_as_i32(bitpacked_output), reference, for_decoded);
     alp_decompress(for_decoded, exponents, alp_decoded);
+
+    // Cast f32 output to u32 for filtering.
+    // SAFETY: f32 and u32 have the same size and alignment.
+    let alp_as_u32 = unsafe {
+        std::slice::from_raw_parts_mut(alp_decoded.as_mut_ptr() as *mut u32, alp_decoded.len())
+    };
+    let _kept = filter_scalar(alp_as_u32);
 }

 /// In-place batch decompression that reuses a single buffer for all stages.
@@ -280,6 +287,12 @@ fn decompress_in_place_batch(

     // Stage 3: ALP decode in-place (transmute i32 → f32).
     f32::decode_slice_inplace(buffer_i32, exponents);
+
+    // Cast f32 output to u32 for filtering.
+    // SAFETY: f32 and u32 have the same size and alignment.
+    let output_as_u32 =
+        unsafe { std::slice::from_raw_parts_mut(output.as_mut_ptr() as *mut u32, output.len()) };
+    let _kept = filter_scalar(output_as_u32);
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -308,7 +321,7 @@ fn decompress_pipeline(
     let for_chunk = &mut for_buffer[..N];

     let mut input_offset = 0;
-    let mut output_offset = 0;
+    let mut output_write_offset = 0; // Track where to write filtered output.

     // Process each 1024-element chunk.
     while input_offset < bitpacked.len() {
@@ -328,18 +341,27 @@ fn decompress_pipeline(
             }
         }

-        // Stage 3: ALP decompression.
+        // Stage 3: ALP decompression directly into the output buffer.
+        // We decompress into the output buffer starting at output_write_offset.
         // SAFETY: Buffer sizes and output bounds are verified.
         unsafe {
-            let output_chunk = output.get_unchecked_mut(output_offset..output_offset + N);
+            let output_chunk =
+                output.get_unchecked_mut(output_write_offset..output_write_offset + N);
             for i in 0..N {
                 let for_decoded = *for_chunk.get_unchecked(i);
                 *output_chunk.get_unchecked_mut(i) = f32::decode_single(for_decoded, exponents);
             }
         }

+        // Stage 4: Filter the chunk in the output buffer.
+        // Note: filter_scalar modifies the data in-place, compacting it.
+        let output_chunk =
+            unsafe { output.get_unchecked_mut(output_write_offset..output_write_offset + N) };
+        let kept_count = filter_scalar(output_chunk);
+
+        // The filtered data is now compacted at output_write_offset.
+        output_write_offset += kept_count;
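+        // The next chunk is decoded starting at output_write_offset, so already-compacted
+        // results are never overwritten, and kept_count <= N keeps every write within `output`.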
         input_offset += S;
-        output_offset += N;
     }
 }

@@ -368,7 +390,7 @@ fn decompress_pipeline_extra_copy(
     let alp_chunk = &mut alp_buffer[..N];

     let mut input_offset = 0;
-    let mut output_offset = 0;
+    let mut output_write_offset = 0; // Track where to write filtered output.

     // Process each 1024-element chunk.
     while input_offset < bitpacked.len() {
@@ -397,13 +419,18 @@ fn decompress_pipeline_extra_copy(
             }
         }

-        // Stage 4: Copy from intermediate ALP buffer to final output.
-        // SAFETY: Buffer sizes are verified to be N.
-        let output_chunk = unsafe { output.get_unchecked_mut(output_offset..output_offset + N) };
-        output_chunk.copy_from_slice(alp_chunk);
+        // Stage 4: Filter the intermediate ALP buffer.
+        let kept_count = filter_scalar(alp_chunk);
+
+        // Stage 5: Copy filtered data from intermediate ALP buffer to final output.
+        // SAFETY: Buffer sizes are verified and kept_count <= N.
+        let output_chunk = unsafe {
+            output.get_unchecked_mut(output_write_offset..output_write_offset + kept_count)
+        };
+        output_chunk.copy_from_slice(&alp_chunk[..kept_count]);
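+        // Only the kept_count surviving elements are copied out, so the cost of the extra copy
+        // shrinks with the filter's selectivity.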

+        output_write_offset += kept_count;
         input_offset += S;
-        output_offset += N;
     }
 }

@@ -420,12 +447,13 @@ fn decompress_in_place_pipeline(
     debug_assert_eq!(output.len(), bitpacked.len() * T / W);

     let mut input_offset = 0;
-    let mut output_offset = 0;
+    let mut output_write_offset = 0; // Track where to write filtered output.

     while input_offset < bitpacked.len() {
         // Get the current chunk of the output buffer to work on.
         // SAFETY: Output bounds are verified by debug_assert.
-        let output_chunk = unsafe { output.get_unchecked_mut(output_offset..output_offset + N) };
+        let output_chunk =
+            unsafe { output.get_unchecked_mut(output_write_offset..output_write_offset + N) };

         // Reinterpret the output chunk as u32 for unpacking.
         // SAFETY: f32 and u32 have the same size and alignment.
@@ -457,11 +485,52 @@ fn decompress_in_place_pipeline(
             }
         }

+        // Stage 4: Filter the chunk in-place.
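+        // filter_scalar is generic over T: Copy, so the f32 chunk can be filtered directly,
+        // without the u32 cast used by the batch paths.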
+        let kept_count = filter_scalar(output_chunk);
+
+        output_write_offset += kept_count;
         input_offset += S;
-        output_offset += N;
     }
 }

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Filter Functions
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Hardcoded mask for now.
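+// A set bit keeps the corresponding element. 0xDEADBEEF fits in 32 bits, so on 64-bit targets at
+// most 32 of every 64 elements survive; black_box keeps the compiler from folding the mask away.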
+
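+/// Compact `data` in place, keeping only the elements selected by the hardcoded mask, and return
+/// the number of elements kept. `data.len()` must be a multiple of `usize::BITS`.
+///
+/// For example, with a mask word of 0b0110_1100 the loop copies elements 2..4 and then elements
+/// 5..7 of that word-sized block, so four elements survive.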
+fn filter_scalar<T: Copy>(data: &mut [T]) -> usize {
+    let len = data.len();
+    assert!(len.is_multiple_of(usize::BITS as usize));
+
+    let iters = len / usize::BITS as usize;
+
+    let mut read_ptr = data.as_ptr();
+    let mut write_ptr = data.as_mut_ptr();
+    let initial_write_ptr = write_ptr;
+
+    for _ in 0..iters {
+        let mut word: usize = std::hint::black_box(0xDEADBEEF);
+
+        while word != 0 {
+            // The next run of set bits starts at `bit_pos` and is `span` bits long.
+            let bit_pos = word.trailing_zeros();
+            let span = (word >> bit_pos).trailing_ones();
+
+            // Copy the whole run of kept elements at once. `ptr::copy` tolerates the overlap
+            // between the trailing write region and the read region.
+            unsafe {
+                std::ptr::copy(read_ptr.add(bit_pos as usize), write_ptr, span as usize);
+                write_ptr = write_ptr.add(span as usize);
+            }
+
+            // Clear the run we just consumed; `checked_shr` guards against a shift by
+            // `usize::BITS` when the run reaches the top bit.
+            word = match word.checked_shr(bit_pos + span) {
+                Some(rest) => rest << (bit_pos + span),
+                None => 0,
+            };
+        }
+
+        unsafe { read_ptr = read_ptr.add(usize::BITS as usize) };
+    }
+
+    // Return the number of elements kept.
+    unsafe { write_ptr.offset_from(initial_write_ptr) as usize }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Bitpacking Functions
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -650,22 +719,46 @@ fn verify(
 /// Compare outputs from different decompression functions.
 ///
 /// Ensures that all decompression strategies produce identical results.
-fn compare_outputs(function_name: &str, expected: &[f32], actual: &[f32]) {
-    assert_eq!(
-        expected.len(),
-        actual.len(),
-        "{}: Output length mismatch: expected={}, actual={}",
-        function_name,
-        expected.len(),
-        actual.len()
-    );
-
-    for i in 0..expected.len() {
-        assert_eq!(
-            expected[i], actual[i],
-            "{}: Output mismatch at index {}: expected={}, actual={}",
-            function_name, i, expected[i], actual[i]
-        );
+/// Filtering should produce the same results whether applied chunk-by-chunk
+/// or all at once. Both expected and actual should already be filtered.
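+/// Only the first `expected_len` elements are compared; past that point both buffers hold
+/// leftover, unfiltered data.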
+fn compare_outputs(function_name: &str, expected: &[f32], actual: &[f32], expected_len: usize) {
+    // Both buffers should have the same allocated size.
+    assert_eq!(actual.len(), expected.len());
+
+    // Only compare the filtered portion of the data.
+    let expected_slice = &expected[..expected_len];
+    let actual_slice = &actual[..expected_len];
+
+    for i in 0..expected_len {
+        if expected_slice[i] != actual_slice[i] {
+            // Debug output to understand the mismatch.
+            eprintln!(
+                "Mismatch at index {}: expected={}, actual={}",
+                i, expected_slice[i], actual_slice[i]
+            );
+            if i > 0 {
+                eprintln!(
+                    " Previous values: expected[{}]={}, actual[{}]={}",
+                    i - 1,
+                    expected_slice[i - 1],
+                    i - 1,
+                    actual_slice[i - 1]
+                );
+            }
+            if i + 1 < expected_len {
+                eprintln!(
+                    " Next values: expected[{}]={}, actual[{}]={}",
+                    i + 1,
+                    expected_slice[i + 1],
+                    i + 1,
+                    actual_slice[i + 1]
+                );
+            }
+            panic!(
+                "{}: Output mismatch at index {}: expected={}, actual={}",
+                function_name, i, expected_slice[i], actual_slice[i]
+            );
+        }
     }
 }

@@ -767,7 +860,17 @@ mod correctness_verification {
     #[divan::bench(consts = VERIFICATION_SIZES)]
     fn verify_all_methods<const SIZE: usize>(bencher: Bencher) {
         bencher.bench_local(|| {
-            let (input_data, mut buffers) = setup(SIZE);
+            let (mut input_data, mut buffers) = setup(SIZE);
+
+            // Create a filtered version of the original values for comparison.
+            // SAFETY: f32 and u32 have the same size and alignment.
+            let original_as_u32 = unsafe {
+                std::slice::from_raw_parts_mut(
+                    input_data.original.as_mut_ptr() as *mut u32,
+                    input_data.original.len(),
+                )
+            };
+            let expected_filtered_len = filter_scalar(original_as_u32);
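+            // This compacts input_data.original in place; only its first expected_filtered_len
+            // values are meaningful from here on.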

             // Run batch decompression (our reference implementation).
             decompress_batch(
@@ -780,12 +883,13 @@ mod correctness_verification {
             );

             // Verify batch decompression is correct.
+            // Note: for_decoded is not filtered, but alp_decoded is filtered.
             verify(
                 "batch",
                 &buffers.for_decoded,
                 &buffers.alp_decoded,
                 &input_data.alp_encoded,
-                &input_data.original,
+                &input_data.original, // This is now filtered.
                 &input_data.patches,
             );

@@ -798,7 +902,12 @@ mod correctness_verification {
                 &mut buffers.for_decoded,
                 &mut buffers.pipeline_output,
             );
-            compare_outputs("pipeline", &buffers.alp_decoded, &buffers.pipeline_output);
+            compare_outputs(
+                "pipeline",
+                &buffers.alp_decoded,
+                &buffers.pipeline_output,
+                expected_filtered_len,
+            );

             // Run in-place batch decompression and compare with batch.
             decompress_in_place_batch(
@@ -811,6 +920,7 @@ mod correctness_verification {
                 "in_place_batch",
                 &buffers.alp_decoded,
                 &buffers.alp_decoded_inplace_batch,
+                expected_filtered_len,
             );

             // Run in-place pipeline decompression and compare with batch.
@@ -824,6 +934,7 @@ mod correctness_verification {
                 "in_place_pipeline",
                 &buffers.alp_decoded,
                 &buffers.alp_decoded_inplace_pipeline,
+                expected_filtered_len,
             );
         });
     }