|
| 1 | +use std::cmp; |
| 2 | + |
| 3 | +use arrow_buffer::BooleanBuffer; |
| 4 | +use itertools::Itertools; |
| 5 | +use vortex_array::stats::{ArrayStatistics as _, Stat, StatisticsVTable, StatsSet}; |
| 6 | +use vortex_array::validity::{ArrayValidity as _, LogicalValidity}; |
| 7 | +use vortex_array::variants::PrimitiveArrayTrait; |
| 8 | +use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _}; |
| 9 | +use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType}; |
| 10 | +use vortex_error::VortexResult; |
| 11 | +use vortex_scalar::Scalar; |
| 12 | + |
| 13 | +use crate::{RunEndArray, RunEndEncoding}; |
| 14 | + |
| 15 | +impl StatisticsVTable<RunEndArray> for RunEndEncoding { |
| 16 | + fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> { |
| 17 | + let maybe_stat = match stat { |
| 18 | + Stat::Min | Stat::Max => array.values().statistics().compute(stat), |
| 19 | + Stat::IsSorted => Some(Scalar::from( |
| 20 | + array |
| 21 | + .values() |
| 22 | + .statistics() |
| 23 | + .compute_is_sorted() |
| 24 | + .unwrap_or(false) |
| 25 | + && array.logical_validity().all_valid(), |
| 26 | + )), |
| 27 | + Stat::TrueCount => match array.dtype() { |
| 28 | + DType::Bool(_) => Some(Scalar::from(array.true_count()?)), |
| 29 | + _ => None, |
| 30 | + }, |
| 31 | + Stat::NullCount => Some(Scalar::from(array.null_count()?)), |
| 32 | + _ => None, |
| 33 | + }; |
| 34 | + |
| 35 | + let mut stats = StatsSet::default(); |
| 36 | + if let Some(stat_value) = maybe_stat { |
| 37 | + stats.set(stat, stat_value); |
| 38 | + } |
| 39 | + Ok(stats) |
| 40 | + } |
| 41 | +} |
| 42 | + |
| 43 | +impl RunEndArray { |
| 44 | + fn true_count(&self) -> VortexResult<u64> { |
| 45 | + let ends = self.ends().into_primitive()?; |
| 46 | + let values = self.values().into_bool()?.boolean_buffer(); |
| 47 | + |
| 48 | + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.typed_true_count(ends.as_slice::<$P>(), values)) |
| 49 | + } |
| 50 | + |
| 51 | + fn typed_true_count<P: NativePType + Into<u64>>( |
| 52 | + &self, |
| 53 | + decompressed_ends: &[P], |
| 54 | + decompressed_values: BooleanBuffer, |
| 55 | + ) -> VortexResult<u64> { |
| 56 | + Ok(match self.values().logical_validity() { |
| 57 | + LogicalValidity::AllValid(_) => { |
| 58 | + let mut begin = self.offset() as u64; |
| 59 | + decompressed_ends |
| 60 | + .iter() |
| 61 | + .copied() |
| 62 | + .zip_eq(&decompressed_values) |
| 63 | + .map(|(end, bool_value)| { |
| 64 | + let end: u64 = end.into(); |
| 65 | + let len = end - begin; |
| 66 | + begin = end; |
| 67 | + len * u64::from(bool_value) |
| 68 | + }) |
| 69 | + .sum() |
| 70 | + } |
| 71 | + LogicalValidity::AllInvalid(_) => 0, |
| 72 | + LogicalValidity::Array(is_valid) => { |
| 73 | + let is_valid = is_valid.into_bool()?.boolean_buffer(); |
| 74 | + let mut is_valid = is_valid.set_indices(); |
| 75 | + match is_valid.next() { |
| 76 | + None => self.len() as u64, |
| 77 | + Some(valid_index) => { |
| 78 | + let mut true_count: u64 = 0; |
| 79 | + let offsetted_begin = self.offset() as u64; |
| 80 | + let offsetted_len = (self.len() + self.offset()) as u64; |
| 81 | + let begin = if valid_index == 0 { |
| 82 | + offsetted_begin |
| 83 | + } else { |
| 84 | + decompressed_ends[valid_index - 1].into() |
| 85 | + }; |
| 86 | + |
| 87 | + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); |
| 88 | + true_count += decompressed_values.value(valid_index) as u64 * (end - begin); |
| 89 | + |
| 90 | + for valid_index in is_valid { |
| 91 | + let valid_end: u64 = decompressed_ends[valid_index].into(); |
| 92 | + let end = cmp::min(valid_end, offsetted_len); |
| 93 | + true_count += |
| 94 | + decompressed_values.value(valid_index) as u64 * (end - valid_end); |
| 95 | + } |
| 96 | + |
| 97 | + true_count |
| 98 | + } |
| 99 | + } |
| 100 | + } |
| 101 | + }) |
| 102 | + } |
| 103 | + |
| 104 | + fn null_count(&self) -> VortexResult<u64> { |
| 105 | + let ends = self.ends().into_primitive()?; |
| 106 | + let null_count = match self.values().logical_validity() { |
| 107 | + LogicalValidity::AllValid(_) => 0u64, |
| 108 | + LogicalValidity::AllInvalid(_) => self.len() as u64, |
| 109 | + LogicalValidity::Array(is_valid) => { |
| 110 | + let is_valid = is_valid.into_bool()?.boolean_buffer(); |
| 111 | + match_each_unsigned_integer_ptype!(ends.ptype(), |$P| self.null_count_with_array_validity(ends.as_slice::<$P>(), is_valid)) |
| 112 | + } |
| 113 | + }; |
| 114 | + Ok(null_count) |
| 115 | + } |
| 116 | + |
| 117 | + fn null_count_with_array_validity<P: NativePType + Into<u64>>( |
| 118 | + &self, |
| 119 | + decompressed_ends: &[P], |
| 120 | + is_valid: BooleanBuffer, |
| 121 | + ) -> u64 { |
| 122 | + let mut is_valid = is_valid.set_indices(); |
| 123 | + match is_valid.next() { |
| 124 | + None => self.len() as u64, |
| 125 | + Some(valid_index) => { |
| 126 | + let offsetted_len = (self.len() + self.offset()) as u64; |
| 127 | + let mut null_count: u64 = self.len() as u64; |
| 128 | + let begin = if valid_index == 0 { |
| 129 | + 0 |
| 130 | + } else { |
| 131 | + decompressed_ends[valid_index - 1].into() |
| 132 | + }; |
| 133 | + |
| 134 | + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); |
| 135 | + null_count -= end - begin; |
| 136 | + |
| 137 | + for valid_index in is_valid { |
| 138 | + let end = cmp::min(decompressed_ends[valid_index].into(), offsetted_len); |
| 139 | + null_count -= end - decompressed_ends[valid_index - 1].into(); |
| 140 | + } |
| 141 | + |
| 142 | + null_count |
| 143 | + } |
| 144 | + } |
| 145 | + } |
| 146 | +} |
| 147 | + |
#[cfg(test)]
mod tests {
    use arrow_buffer::BooleanBuffer;
    use vortex_array::array::BoolArray;
    use vortex_array::compute::slice;
    use vortex_array::stats::{ArrayStatistics as _, Stat};
    use vortex_array::validity::Validity;
    use vortex_array::IntoArrayData;
    use vortex_buffer::buffer;

    use crate::RunEndArray;

    /// Non-nullable integer runs: min/max come from the values, nothing is null, and the
    /// increasing values are reported as sorted.
    #[test]
    fn test_runend_int_stats() {
        let ends = buffer![2u32, 5, 10].into_array();
        let values = buffer![1i32, 2, 3].into_array();
        let run_end = RunEndArray::try_new(ends, values).unwrap();

        assert_eq!(
            run_end.statistics().compute_as::<i32>(Stat::Min).unwrap(),
            1
        );
        assert_eq!(
            run_end.statistics().compute_as::<i32>(Stat::Max).unwrap(),
            3
        );
        assert_eq!(
            run_end
                .statistics()
                .compute_as::<u64>(Stat::NullCount)
                .unwrap(),
            0
        );
        assert!(run_end
            .statistics()
            .compute_as::<bool>(Stat::IsSorted)
            .unwrap());
    }

    /// Boolean runs with a null middle run, checked on the full array and on a slice that
    /// starts inside the null run and ends inside the final `false` run.
    #[test]
    fn test_runend_bool_stats() {
        let ends = buffer![2u32, 5, 10].into_array();
        let bits = BooleanBuffer::from_iter([true, true, false]);
        let validity = Validity::Array(BoolArray::from_iter([true, false, true]).into_array());
        let values = BoolArray::try_new(bits, validity).unwrap().into_array();
        let run_end = RunEndArray::try_new(ends, values).unwrap();

        assert!(!run_end
            .statistics()
            .compute_as::<bool>(Stat::Min)
            .unwrap());
        assert!(run_end
            .statistics()
            .compute_as::<bool>(Stat::Max)
            .unwrap());
        assert_eq!(
            run_end
                .statistics()
                .compute_as::<u64>(Stat::NullCount)
                .unwrap(),
            3
        );
        assert!(!run_end
            .statistics()
            .compute_as::<bool>(Stat::IsSorted)
            .unwrap());
        assert_eq!(
            run_end
                .statistics()
                .compute_as::<u64>(Stat::TrueCount)
                .unwrap(),
            2
        );

        let window = slice(run_end, 4, 7).unwrap();

        assert!(!window.statistics().compute_as::<bool>(Stat::Min).unwrap());
        assert!(!window.statistics().compute_as::<bool>(Stat::Max).unwrap());
        assert_eq!(
            window
                .statistics()
                .compute_as::<u64>(Stat::NullCount)
                .unwrap(),
            1
        );
        // Not sorted because null must come last
        assert!(!window
            .statistics()
            .compute_as::<bool>(Stat::IsSorted)
            .unwrap());
        assert_eq!(
            window
                .statistics()
                .compute_as::<u64>(Stat::TrueCount)
                .unwrap(),
            0
        );
    }

    /// When every run is null there are no trues and every position is null.
    #[test]
    fn test_all_invalid_true_count() {
        let ends = buffer![2u32, 5, 10].into_array();
        let values = BoolArray::from_iter([None, None, None]).into_array();
        let all_null = RunEndArray::try_new(ends, values).unwrap().into_array();

        assert_eq!(
            all_null
                .statistics()
                .compute_as::<u64>(Stat::TrueCount)
                .unwrap(),
            0
        );
        assert_eq!(
            all_null
                .statistics()
                .compute_as::<u64>(Stat::NullCount)
                .unwrap(),
            10
        );
    }
}
0 commit comments