|
| 1 | +use crate::{ |
| 2 | + common::{ |
| 3 | + nodes::{ArcPredicateNode, PredicateType, ReprPredicateNode}, |
| 4 | + predicates::{attr_index_pred::AttrIndexPred, list_pred::ListPred}, |
| 5 | + properties::attr_ref::{AttrRef, BaseTableAttrRef}, |
| 6 | + types::GroupId, |
| 7 | + }, |
| 8 | + cost_model::CostModelImpl, |
| 9 | + stats::DEFAULT_NUM_DISTINCT, |
| 10 | + storage::CostModelStorageManager, |
| 11 | + CostModelError, CostModelResult, EstimatedStatistic, SemanticError, |
| 12 | +}; |
1 | 13 |
|
| 14 | +impl<S: CostModelStorageManager> CostModelImpl<S> { |
| 15 | + pub async fn get_agg_row_cnt( |
| 16 | + &self, |
| 17 | + group_id: GroupId, |
| 18 | + group_by: ArcPredicateNode, |
| 19 | + ) -> CostModelResult<EstimatedStatistic> { |
| 20 | + let group_by = ListPred::from_pred_node(group_by).unwrap(); |
| 21 | + if group_by.is_empty() { |
| 22 | + Ok(EstimatedStatistic(1.0)) |
| 23 | + } else { |
| 24 | + // Multiply the n-distinct of all the group by columns. |
| 25 | + // TODO: improve with multi-dimensional n-distinct |
| 26 | + let mut row_cnt = 1; |
| 27 | + |
| 28 | + for node in &group_by.0.children { |
| 29 | + match node.typ { |
| 30 | + PredicateType::AttrIndex => { |
| 31 | + let attr_ref = |
| 32 | + AttrIndexPred::from_pred_node(node.clone()).ok_or_else(|| { |
| 33 | + SemanticError::InvalidPredicate( |
| 34 | + "Expected AttributeRef predicate".to_string(), |
| 35 | + ) |
| 36 | + })?; |
| 37 | + if let AttrRef::BaseTableAttrRef(BaseTableAttrRef { table_id, attr_idx }) = |
| 38 | + self.memo.get_attribute_ref(group_id, attr_ref.attr_index()) |
| 39 | + { |
| 40 | + // TODO: Only query ndistinct instead of all kinds of stats. |
| 41 | + let stats_option = |
| 42 | + self.get_attribute_comb_stats(table_id, &[attr_idx]).await?; |
| 43 | + |
| 44 | + let ndistinct = match stats_option { |
| 45 | + Some(stats) => stats.ndistinct, |
| 46 | + None => { |
| 47 | + // The column type is not supported or stats are missing. |
| 48 | + DEFAULT_NUM_DISTINCT |
| 49 | + } |
| 50 | + }; |
| 51 | + row_cnt *= ndistinct; |
| 52 | + } else { |
| 53 | + // TOOD: Handle derived attributes. |
| 54 | + row_cnt *= DEFAULT_NUM_DISTINCT; |
| 55 | + } |
| 56 | + } |
| 57 | + _ => { |
| 58 | + // TODO: Consider the case where `GROUP BY 1`. |
| 59 | + panic!("GROUP BY must have attribute ref predicate"); |
| 60 | + } |
| 61 | + } |
| 62 | + } |
| 63 | + Ok(EstimatedStatistic(row_cnt as f64)) |
| 64 | + } |
| 65 | + } |
| 66 | +} |
| 67 | + |
#[cfg(test)]
mod tests {
    //! Tests for `get_agg_row_cnt`, covering grouping columns with and without statistics.

    use std::collections::HashMap;

    use crate::{
        common::predicates::constant_pred::ConstantType,
        cost_model::tests::{
            attr_index, create_mock_cost_model_with_attr_types, empty_list, list,
            TestPerAttributeStats, TEST_ATTR1_BASE_INDEX, TEST_ATTR2_BASE_INDEX,
            TEST_ATTR3_BASE_INDEX, TEST_GROUP1_ID, TEST_TABLE1_ID,
        },
        stats::{utilities::simple_map::SimpleMap, MostCommonValues, DEFAULT_NUM_DISTINCT},
        EstimatedStatistic,
    };

    /// With no per-attribute statistics registered, every grouping column falls back to
    /// `DEFAULT_NUM_DISTINCT`.
    #[tokio::test]
    async fn test_agg_no_stats() {
        let cost_model = create_mock_cost_model_with_attr_types(
            vec![TEST_TABLE1_ID],
            vec![],
            vec![HashMap::from([
                (TEST_ATTR1_BASE_INDEX, ConstantType::Int32),
                (TEST_ATTR2_BASE_INDEX, ConstantType::Int32),
            ])],
            vec![None],
        );

        // Group by empty list should return 1.
        let estimate = cost_model
            .get_agg_row_cnt(TEST_GROUP1_ID, empty_list())
            .await
            .unwrap();
        assert_eq!(estimate, EstimatedStatistic(1.0));

        // Group by single column should return the default value since there are no stats.
        let estimate = cost_model
            .get_agg_row_cnt(TEST_GROUP1_ID, list(vec![attr_index(0)]))
            .await
            .unwrap();
        assert_eq!(estimate, EstimatedStatistic(DEFAULT_NUM_DISTINCT as f64));

        // Group by two columns should return the default value squared since there are no stats.
        let estimate = cost_model
            .get_agg_row_cnt(TEST_GROUP1_ID, list(vec![attr_index(0), attr_index(1)]))
            .await
            .unwrap();
        assert_eq!(
            estimate,
            EstimatedStatistic((DEFAULT_NUM_DISTINCT * DEFAULT_NUM_DISTINCT) as f64)
        );
    }

    /// With statistics registered for the first two attributes, the estimate is the product of
    /// their n-distinct values; the third attribute has no stats and uses the default.
    #[tokio::test]
    async fn test_agg_with_stats() {
        let attr1_ndistinct = 12;
        let attr2_ndistinct = 645;
        let attr1_stats = TestPerAttributeStats::new(
            MostCommonValues::SimpleFrequency(SimpleMap::default()),
            None,
            attr1_ndistinct,
            0.0,
        );
        let attr2_stats = TestPerAttributeStats::new(
            MostCommonValues::SimpleFrequency(SimpleMap::default()),
            None,
            attr2_ndistinct,
            0.0,
        );

        let cost_model = create_mock_cost_model_with_attr_types(
            vec![TEST_TABLE1_ID],
            vec![HashMap::from([
                (TEST_ATTR1_BASE_INDEX, attr1_stats),
                (TEST_ATTR2_BASE_INDEX, attr2_stats),
            ])],
            vec![HashMap::from([
                (TEST_ATTR1_BASE_INDEX, ConstantType::Int32),
                (TEST_ATTR2_BASE_INDEX, ConstantType::Int32),
                (TEST_ATTR3_BASE_INDEX, ConstantType::Int32),
            ])],
            vec![None],
        );

        // Group by empty list should return 1.
        let estimate = cost_model
            .get_agg_row_cnt(TEST_GROUP1_ID, empty_list())
            .await
            .unwrap();
        assert_eq!(estimate, EstimatedStatistic(1.0));

        // Group by single column should return the n-distinct of the column.
        let estimate = cost_model
            .get_agg_row_cnt(TEST_GROUP1_ID, list(vec![attr_index(0)]))
            .await
            .unwrap();
        assert_eq!(estimate, EstimatedStatistic(attr1_ndistinct as f64));

        // Group by two columns should return the product of the n-distinct of the columns.
        let estimate = cost_model
            .get_agg_row_cnt(TEST_GROUP1_ID, list(vec![attr_index(0), attr_index(1)]))
            .await
            .unwrap();
        assert_eq!(
            estimate,
            EstimatedStatistic((attr1_ndistinct * attr2_ndistinct) as f64)
        );

        // Group by multiple columns should return the product of the n-distinct of the columns.
        // If one of the columns does not have stats, it should use the default value instead.
        let estimate = cost_model
            .get_agg_row_cnt(
                TEST_GROUP1_ID,
                list(vec![attr_index(0), attr_index(1), attr_index(2)]),
            )
            .await
            .unwrap();
        assert_eq!(
            estimate,
            EstimatedStatistic((attr1_ndistinct * attr2_ndistinct * DEFAULT_NUM_DISTINCT) as f64)
        );
    }
}
0 commit comments