From 472ad4860d14cd22448671f84757676cadd54132 Mon Sep 17 00:00:00 2001 From: Lan Lou Date: Sat, 9 Nov 2024 15:36:11 -0500 Subject: [PATCH 1/4] fix: fix update_stats_from_catalog and improve the test --- .../src/cost_model/catalog/mock_catalog.rs | 11 ++- optd-persistent/src/cost_model/interface.rs | 14 ++-- optd-persistent/src/cost_model/orm.rs | 75 +++++++++++++++---- 3 files changed, 78 insertions(+), 22 deletions(-) diff --git a/optd-persistent/src/cost_model/catalog/mock_catalog.rs b/optd-persistent/src/cost_model/catalog/mock_catalog.rs index 5dedf3e..c1ba7e5 100644 --- a/optd-persistent/src/cost_model/catalog/mock_catalog.rs +++ b/optd-persistent/src/cost_model/catalog/mock_catalog.rs @@ -1,3 +1,6 @@ +use sea_orm::prelude::Json; +use serde_json::json; + use crate::cost_model::interface::{AttrType, IndexType, StatType}; pub struct MockDatabaseMetadata { @@ -31,7 +34,7 @@ pub struct MockStatistic { pub id: i32, pub stat_type: i32, // TODO(lanlou): what should I use for the value type? - pub stat_value: String, + pub stat_value: Json, pub attr_ids: Vec, pub table_id: Option, pub name: String, @@ -111,7 +114,7 @@ impl MockCatalog { MockStatistic { id: 1, stat_type: StatType::Count as i32, - stat_value: "100".to_string(), + stat_value: json!(100), attr_ids: vec![1], table_id: None, name: "CountAttr1".to_string(), @@ -119,7 +122,7 @@ impl MockCatalog { MockStatistic { id: 2, stat_type: StatType::Count as i32, - stat_value: "200".to_string(), + stat_value: json!(200), attr_ids: vec![2], table_id: None, name: "CountAttr2".to_string(), @@ -127,7 +130,7 @@ impl MockCatalog { MockStatistic { id: 3, stat_type: StatType::Count as i32, - stat_value: "300".to_string(), + stat_value: json!(300), attr_ids: vec![], table_id: Some(1), name: "Table1Count".to_string(), diff --git a/optd-persistent/src/cost_model/interface.rs b/optd-persistent/src/cost_model/interface.rs index 0c28ad0..e6bd8b7 100644 --- a/optd-persistent/src/cost_model/interface.rs +++ b/optd-persistent/src/cost_model/interface.rs @@ -41,6 +41,14 @@ pub enum ConstraintType { /// TODO: documentation pub enum StatType { + // TODO(lanlou): I am not sure which way to represent the type is better. + // 1. `Count` means row count, (i.e. record count), and it only applies to + // table statistics. In this way, we should introduce `NotNullCount` for attribute + // statistics to indicate the number of non-null values. + // 2. `Count` means the number of non-null values, and it applies to both table + // and attribute statistics. (Will a table have a record with null values in all + // attributes?) + // For now, we just use the second way for simplicity. Count, Cardinality, Min, @@ -78,11 +86,7 @@ pub trait CostModelStorageLayer { // TODO: Change EpochId to event::Model::epoch_id async fn create_new_epoch(&self, source: String, data: String) -> StorageResult; - async fn update_stats_from_catalog( - &self, - c: CatalogSource, - epoch_id: Self::EpochId, - ) -> StorageResult<()>; + async fn update_stats_from_catalog(&self, c: CatalogSource) -> StorageResult; async fn update_stats( &self, diff --git a/optd-persistent/src/cost_model/orm.rs b/optd-persistent/src/cost_model/orm.rs index d47ef09..252249e 100644 --- a/optd-persistent/src/cost_model/orm.rs +++ b/optd-persistent/src/cost_model/orm.rs @@ -51,11 +51,24 @@ impl CostModelStorageLayer for BackendManager { } /// TODO: documentation - async fn update_stats_from_catalog( - &self, - c: CatalogSource, - epoch_id: Self::EpochId, - ) -> StorageResult<()> { + async fn update_stats_from_catalog(&self, c: CatalogSource) -> StorageResult { + let transaction = self.db.begin().await?; + let source = match c { + CatalogSource::Mock => "Mock", + CatalogSource::Iceberg() => "Iceberg", + }; + let new_event = event::ActiveModel { + source_variant: sea_orm::ActiveValue::Set(source.to_string()), + timestamp: sea_orm::ActiveValue::Set(Utc::now()), + data: sea_orm::ActiveValue::Set(sea_orm::JsonValue::String( + "Update stats from catalog".to_string(), + )), + ..Default::default() + }; + let epoch_id = Event::insert(new_event) + .exec(&transaction) + .await? + .last_insert_id; match c { CatalogSource::Mock => { let mock_catalog = MockCatalog::new(); @@ -66,7 +79,7 @@ impl CostModelStorageLayer for BackendManager { ..Default::default() } })) - .exec(&self.db) + .exec(&transaction) .await?; NamespaceMetadata::insert_many(mock_catalog.namespaces.iter().map(|namespace| { namespace_metadata::ActiveModel { @@ -76,7 +89,7 @@ impl CostModelStorageLayer for BackendManager { ..Default::default() } })) - .exec(&self.db) + .exec(&transaction) .await?; TableMetadata::insert_many(mock_catalog.tables.iter().map(|table| { table_metadata::ActiveModel { @@ -86,7 +99,7 @@ impl CostModelStorageLayer for BackendManager { ..Default::default() } })) - .exec(&self.db) + .exec(&transaction) .await?; Attribute::insert_many(mock_catalog.attributes.iter().map(|attr| { attribute::ActiveModel { @@ -101,7 +114,7 @@ impl CostModelStorageLayer for BackendManager { ..Default::default() } })) - .exec(&self.db) + .exec(&transaction) .await?; Statistic::insert_many(mock_catalog.statistics.iter().map(|stat| { statistic::ActiveModel { @@ -116,7 +129,29 @@ impl CostModelStorageLayer for BackendManager { ..Default::default() } })) - .exec(&self.db) + .exec(&transaction) + .await?; + VersionedStatistic::insert_many(mock_catalog.statistics.iter().map(|stat| { + versioned_statistic::ActiveModel { + epoch_id: sea_orm::ActiveValue::Set(epoch_id), + statistic_id: sea_orm::ActiveValue::Set(stat.id), + statistic_value: sea_orm::ActiveValue::Set(stat.stat_value.clone()), + ..Default::default() + } + })) + .exec(&transaction) + .await?; + StatisticToAttributeJunction::insert_many(mock_catalog.statistics.iter().flat_map( + |stat| { + stat.attr_ids.iter().map(move |attr_id| { + statistic_to_attribute_junction::ActiveModel { + statistic_id: sea_orm::ActiveValue::Set(stat.id), + attribute_id: sea_orm::ActiveValue::Set(*attr_id), + } + }) + }, + )) + .exec(&transaction) .await?; IndexMetadata::insert_many( mock_catalog @@ -140,12 +175,13 @@ impl CostModelStorageLayer for BackendManager { ..Default::default() }), ) - .exec(&self.db) + .exec(&transaction) .await?; - Ok(()) } CatalogSource::Iceberg() => todo!(), } + transaction.commit().await?; + Ok(epoch_id) } /// TODO: improve the documentation @@ -551,15 +587,28 @@ mod tests { let mut binding = super::BackendManager::new(Some(&database_url)).await; let backend_manager = binding.as_mut().unwrap(); let res = backend_manager - .update_stats_from_catalog(super::CatalogSource::Mock, 1) + .update_stats_from_catalog(super::CatalogSource::Mock) .await; println!("{:?}", res); assert!(res.is_ok()); + let epoch_id = res.unwrap(); + assert_eq!(epoch_id, 1); let lookup_res = Statistic::find().all(&backend_manager.db).await.unwrap(); println!("{:?}", lookup_res); assert_eq!(lookup_res.len(), 3); + let stat_res = backend_manager + .get_stats_for_table(1, StatType::Count as i32, Some(epoch_id)) + .await; + assert!(stat_res.is_ok()); + assert_eq!(stat_res.unwrap().unwrap(), json!(300)); + let stat_res = backend_manager + .get_stats_for_attr([2].to_vec(), StatType::Count as i32, None) + .await; + assert!(stat_res.is_ok()); + assert_eq!(stat_res.unwrap().unwrap(), json!(200)); + remove_db_file(DATABASE_FILE); } From b900ee78f94beea392180bb8d2b7403557043ab4 Mon Sep 17 00:00:00 2001 From: Lan Lou Date: Sat, 9 Nov 2024 16:16:02 -0500 Subject: [PATCH 2/4] fix: improve the statistic type --- optd-persistent/src/bin/init.rs | 2 +- .../src/cost_model/catalog/mock_catalog.rs | 6 ++--- optd-persistent/src/cost_model/interface.rs | 12 +++------- optd-persistent/src/cost_model/orm.rs | 22 +++++++++--------- optd-persistent/src/db/init.db | Bin 147456 -> 147456 bytes 5 files changed, 18 insertions(+), 24 deletions(-) diff --git a/optd-persistent/src/bin/init.rs b/optd-persistent/src/bin/init.rs index e282f98..c2291d7 100644 --- a/optd-persistent/src/bin/init.rs +++ b/optd-persistent/src/bin/init.rs @@ -105,7 +105,7 @@ async fn init_all_tables() -> Result<(), sea_orm::error::DbErr> { table_id: Set(Some(1)), creation_time: Set(Utc::now()), number_of_attributes: Set(0), - variant_tag: Set(StatType::Count as i32), + variant_tag: Set(StatType::TableRowCount as i32), description: Set("".to_owned()), }; let table_versioned_statistic = versioned_statistic::ActiveModel { diff --git a/optd-persistent/src/cost_model/catalog/mock_catalog.rs b/optd-persistent/src/cost_model/catalog/mock_catalog.rs index c1ba7e5..fb927fb 100644 --- a/optd-persistent/src/cost_model/catalog/mock_catalog.rs +++ b/optd-persistent/src/cost_model/catalog/mock_catalog.rs @@ -113,7 +113,7 @@ impl MockCatalog { let statistics: Vec = vec![ MockStatistic { id: 1, - stat_type: StatType::Count as i32, + stat_type: StatType::NotNullCount as i32, stat_value: json!(100), attr_ids: vec![1], table_id: None, @@ -121,7 +121,7 @@ impl MockCatalog { }, MockStatistic { id: 2, - stat_type: StatType::Count as i32, + stat_type: StatType::NotNullCount as i32, stat_value: json!(200), attr_ids: vec![2], table_id: None, @@ -129,7 +129,7 @@ impl MockCatalog { }, MockStatistic { id: 3, - stat_type: StatType::Count as i32, + stat_type: StatType::TableRowCount as i32, stat_value: json!(300), attr_ids: vec![], table_id: Some(1), diff --git a/optd-persistent/src/cost_model/interface.rs b/optd-persistent/src/cost_model/interface.rs index e6bd8b7..d896ec1 100644 --- a/optd-persistent/src/cost_model/interface.rs +++ b/optd-persistent/src/cost_model/interface.rs @@ -41,15 +41,9 @@ pub enum ConstraintType { /// TODO: documentation pub enum StatType { - // TODO(lanlou): I am not sure which way to represent the type is better. - // 1. `Count` means row count, (i.e. record count), and it only applies to - // table statistics. In this way, we should introduce `NotNullCount` for attribute - // statistics to indicate the number of non-null values. - // 2. `Count` means the number of non-null values, and it applies to both table - // and attribute statistics. (Will a table have a record with null values in all - // attributes?) - // For now, we just use the second way for simplicity. - Count, + /// `TableRowCount` only applies to table statistics. + TableRowCount, + NotNullCount, Cardinality, Min, Max, diff --git a/optd-persistent/src/cost_model/orm.rs b/optd-persistent/src/cost_model/orm.rs index 252249e..8f94161 100644 --- a/optd-persistent/src/cost_model/orm.rs +++ b/optd-persistent/src/cost_model/orm.rs @@ -599,12 +599,12 @@ mod tests { assert_eq!(lookup_res.len(), 3); let stat_res = backend_manager - .get_stats_for_table(1, StatType::Count as i32, Some(epoch_id)) + .get_stats_for_table(1, StatType::TableRowCount as i32, Some(epoch_id)) .await; assert!(stat_res.is_ok()); assert_eq!(stat_res.unwrap().unwrap(), json!(300)); let stat_res = backend_manager - .get_stats_for_attr([2].to_vec(), StatType::Count as i32, None) + .get_stats_for_attr([2].to_vec(), StatType::NotNullCount as i32, None) .await; assert!(stat_res.is_ok()); assert_eq!(stat_res.unwrap().unwrap(), json!(200)); @@ -624,7 +624,7 @@ mod tests { .await .unwrap(); let stat = Stat { - stat_type: StatType::Count as i32, + stat_type: StatType::NotNullCount as i32, stat_value: json!(100), attr_ids: vec![1], table_id: None, @@ -643,7 +643,7 @@ mod tests { println!("{:?}", stat_res); assert_eq!(stat_res[0].number_of_attributes, 1); assert_eq!(stat_res[0].description, "1".to_string()); - assert_eq!(stat_res[0].variant_tag, StatType::Count as i32); + assert_eq!(stat_res[0].variant_tag, StatType::NotNullCount as i32); let stat_attr_res = StatisticToAttributeJunction::find() .filter(statistic_to_attribute_junction::Column::StatisticId.eq(stat_res[0].id)) .all(&backend_manager.db) @@ -696,7 +696,7 @@ mod tests { .await .unwrap(); let stat2 = Stat { - stat_type: StatType::Count as i32, + stat_type: StatType::NotNullCount as i32, stat_value: json!(200), attr_ids: vec![1], table_id: None, @@ -750,7 +750,7 @@ mod tests { // 3. Update existed stat with the same value let epoch_num = Event::find().all(&backend_manager.db).await.unwrap().len(); let stat3 = Stat { - stat_type: StatType::Count as i32, + stat_type: StatType::NotNullCount as i32, stat_value: json!(200), attr_ids: vec![1], table_id: None, @@ -791,21 +791,21 @@ mod tests { let statistics: Vec = vec![ Stat { - stat_type: StatType::Count as i32, + stat_type: StatType::TableRowCount as i32, stat_value: json!(0), attr_ids: vec![], table_id: Some(1), name: "row_count".to_string(), }, Stat { - stat_type: StatType::Count as i32, + stat_type: StatType::TableRowCount as i32, stat_value: json!(20), attr_ids: vec![], table_id: Some(1), name: "row_count".to_string(), }, Stat { - stat_type: StatType::Count as i32, + stat_type: StatType::TableRowCount as i32, stat_value: json!(100), attr_ids: vec![], table_id: Some(table_inserted_res.last_insert_id), @@ -969,7 +969,7 @@ mod tests { let backend_manager = binding.as_mut().unwrap(); let epoch_id = 1; let table_id = 1; - let stat_type = StatType::Count as i32; + let stat_type = StatType::TableRowCount as i32; // Get initial stats let res = backend_manager @@ -986,7 +986,7 @@ mod tests { .await .unwrap(); let stat = Stat { - stat_type: StatType::Count as i32, + stat_type: StatType::TableRowCount as i32, stat_value: json!(100), attr_ids: vec![], table_id: Some(table_id), diff --git a/optd-persistent/src/db/init.db b/optd-persistent/src/db/init.db index d88b92d67fbd7019ce2728841a6ce9ee457f8357..3d860ba5c64f35b16047e153628d94d0f92fa69f 100644 GIT binary patch delta 555 zcmZ8cJ1j#{7`~_Hphs|3NXUH^ZInLlIk&y~5YL508u1uiB8^DUrbJ>=n~|JE2Lr2- z4o1vKj6!0vnCu;Zw!;B}r1ieg6Ya&LVN}b+h)aK=ZZ8 z#??@Zi{T!)%%)X^eMDNDH(ess%^p>?tx(Jta>e~aPO*D+$m`-r7hFWFGup?p(N@fr zfDg_hwi?xt@QrD~CYSA(*@E6p3)}0v&PIN3pKDqNOrsr(SP4q3v_tq%CwGwB74l3w zItIK)`3VAlZQHjtDw+=HBLcI!pSK8OxQy6EEKqY@>bg)L^dMKqo8cj1Q>McFrXo7p zO>m1dG4Z{A#JjOy9oOQ=Mo`4BjbR=i253N6(dPXQprkn z@~4ruQ}IgG`H>skX`5E6`L-+XiX^|t=kiL$N9JTa0lkum=K(0M8$b!xU9c0$RG5$q zAs!EnShmM24dQtW%2x*H>8-OS5`_L+%g+YT>W{~5lDexJ;Kga+sWp1{b_7BXe=8=M Ak^lez delta 553 zcmZ8cO)mpM7~V01PJ>k;BKuXeQTjErZP%7Gz77(Jh#(OMOQaEkHq}U+)IT6|*n>k| z9qGXlXWWIv-O0TNu`_kRn@ry1`FNjaDivI*;4>w#D@jtwdp`r#&SNRCd%f|hz{JcY z%c+r8Z%2FJ3Y$_D_7?NEYvqim|7WXy}oUOuPnR8kP%%Xjhm}yGPtV8%vCx4Vb5cG6A zIt9Fk`v^g>aqUC1MAHF%MC=7U$Xi4)TtVz25o)Nax-QszJiz z;`2XjrAe*se9C{-7PV5fdZF#ow<^icvMaCF0%T6cIp~v;cphNaF@O^Kosgm?B~dSq zkR}>4(@j2U5YJ=mRt?b8`&(&d?Tp<}x=#ks{zlt&Dt%iwz_arYH>rL5asndv--ADp A-2eap From 49f600afe1da37fea4b0ee5a27b8578ef796da01 Mon Sep 17 00:00:00 2001 From: Yuanxin Cao Date: Sat, 9 Nov 2024 16:29:51 -0500 Subject: [PATCH 3/4] fix clippy --- optd-persistent/src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optd-persistent/src/lib.rs b/optd-persistent/src/lib.rs index 5c6371c..8125e68 100644 --- a/optd-persistent/src/lib.rs +++ b/optd-persistent/src/lib.rs @@ -1,6 +1,6 @@ #![allow(dead_code)] -use std::cell::LazyCell; +use std::sync::LazyLock; use sea_orm::*; use sea_orm_migration::prelude::*; @@ -17,7 +17,7 @@ pub const DATABASE_FILENAME: &str = "sqlite.db"; pub const DATABASE_URL: &str = "sqlite:./sqlite.db?mode=rwc"; pub const TEST_DATABASE_FILENAME: &str = "init.db"; -pub const TEST_DATABASE_FILE: LazyCell = LazyCell::new(|| { +pub static TEST_DATABASE_FILE: LazyLock = LazyLock::new(|| { std::env::current_dir() .unwrap() .join("src") @@ -27,8 +27,8 @@ pub const TEST_DATABASE_FILE: LazyCell = LazyCell::new(|| { .unwrap() .to_owned() }); -pub const TEST_DATABASE_URL: LazyCell = - LazyCell::new(|| get_sqlite_url(TEST_DATABASE_FILE.as_str())); +pub static TEST_DATABASE_URL: LazyLock = + LazyLock::new(|| get_sqlite_url(TEST_DATABASE_FILE.as_str())); fn get_sqlite_url(file: &str) -> String { format!("sqlite:{}?mode=rwc", file) From b730d5987864db5ff8bf92d4f2b5b2590a09e3ae Mon Sep 17 00:00:00 2001 From: Yuanxin Cao Date: Sat, 9 Nov 2024 16:38:59 -0500 Subject: [PATCH 4/4] add comments --- optd-persistent/src/cost_model/catalog/mock_catalog.rs | 4 +++- optd-persistent/src/cost_model/orm.rs | 1 + optd-persistent/src/lib.rs | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/optd-persistent/src/cost_model/catalog/mock_catalog.rs b/optd-persistent/src/cost_model/catalog/mock_catalog.rs index fb927fb..f79f930 100644 --- a/optd-persistent/src/cost_model/catalog/mock_catalog.rs +++ b/optd-persistent/src/cost_model/catalog/mock_catalog.rs @@ -3,6 +3,7 @@ use serde_json::json; use crate::cost_model::interface::{AttrType, IndexType, StatType}; +/// TODO: documentation pub struct MockDatabaseMetadata { pub id: i32, pub name: String, @@ -33,7 +34,6 @@ pub struct MockAttribute { pub struct MockStatistic { pub id: i32, pub stat_type: i32, - // TODO(lanlou): what should I use for the value type? pub stat_value: Json, pub attr_ids: Vec, pub table_id: Option, @@ -74,7 +74,9 @@ pub struct MockCatalog { pub triggers: Vec, // TODO: constraints } + impl MockCatalog { + /// TODO: documentation pub fn new() -> Self { let databases: Vec = vec![MockDatabaseMetadata { id: 1, diff --git a/optd-persistent/src/cost_model/orm.rs b/optd-persistent/src/cost_model/orm.rs index 8f94161..dc35d2e 100644 --- a/optd-persistent/src/cost_model/orm.rs +++ b/optd-persistent/src/cost_model/orm.rs @@ -177,6 +177,7 @@ impl CostModelStorageLayer for BackendManager { ) .exec(&transaction) .await?; + // TODO: initialize constraints } CatalogSource::Iceberg() => todo!(), } diff --git a/optd-persistent/src/lib.rs b/optd-persistent/src/lib.rs index 8125e68..2638940 100644 --- a/optd-persistent/src/lib.rs +++ b/optd-persistent/src/lib.rs @@ -13,10 +13,14 @@ mod migrator; pub mod cost_model; pub use cost_model::interface::CostModelStorageLayer; +/// The filename of the SQLite database for migration. pub const DATABASE_FILENAME: &str = "sqlite.db"; +/// The URL of the SQLite database for migration. pub const DATABASE_URL: &str = "sqlite:./sqlite.db?mode=rwc"; +/// The filename of the SQLite database for testing. pub const TEST_DATABASE_FILENAME: &str = "init.db"; +/// The URL of the SQLite database for testing. pub static TEST_DATABASE_FILE: LazyLock = LazyLock::new(|| { std::env::current_dir() .unwrap() @@ -27,6 +31,7 @@ pub static TEST_DATABASE_FILE: LazyLock = LazyLock::new(|| { .unwrap() .to_owned() }); +/// The URL of the SQLite database for testing. pub static TEST_DATABASE_URL: LazyLock = LazyLock::new(|| get_sqlite_url(TEST_DATABASE_FILE.as_str()));