diff --git a/Cargo.lock b/Cargo.lock index 9464489..b54d901 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -57,6 +57,21 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.20" @@ -127,6 +142,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" @@ -353,6 +374,30 @@ dependencies = [ "regex-syntax 0.7.5", ] +[[package]] +name = "assert_approx_eq" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c07dab4369547dbe5114677b33fbbf724971019f3818172d59a97a61c774ffd" + +[[package]] +name = "async-compression" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd 0.13.2", + "zstd-safe 7.2.1", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -416,6 +461,12 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" @@ -469,6 +520,28 @@ dependencies = [ "wyz", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -501,6 +574,27 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "brotli" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d640d25bc63c50fb1f0b545ffd80207d2e10a4c965530809b40ba3386825c391" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.16.0" @@ -541,12 +635,35 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "cc" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -691,6 +808,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -721,6 +844,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam" version = "0.8.4" @@ -849,6 +981,67 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7014432223f4d721cb9786cd88bb89e7464e0ba984d4a7f49db7787f5f268674" +dependencies = [ + "ahash 0.8.11", + "arrow", + "arrow-array", + "arrow-schema 47.0.0", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-optimizer", + 
"datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "flate2", + "futures", + "glob", + "half", + "hashbrown 0.14.5", + "indexmap 2.6.0", + "itertools 0.11.0", + "log", + "num_cpus", + "object_store", + "parking_lot", + "parquet", + "percent-encoding", + "pin-project-lite", + "rand", + "sqlparser", + "tempfile", + "tokio", + "tokio-util", + "url", + "uuid", + "xz2", + "zstd 0.12.4", +] + [[package]] name = "datafusion-common" version = "32.0.0" @@ -863,9 +1056,32 @@ dependencies = [ "chrono", "half", "num_cpus", + "object_store", + "parquet", "sqlparser", ] +[[package]] +name = "datafusion-execution" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "780b73b2407050e53f51a9781868593f694102c59e622de9a8aafc0343c4f237" +dependencies = [ + "arrow", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "hashbrown 0.14.5", + "log", + "object_store", + "parking_lot", + "rand", + "tempfile", + "url", +] + [[package]] name = "datafusion-expr" version = "32.0.0" @@ -881,6 +1097,103 @@ dependencies = [ "strum_macros 0.25.3", ] +[[package]] +name = "datafusion-optimizer" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f2904a432f795484fd45e29ded4537152adb60f636c05691db34fcd94c92c96" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hashbrown 0.14.5", + "itertools 0.11.0", + "log", + "regex-syntax 0.7.5", +] + +[[package]] +name = "datafusion-physical-expr" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57b4968e9a998dc0476c4db7a82f280e2026b25f464e4aa0c3bb9807ee63ddfd" +dependencies = [ + "ahash 0.8.11", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema 47.0.0", + "base64 0.21.7", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-expr", + "half", + 
"hashbrown 0.14.5", + "hex", + "indexmap 2.6.0", + "itertools 0.11.0", + "libc", + "log", + "md-5", + "paste", + "petgraph", + "rand", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-physical-plan" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efd0d1fe54e37a47a2d58a1232c22786f2c28ad35805fdcd08f0253a8b0aaa90" +dependencies = [ + "ahash 0.8.11", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema 47.0.0", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap 2.6.0", + "itertools 0.11.0", + "log", + "once_cell", + "parking_lot", + "pin-project-lite", + "rand", + "tokio", + "uuid", +] + +[[package]] +name = "datafusion-sql" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b568d44c87ead99604d704f942e257c8a236ee1bbf890ee3e034ad659dcb2c21" +dependencies = [ + "arrow", + "arrow-schema 47.0.0", + "datafusion-common", + "datafusion-expr", + "log", + "sqlparser", +] + [[package]] name = "der" version = "0.7.9" @@ -925,6 +1238,12 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "dotenvy" version = "0.15.7" @@ -984,6 +1303,12 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "flatbuffers" version = "23.5.26" @@ -994,6 +1319,16 @@ dependencies = [ 
"rustc_version", ] +[[package]] +name = "flate2" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "flume" version = "0.11.1" @@ -1034,6 +1369,7 @@ checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", @@ -1084,6 +1420,17 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "futures-sink" version = "0.3.31" @@ -1105,6 +1452,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", @@ -1242,6 +1590,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "iana-time-zone" version = "0.1.61" @@ -1443,12 +1797,27 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = 
"itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -1473,6 +1842,15 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.72" @@ -1606,6 +1984,36 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "lz4" +version = "1.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d1febb2b4a79ddd1980eede06a8f7902197960aa0383ffcfdd62fe723036725" +dependencies = [ + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.11.1+lz4-1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bd8c0d6c6ed0cd30b3652886bb8711dc4bb01d637a68105a3d5158039b418e6" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1775,6 +2183,27 @@ dependencies = [ "libc", ] +[[package]] +name = "num_enum" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" +dependencies = [ + "num_enum_derive", +] + +[[package]] +name = 
"num_enum_derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "object" version = "0.36.5" @@ -1784,6 +2213,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f930c88a43b1c3f6e776dfe495b4afab89882dbc81530c632db2ed65451ebcb4" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "humantime", + "itertools 0.11.0", + "parking_lot", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + [[package]] name = "once_cell" version = "1.20.2" @@ -1795,8 +2245,10 @@ name = "optd-cost-model" version = "0.1.0" dependencies = [ "arrow-schema 53.2.0", + "assert_approx_eq", "chrono", "crossbeam", + "datafusion", "datafusion-expr", "itertools 0.13.0", "optd-persistent", @@ -1805,6 +2257,9 @@ dependencies = [ "serde", "serde_json", "serde_with", + "test-case", + "tokio", + "trait-variant", ] [[package]] @@ -1813,6 +2268,7 @@ version = "0.1.0" dependencies = [ "async-stream", "async-trait", + "num_enum", "sea-orm", "sea-orm-migration", "serde_json", @@ -1821,6 +2277,15 @@ dependencies = [ "trait-variant", ] +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + [[package]] name = "ordered-float" version = "3.9.2" @@ -1893,6 +2358,40 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parquet" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0463cc3b256d5f50408c49a4be3a16674f4c8ceef60941709620a062b1f6bf4d" +dependencies = [ + "ahash 0.8.11", + 
"arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema 47.0.0", + "arrow-select", + "base64 0.21.7", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "hashbrown 0.14.5", + "lz4", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd 0.12.4", +] + [[package]] name = "parse-zoneinfo" version = "0.3.1" @@ -1923,6 +2422,16 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap 2.6.0", +] + [[package]] name = "phf" version = "0.11.2" @@ -2361,6 +2870,15 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2538,6 +3056,12 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +[[package]] +name = "seq-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + [[package]] name = "serde" version = "1.0.215" @@ -2588,7 +3112,7 @@ version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e28bdad6db2b8340e449f7108f020b3b092e8583a9e3fb82713e1d4e71fe817" 
dependencies = [ - "base64", + "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", @@ -2689,6 +3213,34 @@ dependencies = [ "serde", ] +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.5.7" @@ -2855,7 +3407,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64bb4714269afa44aef2755150a0fc19d756fb580a67db8885608cf02f47d06a" dependencies = [ "atoi", - "base64", + "base64 0.22.1", "bigdecimal", "bitflags 2.6.0", "byteorder", @@ -2902,7 +3454,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fa91a732d854c5d7726349bb4bb879bb9478993ceb764247660aee25f67c2f8" dependencies = [ "atoi", - "base64", + "base64 0.22.1", "bigdecimal", "bitflags 2.6.0", "byteorder", @@ -3093,6 +3645,39 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "test-case" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2550dd13afcd286853192af8601920d959b14c401fcece38071d53bf0768a8" +dependencies = [ + "test-case-macros", +] + +[[package]] +name = "test-case-core" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adcb7fd841cd518e279be3d5a3eb0636409487998a4aff22f3de87b81e88384f" +dependencies = [ + "cfg-if", + "proc-macro2", + 
"quote", + "syn 2.0.87", +] + +[[package]] +name = "test-case-macros" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c89e72a01ed4c579669add59014b9a524d609c0c88c6a585ce37485879f6ffb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", + "test-case-core", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -3123,6 +3708,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float 2.10.1", +] + [[package]] name = "time" version = "0.3.36" @@ -3198,6 +3794,7 @@ dependencies = [ "bytes", "libc", "mio", + "parking_lot", "pin-project-lite", "socket2", "tokio-macros", @@ -3226,6 +3823,19 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-util" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + [[package]] name = "toml_datetime" version = "0.6.8" @@ -3301,6 +3911,16 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + [[package]] name = "typenum" version = "1.17.0" @@ -3334,6 +3954,12 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.2.0" @@ -3387,6 +4013,7 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ + "getrandom", "serde", ] @@ -3402,6 +4029,16 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -3488,6 +4125,15 @@ dependencies = [ "wasite", ] +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "windows-core" version = "0.52.0" @@ -3675,6 +4321,15 @@ dependencies = [ "tap", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yansi" version = "1.0.1" @@ -3774,3 +4429,50 @@ dependencies = [ "quote", "syn 2.0.87", ] + +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe 6.0.6", +] + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe 7.2.1", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/optd-cost-model/Cargo.lock b/optd-cost-model/Cargo.lock index a38097d..bf0b367 100644 --- a/optd-cost-model/Cargo.lock +++ b/optd-cost-model/Cargo.lock @@ -57,6 +57,21 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.20" @@ -127,6 +142,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" 
@@ -353,6 +374,24 @@ dependencies = [ "regex-syntax 0.7.5", ] +[[package]] +name = "async-compression" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cb8f1d480b0ea3783ab015936d2a55c87e219676f0c0b7dec61494043f21857" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd 0.13.2", + "zstd-safe 7.2.1", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -416,6 +455,12 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" @@ -469,6 +514,28 @@ dependencies = [ "wyz", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -480,9 +547,9 @@ dependencies = [ [[package]] name = "borsh" -version = "1.5.2" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5327f6c99920069d1fe374aa743be1af0031dea9f250852cdf1ae6a0861ee24" +checksum = "2506947f73ad44e344215ccd6403ac2ae18cd8e046e581a441bf8d199f257f03" dependencies = [ "borsh-derive", "cfg_aliases", @@ -490,9 +557,9 @@ dependencies = [ [[package]] name = "borsh-derive" -version = "1.5.2" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"10aedd8f1a81a8aafbfde924b0e3061cd6fedd6f6bbcfc6a76e6fd426d7bfe26" +checksum = "c2593a3b8b938bd68373196c9832f516be11fa487ef4ae745eb282e6a56a7244" dependencies = [ "once_cell", "proc-macro-crate", @@ -501,6 +568,27 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "brotli" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d640d25bc63c50fb1f0b545ffd80207d2e10a4c965530809b40ba3386825c391" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.16.0" @@ -541,12 +629,35 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "cc" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aeb932158bd710538c73702db6945cb68a8fb08c519e6e12706b94263b36db8" +checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -601,9 +712,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.20" +version = "4.5.21" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" +checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" dependencies = [ "clap_builder", "clap_derive", @@ -611,9 +722,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.20" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" +checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" dependencies = [ "anstream", "anstyle", @@ -635,9 +746,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "colorchoice" @@ -647,9 +758,9 @@ checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "comfy-table" -version = "7.1.1" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" dependencies = [ "strum 0.26.3", "strum_macros 0.26.4", @@ -691,6 +802,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -721,6 +838,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32fast" +version = "1.4.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam" version = "0.8.4" @@ -849,6 +975,67 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7014432223f4d721cb9786cd88bb89e7464e0ba984d4a7f49db7787f5f268674" +dependencies = [ + "ahash 0.8.11", + "arrow", + "arrow-array", + "arrow-schema 47.0.0", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "flate2", + "futures", + "glob", + "half", + "hashbrown 0.14.5", + "indexmap 2.6.0", + "itertools 0.11.0", + "log", + "num_cpus", + "object_store", + "parking_lot", + "parquet", + "percent-encoding", + "pin-project-lite", + "rand", + "sqlparser", + "tempfile", + "tokio", + "tokio-util", + "url", + "uuid", + "xz2", + "zstd 0.12.4", +] + [[package]] name = "datafusion-common" version = "32.0.0" @@ -863,9 +1050,32 @@ dependencies = [ "chrono", "half", "num_cpus", + "object_store", + "parquet", "sqlparser", ] +[[package]] +name = "datafusion-execution" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "780b73b2407050e53f51a9781868593f694102c59e622de9a8aafc0343c4f237" +dependencies = [ + "arrow", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "hashbrown 0.14.5", + 
"log", + "object_store", + "parking_lot", + "rand", + "tempfile", + "url", +] + [[package]] name = "datafusion-expr" version = "32.0.0" @@ -881,6 +1091,103 @@ dependencies = [ "strum_macros 0.25.3", ] +[[package]] +name = "datafusion-optimizer" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f2904a432f795484fd45e29ded4537152adb60f636c05691db34fcd94c92c96" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hashbrown 0.14.5", + "itertools 0.11.0", + "log", + "regex-syntax 0.7.5", +] + +[[package]] +name = "datafusion-physical-expr" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57b4968e9a998dc0476c4db7a82f280e2026b25f464e4aa0c3bb9807ee63ddfd" +dependencies = [ + "ahash 0.8.11", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema 47.0.0", + "base64 0.21.7", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-expr", + "half", + "hashbrown 0.14.5", + "hex", + "indexmap 2.6.0", + "itertools 0.11.0", + "libc", + "log", + "md-5", + "paste", + "petgraph", + "rand", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-physical-plan" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efd0d1fe54e37a47a2d58a1232c22786f2c28ad35805fdcd08f0253a8b0aaa90" +dependencies = [ + "ahash 0.8.11", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema 47.0.0", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap 2.6.0", + "itertools 0.11.0", + "log", + "once_cell", + "parking_lot", + "pin-project-lite", + "rand", + "tokio", + "uuid", +] + +[[package]] +name = "datafusion-sql" +version = "32.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b568d44c87ead99604d704f942e257c8a236ee1bbf890ee3e034ad659dcb2c21" +dependencies = [ + "arrow", + "arrow-schema 47.0.0", + "datafusion-common", + "datafusion-expr", + "log", + "sqlparser", +] + [[package]] name = "der" version = "0.7.9" @@ -925,6 +1232,12 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "dotenvy" version = "0.15.7" @@ -984,6 +1297,12 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "flatbuffers" version = "23.5.26" @@ -994,6 +1313,16 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "flate2" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "flume" version = "0.11.1" @@ -1034,6 +1363,7 @@ checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", @@ -1084,6 +1414,17 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "futures-sink" version = "0.3.31" @@ -1105,6 +1446,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", @@ -1242,6 +1584,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "iana-time-zone" version = "0.1.61" @@ -1443,12 +1791,27 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -1473,6 +1836,15 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.72" @@ -1606,6 +1978,36 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "lz4" +version = "1.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d1febb2b4a79ddd1980eede06a8f7902197960aa0383ffcfdd62fe723036725" +dependencies = [ + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.11.1+lz4-1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bd8c0d6c6ed0cd30b3652886bb8711dc4bb01d637a68105a3d5158039b418e6" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1784,6 +2186,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f930c88a43b1c3f6e776dfe495b4afab89882dbc81530c632db2ed65451ebcb4" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "humantime", + "itertools 0.11.0", + "parking_lot", + "percent-encoding", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + [[package]] name = "once_cell" version = "1.20.2" @@ -1797,6 +2220,7 @@ dependencies = [ "arrow-schema 53.2.0", "chrono", "crossbeam", + "datafusion", "datafusion-expr", "itertools 0.13.0", "optd-persistent", @@ -1821,6 +2245,15 @@ dependencies = [ "trait-variant", ] +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + [[package]] name = "ordered-float" version = "3.9.2" @@ -1893,6 +2326,40 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parquet" +version = "47.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0463cc3b256d5f50408c49a4be3a16674f4c8ceef60941709620a062b1f6bf4d" +dependencies = [ + "ahash 0.8.11", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema 47.0.0", + "arrow-select", + "base64 0.21.7", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "hashbrown 0.14.5", + "lz4", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd 0.12.4", +] + [[package]] name = "parse-zoneinfo" version = "0.3.1" @@ -1923,6 +2390,16 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap 2.6.0", +] + [[package]] name = "phf" version = "0.11.2" @@ -2361,6 +2838,15 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2538,6 +3024,12 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +[[package]] +name = "seq-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + [[package]] name = "serde" version = "1.0.215" @@ -2588,7 
+3080,7 @@ version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e28bdad6db2b8340e449f7108f020b3b092e8583a9e3fb82713e1d4e71fe817" dependencies = [ - "base64", + "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", @@ -2689,6 +3181,34 @@ dependencies = [ "serde", ] +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.5.7" @@ -2855,7 +3375,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64bb4714269afa44aef2755150a0fc19d756fb580a67db8885608cf02f47d06a" dependencies = [ "atoi", - "base64", + "base64 0.22.1", "bigdecimal", "bitflags 2.6.0", "byteorder", @@ -2902,7 +3422,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fa91a732d854c5d7726349bb4bb879bb9478993ceb764247660aee25f67c2f8" dependencies = [ "atoi", - "base64", + "base64 0.22.1", "bigdecimal", "bitflags 2.6.0", "byteorder", @@ -3123,6 +3643,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float 2.10.1", +] + [[package]] name = "time" version = "0.3.36" @@ 
-3198,6 +3729,7 @@ dependencies = [ "bytes", "libc", "mio", + "parking_lot", "pin-project-lite", "socket2", "tokio-macros", @@ -3226,6 +3758,19 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-util" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + [[package]] name = "toml_datetime" version = "0.6.8" @@ -3301,6 +3846,16 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + [[package]] name = "typenum" version = "1.17.0" @@ -3334,11 +3889,17 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" -version = "0.1.14" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "unicode_categories" @@ -3387,6 +3948,7 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ + "getrandom", "serde", ] @@ -3402,6 +3964,16 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -3488,6 +4060,15 @@ dependencies = [ "wasite", ] +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "windows-core" version = "0.52.0" @@ -3675,6 +4256,15 @@ dependencies = [ "tap", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yansi" version = "1.0.1" @@ -3774,3 +4364,50 @@ dependencies = [ "quote", "syn 2.0.87", ] + +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe 6.0.6", +] + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe 7.2.1", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/optd-cost-model/Cargo.toml b/optd-cost-model/Cargo.toml index 1d41af7..e8b22aa 100644 --- a/optd-cost-model/Cargo.toml +++ b/optd-cost-model/Cargo.toml @@ -2,6 +2,7 @@ name = "optd-cost-model" version = "0.1.0" edition = "2021" +authors = ["Yuanxin Cao", "Lan Lou", "Kunle Li"] [dependencies] optd-persistent = { path = "../optd-persistent", version = "0.1" } @@ -10,10 +11,15 @@ serde_json = "1.0" serde_with = { version = "3.7.0", features = ["json"] } arrow-schema = "53.2.0" datafusion-expr = "32.0.0" +datafusion = "32.0.0" ordered-float = "4.0" chrono = "0.4" itertools = "0.13" +assert_approx_eq = "1.1.0" +trait-variant = "0.1.2" +tokio = { version = "1.0.1", features = ["macros", "rt-multi-thread"] } [dev-dependencies] crossbeam = "0.8" rand = "0.8" +test-case = "3.3" diff --git a/optd-cost-model/src/cost/agg.rs b/optd-cost-model/src/cost/agg.rs index 8b13789..f5edc7a 100644 --- a/optd-cost-model/src/cost/agg.rs +++ b/optd-cost-model/src/cost/agg.rs @@ -1 +1,203 @@ +use crate::{ + common::{ + nodes::{ArcPredicateNode, PredicateType, ReprPredicateNode}, + predicates::{attr_index_pred::AttrIndexPred, list_pred::ListPred}, + properties::attr_ref::{AttrRef, BaseTableAttrRef}, + types::GroupId, + }, + cost_model::CostModelImpl, + stats::DEFAULT_NUM_DISTINCT, + storage::CostModelStorageManager, + CostModelError, CostModelResult, EstimatedStatistic, SemanticError, +}; +impl CostModelImpl { + pub async fn get_agg_row_cnt( + &self, + group_id: GroupId, + group_by: ArcPredicateNode, + ) -> CostModelResult { + let group_by = ListPred::from_pred_node(group_by).unwrap(); + if group_by.is_empty() { + 
Ok(EstimatedStatistic(1.0)) + } else { + // Multiply the n-distinct of all the group by columns. + // TODO: improve with multi-dimensional n-distinct + let mut row_cnt = 1; + + for node in &group_by.0.children { + match node.typ { + PredicateType::AttrIndex => { + let attr_ref = + AttrIndexPred::from_pred_node(node.clone()).ok_or_else(|| { + SemanticError::InvalidPredicate( + "Expected AttributeRef predicate".to_string(), + ) + })?; + if let AttrRef::BaseTableAttrRef(BaseTableAttrRef { table_id, attr_idx }) = + self.memo.get_attribute_ref(group_id, attr_ref.attr_index()) + { + // TODO: Only query ndistinct instead of all kinds of stats. + let stats_option = + self.get_attribute_comb_stats(table_id, &[attr_idx]).await?; + + let ndistinct = match stats_option { + Some(stats) => stats.ndistinct, + None => { + // The column type is not supported or stats are missing. + DEFAULT_NUM_DISTINCT + } + }; + row_cnt *= ndistinct; + } else { + // TODO: Handle derived attributes. + row_cnt *= DEFAULT_NUM_DISTINCT; + } + } + _ => { + // TODO: Consider the case where `GROUP BY 1`. 
+ panic!("GROUP BY must have attribute ref predicate"); + } + } + } + Ok(EstimatedStatistic(row_cnt as f64)) + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, ops::Deref}; + + use crate::{ + common::{ + predicates::constant_pred::ConstantType, + properties::Attribute, + types::{GroupId, TableId}, + values::Value, + }, + cost_model::tests::{ + attr_index, cnst, create_mock_cost_model, create_mock_cost_model_with_attr_types, + empty_list, empty_per_attr_stats, list, TestPerAttributeStats, TEST_ATTR1_BASE_INDEX, + TEST_ATTR2_BASE_INDEX, TEST_ATTR3_BASE_INDEX, TEST_GROUP1_ID, TEST_TABLE1_ID, + }, + stats::{utilities::simple_map::SimpleMap, MostCommonValues, DEFAULT_NUM_DISTINCT}, + EstimatedStatistic, + }; + + #[tokio::test] + async fn test_agg_no_stats() { + let cost_model = create_mock_cost_model_with_attr_types( + vec![TEST_TABLE1_ID], + vec![], + vec![HashMap::from([ + (TEST_ATTR1_BASE_INDEX, ConstantType::Int32), + (TEST_ATTR2_BASE_INDEX, ConstantType::Int32), + ])], + vec![None], + ); + + // Group by empty list should return 1. + let group_bys = empty_list(); + assert_eq!( + cost_model + .get_agg_row_cnt(TEST_GROUP1_ID, group_bys) + .await + .unwrap(), + EstimatedStatistic(1.0) + ); + + // Group by single column should return the default value since there are no stats. + let group_bys = list(vec![attr_index(0)]); + assert_eq!( + cost_model + .get_agg_row_cnt(TEST_GROUP1_ID, group_bys) + .await + .unwrap(), + EstimatedStatistic(DEFAULT_NUM_DISTINCT as f64) + ); + + // Group by two columns should return the default value squared since there are no stats. 
+ let group_bys = list(vec![attr_index(0), attr_index(1)]); + assert_eq!( + cost_model + .get_agg_row_cnt(TEST_GROUP1_ID, group_bys) + .await + .unwrap(), + EstimatedStatistic((DEFAULT_NUM_DISTINCT * DEFAULT_NUM_DISTINCT) as f64) + ); + } + + #[tokio::test] + async fn test_agg_with_stats() { + let attr1_ndistinct = 12; + let attr2_ndistinct = 645; + let attr1_stats = TestPerAttributeStats::new( + MostCommonValues::SimpleFrequency(SimpleMap::default()), + None, + attr1_ndistinct, + 0.0, + ); + let attr2_stats = TestPerAttributeStats::new( + MostCommonValues::SimpleFrequency(SimpleMap::default()), + None, + attr2_ndistinct, + 0.0, + ); + + let cost_model = create_mock_cost_model_with_attr_types( + vec![TEST_TABLE1_ID], + vec![HashMap::from([ + (TEST_ATTR1_BASE_INDEX, attr1_stats), + (TEST_ATTR2_BASE_INDEX, attr2_stats), + ])], + vec![HashMap::from([ + (TEST_ATTR1_BASE_INDEX, ConstantType::Int32), + (TEST_ATTR2_BASE_INDEX, ConstantType::Int32), + (TEST_ATTR3_BASE_INDEX, ConstantType::Int32), + ])], + vec![None], + ); + + // Group by empty list should return 1. + let group_bys = empty_list(); + assert_eq!( + cost_model + .get_agg_row_cnt(TEST_GROUP1_ID, group_bys) + .await + .unwrap(), + EstimatedStatistic(1.0) + ); + + // Group by single column should return the n-distinct of the column. + let group_bys = list(vec![attr_index(0)]); + assert_eq!( + cost_model + .get_agg_row_cnt(TEST_GROUP1_ID, group_bys) + .await + .unwrap(), + EstimatedStatistic(attr1_ndistinct as f64) + ); + + // Group by two columns should return the product of the n-distinct of the columns. + let group_bys = list(vec![attr_index(0), attr_index(1)]); + assert_eq!( + cost_model + .get_agg_row_cnt(TEST_GROUP1_ID, group_bys) + .await + .unwrap(), + EstimatedStatistic((attr1_ndistinct * attr2_ndistinct) as f64) + ); + + // Group by multiple columns should return the product of the n-distinct of the columns. If one of the columns + // does not have stats, it should use the default value instead. 
+ let group_bys = list(vec![attr_index(0), attr_index(1), attr_index(2)]); + assert_eq!( + cost_model + .get_agg_row_cnt(TEST_GROUP1_ID, group_bys) + .await + .unwrap(), + EstimatedStatistic((attr1_ndistinct * attr2_ndistinct * DEFAULT_NUM_DISTINCT) as f64) + ); + } +} diff --git a/optd-cost-model/src/cost/filter.rs b/optd-cost-model/src/cost/filter.rs index 8b13789..e69de29 100644 --- a/optd-cost-model/src/cost/filter.rs +++ b/optd-cost-model/src/cost/filter.rs @@ -1 +0,0 @@ - diff --git a/optd-cost-model/src/cost/join.rs b/optd-cost-model/src/cost/join.rs index 8b13789..e69de29 100644 --- a/optd-cost-model/src/cost/join.rs +++ b/optd-cost-model/src/cost/join.rs @@ -1 +0,0 @@ - diff --git a/optd-cost-model/src/cost/limit.rs b/optd-cost-model/src/cost/limit.rs new file mode 100644 index 0000000..c63c0e0 --- /dev/null +++ b/optd-cost-model/src/cost/limit.rs @@ -0,0 +1,28 @@ +use crate::{ + common::{ + nodes::{ArcPredicateNode, ReprPredicateNode}, + predicates::constant_pred::ConstantPred, + }, + cost_model::CostModelImpl, + storage::CostModelStorageManager, + CostModelResult, EstimatedStatistic, +}; + +impl CostModelImpl { + pub(crate) fn get_limit_row_cnt( + &self, + child_row_cnt: EstimatedStatistic, + fetch_expr: ArcPredicateNode, + ) -> CostModelResult { + let fetch = ConstantPred::from_pred_node(fetch_expr) + .unwrap() + .value() + .as_u64(); + // u64::MAX represents None + if fetch == u64::MAX { + Ok(child_row_cnt) + } else { + Ok(EstimatedStatistic(child_row_cnt.0.min(fetch as f64))) + } + } +} diff --git a/optd-cost-model/src/cost/mod.rs b/optd-cost-model/src/cost/mod.rs index 795ed3e..c98d7d7 100644 --- a/optd-cost-model/src/cost/mod.rs +++ b/optd-cost-model/src/cost/mod.rs @@ -1,3 +1,6 @@ +#![allow(unused)] + pub mod agg; pub mod filter; pub mod join; +pub mod limit; diff --git a/optd-cost-model/src/cost_model.rs b/optd-cost-model/src/cost_model.rs index e933add..4583484 100644 --- a/optd-cost-model/src/cost_model.rs +++ b/optd-cost-model/src/cost_model.rs 
@@ -13,33 +13,34 @@ use crate::{ types::{AttrId, EpochId, ExprId, TableId}, }, memo_ext::MemoExt, + stats::AttributeCombValueStats, storage::CostModelStorageManager, ComputeCostContext, Cost, CostModel, CostModelResult, EstimatedStatistic, StatValue, }; /// TODO: documentation -pub struct CostModelImpl { - storage_manager: CostModelStorageManager, - default_catalog_source: CatalogSource, - _memo: Arc, +pub struct CostModelImpl { + pub storage_manager: S, + pub default_catalog_source: CatalogSource, + pub memo: Arc, } -impl CostModelImpl { +impl CostModelImpl { /// TODO: documentation pub fn new( - storage_manager: CostModelStorageManager, + storage_manager: S, default_catalog_source: CatalogSource, memo: Arc, ) -> Self { Self { storage_manager, default_catalog_source, - _memo: memo, + memo, } } } -impl CostModel for CostModelImpl { +impl CostModel for CostModelImpl { fn compute_operation_cost( &self, node: &PhysicalNodeType, @@ -71,7 +72,6 @@ impl CostModel for CostModelImpl { fn get_table_statistic_for_analysis( &self, - // TODO: i32 should be changed to TableId. table_id: TableId, stat_type: StatType, epoch_id: Option, @@ -96,3 +96,534 @@ impl CostModel for CostModelImpl { todo!() } } + +impl CostModelImpl { + /// TODO: documentation + /// TODO: if we have memory cache, + /// we should add the reference. 
(&AttributeCombValueStats) + pub(crate) async fn get_attribute_comb_stats( + &self, + table_id: TableId, + attr_comb: &[u64], + ) -> CostModelResult> { + self.storage_manager + .get_attributes_comb_statistics(table_id, attr_comb) + .await + } +} + +/// I thought about using the system's own parser and planner to generate these expression trees, +/// but this is not currently feasible because it would create a cyclic dependency between +/// optd-datafusion-bridge and optd-datafusion-repr +#[cfg(test)] +pub mod tests { + use std::{collections::HashMap, hash::Hash}; + + use arrow_schema::DataType; + use itertools::Itertools; + use optd_persistent::cost_model::interface::CatalogSource; + use serde::{Deserialize, Serialize}; + + use crate::{ + common::{ + nodes::ReprPredicateNode, + predicates::{ + attr_index_pred::AttrIndexPred, + bin_op_pred::{BinOpPred, BinOpType}, + cast_pred::CastPred, + constant_pred::{ConstantPred, ConstantType}, + in_list_pred::InListPred, + like_pred::LikePred, + list_pred::ListPred, + log_op_pred::{LogOpPred, LogOpType}, + un_op_pred::{UnOpPred, UnOpType}, + }, + properties::{ + attr_ref::{AttrRef, GroupAttrRefs}, + schema::Schema, + Attribute, + }, + types::GroupId, + values::Value, + }, + memo_ext::tests::{MemoGroupInfo, MockMemoExtImpl}, + stats::{ + utilities::{counter::Counter, simple_map::SimpleMap}, + AttributeCombValueStats, Distribution, MostCommonValues, + }, + storage::mock::{CostModelStorageMockManagerImpl, TableStats}, + }; + + use super::*; + + pub const TEST_TABLE1_ID: TableId = TableId(0); + pub const TEST_TABLE2_ID: TableId = TableId(1); + pub const TEST_TABLE3_ID: TableId = TableId(2); + pub const TEST_TABLE4_ID: TableId = TableId(3); + + pub const TEST_GROUP1_ID: GroupId = GroupId(0); + pub const TEST_GROUP2_ID: GroupId = GroupId(1); + pub const TEST_GROUP3_ID: GroupId = GroupId(2); + pub const TEST_GROUP4_ID: GroupId = GroupId(3); + + // This is base index rather than ref index. 
+ pub const TEST_ATTR1_BASE_INDEX: u64 = 0; + pub const TEST_ATTR2_BASE_INDEX: u64 = 1; + pub const TEST_ATTR3_BASE_INDEX: u64 = 2; + + pub const TEST_ATTR1_NAME: &str = "attr1"; + pub const TEST_ATTR2_NAME: &str = "attr2"; + pub const TEST_ATTR3_NAME: &str = "attr3"; + pub const TEST_ATTR4_NAME: &str = "attr4"; + + pub type TestPerAttributeStats = AttributeCombValueStats; + // TODO: add tests for non-mock storage manager + pub type TestOptCostModelMock = CostModelImpl; + + // Use this method, we only create one group `TEST_GROUP1_ID` in the memo. + // We put the first attribute in the first table as the ref index 0 in the group. + // And put the second attribute in the first table as the ref index 1 in the group. + // etc. + // The orders of attributes and tables are defined by the order of their ids (smaller first). + pub fn create_mock_cost_model( + table_id: Vec, + // u64 should be base attribute index. + per_attribute_stats: Vec>, + row_counts: Vec>, + ) -> TestOptCostModelMock { + let attr_ids: Vec<(TableId, u64, Option)> = per_attribute_stats + .iter() + .enumerate() + .map(|(idx, m)| (table_id[idx], m)) + .flat_map(|(table_id, m)| { + m.iter() + .map(|(attr_idx, _)| (table_id, *attr_idx, None)) + .collect_vec() + }) + .sorted_by_key(|(table_id, attr_idx, _)| (*table_id, *attr_idx)) + .collect(); + create_mock_cost_model_with_memo( + table_id.clone(), + per_attribute_stats, + row_counts, + create_one_group_all_base_attributes_mock_memo(attr_ids), + ) + } + + pub fn create_mock_cost_model_with_attr_types( + table_id: Vec, + // u64 should be base attribute index. 
+ per_attribute_stats: Vec>, + attributes: Vec>, + row_counts: Vec>, + ) -> TestOptCostModelMock { + let attr_ids: Vec<(TableId, u64, Option)> = attributes + .iter() + .enumerate() + .map(|(idx, m)| (table_id[idx], m)) + .flat_map(|(table_id, m)| { + m.iter() + .map(|(attr_idx, typ)| (table_id, *attr_idx, Some(*typ))) + .collect_vec() + }) + .sorted_by_key(|(table_id, attr_idx, _)| (*table_id, *attr_idx)) + .collect(); + create_mock_cost_model_with_memo( + table_id.clone(), + per_attribute_stats, + row_counts, + create_one_group_all_base_attributes_mock_memo(attr_ids), + ) + } + + pub fn create_mock_cost_model_with_memo( + table_id: Vec, + per_attribute_stats: Vec>, + row_counts: Vec>, + memo: MockMemoExtImpl, + ) -> TestOptCostModelMock { + let storage_manager = CostModelStorageMockManagerImpl::new( + table_id + .into_iter() + .zip(per_attribute_stats) + .zip(row_counts) + .map(|((table_id, per_attr_stats), row_count)| { + ( + table_id, + TableStats::new( + row_count.unwrap_or(100), + per_attr_stats + .into_iter() + .map(|(attr_idx, stats)| (vec![attr_idx], stats)) + .collect(), + ), + ) + }) + .collect(), + ); + CostModelImpl::new(storage_manager, CatalogSource::Mock, Arc::new(memo)) + } + + // attributes: Vec<(TableId, AttrBaseIndex)> + pub fn create_one_group_all_base_attributes_mock_memo( + attr_ids: Vec<(TableId, u64, Option)>, + ) -> MockMemoExtImpl { + let group_info = MemoGroupInfo::new( + Schema::new( + attr_ids + .clone() + .into_iter() + .map(|(_, _, typ)| Attribute { + name: "attr".to_string(), + typ: typ.unwrap_or(ConstantType::Int64), + nullable: false, + }) + .collect(), + ), + GroupAttrRefs::new( + attr_ids + .into_iter() + .map(|(table_id, attr_base_index, _)| { + AttrRef::new_base_table_attr_ref(table_id, attr_base_index) + }) + .collect(), + None, + ), + ); + MockMemoExtImpl::from(HashMap::from([(TEST_GROUP1_ID, group_info)])) + } + + /// Create a cost model with two tables, each with one attribute. Each attribute has 100 values. 
+ pub fn create_two_table_mock_cost_model( + tbl1_per_attr_stats: TestPerAttributeStats, + tbl2_per_attr_stats: TestPerAttributeStats, + additional_memo: Option>, + ) -> TestOptCostModelMock { + create_two_table_mock_cost_model_custom_row_cnts( + tbl1_per_attr_stats, + tbl2_per_attr_stats, + 100, + 100, + additional_memo, + ) + } + + /// Create a cost model with three tables, each with one attribute. Each attribute has 100 values. + pub fn create_three_table_mock_cost_model( + tbl1_per_column_stats: TestPerAttributeStats, + tbl2_per_column_stats: TestPerAttributeStats, + tbl3_per_column_stats: TestPerAttributeStats, + ) -> TestOptCostModelMock { + let storage_manager = CostModelStorageMockManagerImpl::new( + vec![ + ( + TEST_TABLE1_ID, + TableStats::new( + 100, + vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE2_ID, + TableStats::new( + 100, + vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE3_ID, + TableStats::new( + 100, + vec![(vec![0], tbl3_per_column_stats)].into_iter().collect(), + ), + ), + ] + .into_iter() + .collect(), + ); + let memo = HashMap::from([ + ( + TEST_GROUP1_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP2_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP3_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR3_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE3_ID, 0)], + None, + ), + ), + ), + ]); + CostModelImpl::new( + storage_manager, + CatalogSource::Mock, + Arc::new(MockMemoExtImpl::from(memo)), + ) + } + + /// Create a cost model with four tables, each with 
one attribute. Each attribute has 100 values. + pub fn create_four_table_mock_cost_model( + tbl1_per_column_stats: TestPerAttributeStats, + tbl2_per_column_stats: TestPerAttributeStats, + tbl3_per_column_stats: TestPerAttributeStats, + tbl4_per_column_stats: TestPerAttributeStats, + ) -> TestOptCostModelMock { + let storage_manager = CostModelStorageMockManagerImpl::new( + vec![ + ( + TEST_TABLE1_ID, + TableStats::new( + 100, + vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE2_ID, + TableStats::new( + 100, + vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE3_ID, + TableStats::new( + 100, + vec![(vec![0], tbl3_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE4_ID, + TableStats::new( + 100, + vec![(vec![0], tbl4_per_column_stats)].into_iter().collect(), + ), + ), + ] + .into_iter() + .collect(), + ); + let memo = HashMap::from([ + ( + TEST_GROUP1_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP2_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP3_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR3_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE3_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP4_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR4_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE4_ID, 0)], + None, + ), + ), + ), + ]); + CostModelImpl::new( + storage_manager, + CatalogSource::Mock, + Arc::new(MockMemoExtImpl::from(memo)), + ) + } + + /// We need custom row counts because 
some join algorithms rely on the row cnt + pub fn create_two_table_mock_cost_model_custom_row_cnts( + tbl1_per_column_stats: TestPerAttributeStats, + tbl2_per_column_stats: TestPerAttributeStats, + tbl1_row_cnt: u64, + tbl2_row_cnt: u64, + additional_memo: Option>, + ) -> TestOptCostModelMock { + let storage_manager = CostModelStorageMockManagerImpl::new( + vec![ + ( + TEST_TABLE1_ID, + TableStats::new( + tbl1_row_cnt, + vec![(vec![0], tbl1_per_column_stats)].into_iter().collect(), + ), + ), + ( + TEST_TABLE2_ID, + TableStats::new( + tbl2_row_cnt, + vec![(vec![0], tbl2_per_column_stats)].into_iter().collect(), + ), + ), + ] + .into_iter() + .collect(), + ); + let mut memo = HashMap::from([ + ( + TEST_GROUP1_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR1_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE1_ID, 0)], + None, + ), + ), + ), + ( + TEST_GROUP2_ID, + MemoGroupInfo::new( + vec![Attribute::new_non_null_int64(TEST_ATTR2_NAME.to_string())].into(), + GroupAttrRefs::new( + vec![AttrRef::new_base_table_attr_ref(TEST_TABLE2_ID, 0)], + None, + ), + ), + ), + ]); + if let Some(additional_memo) = additional_memo { + memo.extend(additional_memo); + } + CostModelImpl::new( + storage_manager, + CatalogSource::Mock, + Arc::new(MockMemoExtImpl::from(memo)), + ) + } + + impl TestOptCostModelMock { + pub fn get_row_count(&self, table_id: TableId) -> u64 { + self.storage_manager + .per_table_stats_map + .get(&table_id) + .map(|stats| stats.row_cnt) + .unwrap_or(0) + } + + pub fn get_attr_refs(&self, group_id: GroupId) -> GroupAttrRefs { + self.memo.get_attribute_refs(group_id) + } + } + + pub fn attr_index(attr_index: u64) -> ArcPredicateNode { + AttrIndexPred::new(attr_index).into_pred_node() + } + + pub fn cnst(value: Value) -> ArcPredicateNode { + ConstantPred::new(value).into_pred_node() + } + + pub fn cast(child: ArcPredicateNode, cast_type: DataType) -> ArcPredicateNode { + CastPred::new(child, 
cast_type).into_pred_node() + } + + pub fn bin_op( + op_type: BinOpType, + left: ArcPredicateNode, + right: ArcPredicateNode, + ) -> ArcPredicateNode { + BinOpPred::new(left, right, op_type).into_pred_node() + } + + pub fn log_op(op_type: LogOpType, children: Vec) -> ArcPredicateNode { + LogOpPred::new(op_type, children).into_pred_node() + } + + pub fn un_op(op_type: UnOpType, child: ArcPredicateNode) -> ArcPredicateNode { + UnOpPred::new(child, op_type).into_pred_node() + } + + pub fn empty_list() -> ArcPredicateNode { + ListPred::new(vec![]).into_pred_node() + } + + pub fn list(children: Vec) -> ArcPredicateNode { + ListPred::new(children).into_pred_node() + } + + pub fn in_list(attr_idx: u64, list: Vec, negated: bool) -> InListPred { + InListPred::new( + attr_index(attr_idx), + ListPred::new(list.into_iter().map(cnst).collect_vec()), + negated, + ) + } + + pub fn like(attr_idx: u64, pattern: &str, negated: bool) -> LikePred { + LikePred::new( + negated, + false, + attr_index(attr_idx), + cnst(Value::String(pattern.into())), + ) + } + + pub(crate) fn empty_per_attr_stats() -> TestPerAttributeStats { + TestPerAttributeStats::new( + MostCommonValues::empty(), + Some(Distribution::empty()), + 0, + 0.0, + ) + } + + pub(crate) fn per_attr_stats_with_ndistinct(ndistinct: u64) -> TestPerAttributeStats { + TestPerAttributeStats::new( + MostCommonValues::empty(), + Some(Distribution::empty()), + ndistinct, + 0.0, + ) + } + + pub(crate) fn per_attr_stats_with_dist_and_ndistinct( + dist: Vec<(Value, f64)>, + ndistinct: u64, + ) -> TestPerAttributeStats { + TestPerAttributeStats::new( + MostCommonValues::empty(), + Some(Distribution::SimpleDistribution(SimpleMap::new(dist))), + ndistinct, + 0.0, + ) + } +} diff --git a/optd-cost-model/src/lib.rs b/optd-cost-model/src/lib.rs index 5417f1c..13774b2 100644 --- a/optd-cost-model/src/lib.rs +++ b/optd-cost-model/src/lib.rs @@ -33,7 +33,8 @@ pub struct Cost(pub Vec); /// Estimated statistic calculated by the cost model. 
/// It is the estimated output row count of the targeted expression. -pub struct EstimatedStatistic(pub u64); +#[derive(PartialEq, PartialOrd, Debug)] +pub struct EstimatedStatistic(pub f64); pub type CostModelResult = Result; @@ -42,12 +43,13 @@ pub enum SemanticError { // TODO: Add more error types UnknownStatisticType, VersionedStatisticNotFound, - AttributeNotFound(TableId, i32), // (table_id, attribute_base_index) + AttributeNotFound(TableId, u64), // (table_id, attribute_base_index) + // FIXME: not sure if this should be put here + InvalidPredicate(String), } #[derive(Debug)] pub enum CostModelError { - // TODO: Add more error types ORMError(BackendError), SemanticError(SemanticError), } @@ -58,6 +60,12 @@ impl From for CostModelError { } } +impl From for CostModelError { + fn from(err: SemanticError) -> Self { + CostModelError::SemanticError(err) + } +} + pub trait CostModel: 'static + Send + Sync { /// TODO: documentation fn compute_operation_cost( diff --git a/optd-cost-model/src/memo_ext.rs b/optd-cost-model/src/memo_ext.rs index 16cddca..c7827c5 100644 --- a/optd-cost-model/src/memo_ext.rs +++ b/optd-cost-model/src/memo_ext.rs @@ -1,5 +1,9 @@ use crate::common::{ - properties::{attr_ref::GroupAttrRefs, schema::Schema, Attribute}, + properties::{ + attr_ref::{AttrRef, GroupAttrRefs}, + schema::Schema, + Attribute, + }, types::GroupId, }; @@ -13,10 +17,78 @@ use crate::common::{ pub trait MemoExt: Send + Sync + 'static { /// Get the schema of a group in the memo. fn get_schema(&self, group_id: GroupId) -> Schema; - /// Get the attribute reference of a group in the memo. - fn get_attribute_ref(&self, group_id: GroupId) -> GroupAttrRefs; - /// Get the attribute information of a given attribute in a group in the memo. + /// Get the attribute info of a given attribute in a group in the memo. fn get_attribute_info(&self, group_id: GroupId, attr_ref_idx: u64) -> Attribute; + /// Get the attribute reference of a group in the memo. 
+ fn get_attribute_refs(&self, group_id: GroupId) -> GroupAttrRefs; + /// Get the attribute reference of a given attribute in a group in the memo. + fn get_attribute_ref(&self, group_id: GroupId, attr_ref_idx: u64) -> AttrRef; // TODO: Figure out what other information is needed to compute the cost... } + +#[cfg(test)] +pub mod tests { + use std::collections::HashMap; + + use crate::common::{ + properties::{ + attr_ref::{AttrRef, GroupAttrRefs}, + schema::Schema, + Attribute, + }, + types::GroupId, + }; + + pub struct MemoGroupInfo { + pub schema: Schema, + pub attr_refs: GroupAttrRefs, + } + + impl MemoGroupInfo { + pub fn new(schema: Schema, attr_refs: GroupAttrRefs) -> Self { + Self { schema, attr_refs } + } + } + + #[derive(Default)] + pub struct MockMemoExtImpl { + memo: HashMap, + } + + impl MockMemoExtImpl { + pub fn add_group_info( + &mut self, + group_id: GroupId, + schema: Schema, + attr_ref: GroupAttrRefs, + ) { + self.memo + .insert(group_id, MemoGroupInfo::new(schema, attr_ref)); + } + } + + impl super::MemoExt for MockMemoExtImpl { + fn get_schema(&self, group_id: GroupId) -> Schema { + self.memo.get(&group_id).unwrap().schema.clone() + } + + fn get_attribute_info(&self, group_id: GroupId, attr_ref_idx: u64) -> Attribute { + self.memo.get(&group_id).unwrap().schema.attributes[attr_ref_idx as usize].clone() + } + + fn get_attribute_refs(&self, group_id: GroupId) -> GroupAttrRefs { + self.memo.get(&group_id).unwrap().attr_refs.clone() + } + + fn get_attribute_ref(&self, group_id: GroupId, attr_ref_idx: u64) -> AttrRef { + self.memo.get(&group_id).unwrap().attr_refs.attr_refs()[attr_ref_idx as usize].clone() + } + } + + impl From> for MockMemoExtImpl { + fn from(memo: HashMap) -> Self { + Self { memo } + } + } +} diff --git a/optd-cost-model/src/stats/mod.rs b/optd-cost-model/src/stats/mod.rs index 0b1396a..7ec2510 100644 --- a/optd-cost-model/src/stats/mod.rs +++ b/optd-cost-model/src/stats/mod.rs @@ -1,12 +1,15 @@ #![allow(unused)] mod arith_encoder; 
-pub mod counter; -pub mod tdigest; +pub mod utilities; use crate::common::values::Value; -use counter::Counter; use serde::{Deserialize, Serialize}; +use utilities::counter::Counter; +use utilities::{ + simple_map::{self, SimpleMap}, + tdigest::TDigest, +}; // Default n-distinct estimate for derived columns or columns lacking statistics pub const DEFAULT_NUM_DISTINCT: u64 = 200; @@ -27,10 +30,12 @@ pub const FIXED_CHAR_SEL_FACTOR: f64 = 0.2; pub type AttributeCombValue = Vec>; -#[derive(Serialize, Deserialize, Debug)] +// TODO: remove the clone, see the comment in the [`AttributeCombValueStats`] +#[derive(Serialize, Deserialize, Debug, Clone)] #[serde(tag = "type")] pub enum MostCommonValues { Counter(Counter), + SimpleFrequency(SimpleMap), // Add more types here... } @@ -43,12 +48,14 @@ impl MostCommonValues { pub fn freq(&self, value: &AttributeCombValue) -> Option { match self { MostCommonValues::Counter(counter) => counter.frequencies().get(value).copied(), + MostCommonValues::SimpleFrequency(simple_map) => simple_map.m.get(value).copied(), } } pub fn total_freq(&self) -> f64 { match self { MostCommonValues::Counter(counter) => counter.frequencies().values().sum(), + MostCommonValues::SimpleFrequency(simple_map) => simple_map.m.values().sum(), } } @@ -60,6 +67,12 @@ impl MostCommonValues { .filter(|(val, _)| pred(val)) .map(|(_, freq)| freq) .sum(), + MostCommonValues::SimpleFrequency(simple_map) => simple_map + .m + .iter() + .filter(|(val, _)| pred(val)) + .map(|(_, freq)| freq) + .sum(), } } @@ -67,14 +80,21 @@ impl MostCommonValues { pub fn cnt(&self) -> usize { match self { MostCommonValues::Counter(counter) => counter.frequencies().len(), + MostCommonValues::SimpleFrequency(simple_map) => simple_map.m.len(), } } + + pub fn empty() -> Self { + MostCommonValues::SimpleFrequency(SimpleMap::new(vec![])) + } } -#[derive(Serialize, Deserialize, Debug)] +// TODO: remove the clone, see the comment in the [`AttributeCombValueStats`] +#[derive(Serialize, 
Deserialize, Debug, Clone)] #[serde(tag = "type")] pub enum Distribution { - TDigest(tdigest::TDigest), + TDigest(TDigest), + SimpleDistribution(SimpleMap), // Add more types here... } @@ -89,11 +109,25 @@ impl Distribution { tdigest.centroids.len() as f64 * tdigest.cdf(value) / nb_rows as f64 } } + Distribution::SimpleDistribution(simple_distribution) => { + *simple_distribution.m.get(value).unwrap_or(&0.0) + } } } + + pub fn empty() -> Self { + Distribution::SimpleDistribution(SimpleMap::new(vec![])) + } } -#[derive(Serialize, Deserialize, Debug)] +// TODO: Remove the clone. Now I have to add this because +// persistent.rs doesn't have a memory cache, so we have to +// return AttributeCombValueStats rather than &AttributeCombValueStats. +// But this poses a problem for mock.rs when testing, since mock storage +// only has memory hash map, so we need to return a clone of AttributeCombValueStats. +// Later, if memory cache is added, we should change this to return a reference. +// **and** remove the clone. +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct AttributeCombValueStats { pub mcvs: MostCommonValues, // Does NOT contain full nulls. pub distr: Option, // Does NOT contain mcvs; optional. @@ -104,9 +138,9 @@ pub struct AttributeCombValueStats { impl AttributeCombValueStats { pub fn new( mcvs: MostCommonValues, + distr: Option, ndistinct: u64, null_frac: f64, - distr: Option, ) -> Self { Self { mcvs, diff --git a/optd-cost-model/src/stats/counter.rs b/optd-cost-model/src/stats/utilities/counter.rs similarity index 95% rename from optd-cost-model/src/stats/counter.rs rename to optd-cost-model/src/stats/utilities/counter.rs index 65a2d63..368700c 100644 --- a/optd-cost-model/src/stats/counter.rs +++ b/optd-cost-model/src/stats/utilities/counter.rs @@ -5,8 +5,9 @@ use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; /// The Counter structure to track exact frequencies of fixed elements. 
+/// TODO: remove the clone, see the comment in the [`AttributeCombValueStats`] #[serde_with::serde_as] -#[derive(Default, Serialize, Deserialize, Debug)] +#[derive(Default, Serialize, Deserialize, Debug, Clone)] pub struct Counter { #[serde_as(as = "HashMap")] counts: HashMap, // The exact counts of an element T. @@ -32,6 +33,13 @@ where } } + pub fn new_from_existing(counts: HashMap, total_count: i32) -> Self { + Counter:: { + counts, + total_count, + } + } + // Inserts an element in the Counter if it is being tracked. fn insert_element(&mut self, elem: T, occ: i32) { if let Some(frequency) = self.counts.get_mut(&elem) { diff --git a/optd-cost-model/src/stats/utilities/mod.rs b/optd-cost-model/src/stats/utilities/mod.rs new file mode 100644 index 0000000..0a7903b --- /dev/null +++ b/optd-cost-model/src/stats/utilities/mod.rs @@ -0,0 +1,3 @@ +pub mod counter; +pub mod simple_map; +pub mod tdigest; diff --git a/optd-cost-model/src/stats/utilities/simple_map.rs b/optd-cost-model/src/stats/utilities/simple_map.rs new file mode 100644 index 0000000..d04439e --- /dev/null +++ b/optd-cost-model/src/stats/utilities/simple_map.rs @@ -0,0 +1,21 @@ +use std::collections::HashMap; +use std::hash::Hash; + +use serde::{Deserialize, Serialize}; + +use crate::common::values::Value; + +/// TODO: documentation +/// Now it is mainly for testing purposes. 
+#[derive(Clone, Serialize, Deserialize, Debug, Default)] +pub struct SimpleMap { + pub(crate) m: HashMap, +} + +impl SimpleMap { + pub fn new(v: Vec<(K, f64)>) -> Self { + Self { + m: v.into_iter().collect(), + } + } +} diff --git a/optd-cost-model/src/stats/tdigest.rs b/optd-cost-model/src/stats/utilities/tdigest.rs similarity index 99% rename from optd-cost-model/src/stats/tdigest.rs rename to optd-cost-model/src/stats/utilities/tdigest.rs index 83dc9b5..96a2269 100644 --- a/optd-cost-model/src/stats/tdigest.rs +++ b/optd-cost-model/src/stats/utilities/tdigest.rs @@ -15,9 +15,7 @@ use std::marker::PhantomData; use itertools::Itertools; use serde::{Deserialize, Serialize}; -use crate::common::values::Value; - -use super::arith_encoder; +use crate::{common::values::Value, stats::arith_encoder}; pub const DEFAULT_COMPRESSION: f64 = 200.0; diff --git a/optd-cost-model/src/storage/mock.rs b/optd-cost-model/src/storage/mock.rs new file mode 100644 index 0000000..d878bcb --- /dev/null +++ b/optd-cost-model/src/storage/mock.rs @@ -0,0 +1,61 @@ +#![allow(unused_variables, dead_code)] +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::{common::types::TableId, stats::AttributeCombValueStats, CostModelResult}; + +use super::CostModelStorageManager; + +pub type AttrIndices = Vec; + +#[serde_with::serde_as] +#[derive(Serialize, Deserialize, Debug)] +pub struct TableStats { + pub row_cnt: u64, + #[serde_as(as = "HashMap")] + pub column_comb_stats: HashMap, +} + +impl TableStats { + pub fn new( + row_cnt: u64, + column_comb_stats: HashMap, + ) -> Self { + Self { + row_cnt, + column_comb_stats, + } + } +} + +pub type BaseTableStats = HashMap; + +pub struct CostModelStorageMockManagerImpl { + pub(crate) per_table_stats_map: BaseTableStats, +} + +impl CostModelStorageMockManagerImpl { + pub fn new(per_table_stats_map: BaseTableStats) -> Self { + Self { + per_table_stats_map, + } + } +} + +impl CostModelStorageManager for 
CostModelStorageMockManagerImpl { + async fn get_attributes_comb_statistics( + &self, + table_id: TableId, + attr_base_indices: &[u64], + ) -> CostModelResult> { + let table_stats = self.per_table_stats_map.get(&table_id); + match table_stats { + None => Ok(None), + Some(table_stats) => match table_stats.column_comb_stats.get(attr_base_indices) { + None => Ok(None), + Some(stats) => Ok(Some(stats.clone())), + }, + } + } +} diff --git a/optd-cost-model/src/storage/mod.rs b/optd-cost-model/src/storage/mod.rs new file mode 100644 index 0000000..d3d26cd --- /dev/null +++ b/optd-cost-model/src/storage/mod.rs @@ -0,0 +1,13 @@ +use crate::{common::types::TableId, stats::AttributeCombValueStats, CostModelResult}; + +pub mod mock; +pub mod persistent; + +#[trait_variant::make(Send)] +pub trait CostModelStorageManager { + async fn get_attributes_comb_statistics( + &self, + table_id: TableId, + attr_base_indices: &[u64], + ) -> CostModelResult>; +} diff --git a/optd-cost-model/src/storage.rs b/optd-cost-model/src/storage/persistent.rs similarity index 71% rename from optd-cost-model/src/storage.rs rename to optd-cost-model/src/storage/persistent.rs index 5538618..e029270 100644 --- a/optd-cost-model/src/storage.rs +++ b/optd-cost-model/src/storage/persistent.rs @@ -1,44 +1,33 @@ #![allow(unused_variables)] use std::sync::Arc; -use optd_persistent::{ - cost_model::interface::{Attr, StatType}, - CostModelStorageLayer, -}; +use optd_persistent::{cost_model::interface::StatType, CostModelStorageLayer}; use crate::{ common::types::TableId, - stats::{counter::Counter, AttributeCombValueStats, Distribution, MostCommonValues}, + stats::{utilities::counter::Counter, AttributeCombValueStats, Distribution, MostCommonValues}, CostModelResult, }; +use super::CostModelStorageManager; + /// TODO: documentation -pub struct CostModelStorageManager { +pub struct CostModelStorageManagerImpl { pub backend_manager: Arc, // TODO: in-memory cache } -impl CostModelStorageManager { +impl 
CostModelStorageManagerImpl { pub fn new(backend_manager: Arc) -> Self { Self { backend_manager } } +} - /// Gets the attribute information for a given table and attribute base index. - /// - /// TODO: if we have memory cache, - /// we should add the reference. (&Attr) - pub async fn get_attribute_info( - &self, - table_id: TableId, - attr_base_index: i32, - ) -> CostModelResult> { - Ok(self - .backend_manager - .get_attribute(table_id.into(), attr_base_index) - .await?) - } - - /// Gets the latest statistics for a given table. +impl CostModelStorageManager + for CostModelStorageManagerImpl +{ + /// Gets the latest statistics for a given table. Currently we only support base table + /// statistic retrieval. /// /// TODO: Currently, in `AttributeCombValueStats`, only `Distribution` is optional. /// This poses a question about the behavior of the system if there is no corresponding @@ -50,16 +39,16 @@ impl CostModelStorageManager { /// /// TODO: Shall we pass in an epoch here to make sure that the statistics are from the same /// epoch? 
- pub async fn get_attributes_comb_statistics( + async fn get_attributes_comb_statistics( &self, table_id: TableId, - attr_base_indices: &[i32], + attr_base_indices: &[u64], ) -> CostModelResult> { let dist: Option = self .backend_manager .get_stats_for_attr_indices_based( table_id.into(), - attr_base_indices.to_vec(), + attr_base_indices.iter().map(|&x| x as i32).collect(), StatType::Distribution, None, ) @@ -70,7 +59,7 @@ impl CostModelStorageManager { .backend_manager .get_stats_for_attr_indices_based( table_id.into(), - attr_base_indices.to_vec(), + attr_base_indices.iter().map(|&x| x as i32).collect(), StatType::MostCommonValues, None, ) @@ -82,7 +71,7 @@ impl CostModelStorageManager { .backend_manager .get_stats_for_attr_indices_based( table_id.into(), - attr_base_indices.to_vec(), + attr_base_indices.iter().map(|&x| x as i32).collect(), StatType::Cardinality, None, ) @@ -94,7 +83,7 @@ impl CostModelStorageManager { .backend_manager .get_stats_for_attr_indices_based( table_id.into(), - attr_base_indices.to_vec(), + attr_base_indices.iter().map(|&x| x as i32).collect(), StatType::TableRowCount, None, ) @@ -105,7 +94,7 @@ impl CostModelStorageManager { .backend_manager .get_stats_for_attr_indices_based( table_id.into(), - attr_base_indices.to_vec(), + attr_base_indices.iter().map(|&x| x as i32).collect(), StatType::NonNullCount, None, ) @@ -123,9 +112,9 @@ impl CostModelStorageManager { }; Ok(Some(AttributeCombValueStats::new( - mcvs, ndistinct, null_frac, dist, + mcvs, dist, ndistinct, null_frac, ))) } -} -// TODO: add some tests, especially cover the error cases. + // TODO: Support querying for a specific type of statistics. 
+} diff --git a/optd-persistent/Cargo.toml b/optd-persistent/Cargo.toml index c576100..50af728 100644 --- a/optd-persistent/Cargo.toml +++ b/optd-persistent/Cargo.toml @@ -21,3 +21,4 @@ trait-variant = "0.1.2" async-trait = "0.1.43" async-stream = "0.3.1" strum = "0.26.1" +num_enum = "0.7.3" diff --git a/optd-persistent/src/cost_model/interface.rs b/optd-persistent/src/cost_model/interface.rs index a03087f..ee767d7 100644 --- a/optd-persistent/src/cost_model/interface.rs +++ b/optd-persistent/src/cost_model/interface.rs @@ -4,6 +4,7 @@ use crate::entities::cascades_group; use crate::entities::logical_expression; use crate::entities::physical_expression; use crate::StorageResult; +use num_enum::{IntoPrimitive, TryFromPrimitive}; use sea_orm::prelude::Json; use sea_orm::*; use sea_orm_migration::prelude::*; @@ -16,6 +17,7 @@ pub type AttrId = i32; pub type ExprId = i32; pub type EpochId = i32; pub type StatId = i32; +pub type AttrIndex = i32; /// TODO: documentation pub enum CatalogSource { @@ -24,8 +26,10 @@ pub enum CatalogSource { } /// TODO: documentation +#[repr(i32)] +#[derive(Copy, Clone, Debug, PartialEq, IntoPrimitive, TryFromPrimitive)] pub enum AttrType { - Integer, + Integer = 1, Float, Varchar, Boolean, @@ -96,7 +100,7 @@ pub struct Attr { pub table_id: i32, pub name: String, pub compression_method: String, - pub attr_type: i32, + pub attr_type: AttrType, pub base_index: i32, pub nullable: bool, } @@ -149,7 +153,7 @@ pub trait CostModelStorageLayer { async fn get_stats_for_attr_indices_based( &self, table_id: TableId, - attr_base_indices: Vec, + attr_base_indices: Vec, stat_type: StatType, epoch_id: Option, ) -> StorageResult>; @@ -165,6 +169,6 @@ pub trait CostModelStorageLayer { async fn get_attribute( &self, table_id: TableId, - attribute_base_index: i32, + attribute_base_index: AttrIndex, ) -> StorageResult>; } diff --git a/optd-persistent/src/cost_model/orm.rs b/optd-persistent/src/cost_model/orm.rs index 5b56476..d5b7ad6 100644 --- 
a/optd-persistent/src/cost_model/orm.rs +++ b/optd-persistent/src/cost_model/orm.rs @@ -14,7 +14,8 @@ use serde_json::json; use super::catalog::mock_catalog::{self, MockCatalog}; use super::interface::{ - Attr, AttrId, CatalogSource, EpochId, EpochOption, ExprId, Stat, StatId, StatType, TableId, + Attr, AttrId, AttrIndex, AttrType, CatalogSource, EpochId, EpochOption, ExprId, Stat, StatId, + StatType, TableId, }; impl BackendManager { @@ -434,7 +435,7 @@ impl CostModelStorageLayer for BackendManager { async fn get_stats_for_attr_indices_based( &self, table_id: TableId, - attr_base_indices: Vec, + attr_base_indices: Vec, stat_type: StatType, epoch_id: Option, ) -> StorageResult> { @@ -549,21 +550,30 @@ impl CostModelStorageLayer for BackendManager { async fn get_attribute( &self, table_id: TableId, - attribute_base_index: i32, + attribute_base_index: AttrIndex, ) -> StorageResult> { - Ok(Attribute::find() + let attr_res = Attribute::find() .filter(attribute::Column::TableId.eq(table_id)) .filter(attribute::Column::BaseAttributeNumber.eq(attribute_base_index)) .one(&self.db) - .await? - .map(|attr| Attr { - table_id, - name: attr.name, - compression_method: attr.compression_method, - attr_type: attr.variant_tag, - base_index: attribute_base_index, - nullable: !attr.is_not_null, - })) + .await?; + match attr_res { + Some(attr) => match AttrType::try_from(attr.variant_tag) { + Ok(attr_type) => Ok(Some(Attr { + table_id: attr.table_id, + name: attr.name, + compression_method: attr.compression_method, + attr_type, + base_index: attr.base_attribute_number, + nullable: !attr.is_not_null, + })), + Err(_) => Err(BackendError::BackendError(format!( + "Failed to convert variant tag {} to AttrType", + attr.variant_tag + ))), + }, + None => Ok(None), + } } }