diff --git a/Cargo.lock b/Cargo.lock index f7c7e6f8c1994..899516fb30c4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4904,6 +4904,7 @@ dependencies = [ "databend-common-config", "databend-common-exception", "databend-common-expression", + "databend-common-functions", "databend-common-io", "databend-common-license", "databend-common-management", diff --git a/src/query/ee/Cargo.toml b/src/query/ee/Cargo.toml index 4dbfd61c2273b..59b7cc4b840ad 100644 --- a/src/query/ee/Cargo.toml +++ b/src/query/ee/Cargo.toml @@ -66,6 +66,7 @@ typetag = { workspace = true } uuid = { workspace = true } [dev-dependencies] +databend-common-functions = { workspace = true } jsonb = { workspace = true } tantivy = { workspace = true } diff --git a/src/query/ee/tests/it/vector_index/pruning.rs b/src/query/ee/tests/it/vector_index/pruning.rs index 9b862ed7071ff..cba61b8faffd1 100644 --- a/src/query/ee/tests/it/vector_index/pruning.rs +++ b/src/query/ee/tests/it/vector_index/pruning.rs @@ -17,23 +17,31 @@ use std::sync::Arc; use databend_common_ast::ast::Engine; use databend_common_base::base::tokio; +use databend_common_catalog::plan::Filters; use databend_common_catalog::plan::PushDownInfo; use databend_common_catalog::plan::VectorIndexInfo; use databend_common_exception::Result; +use databend_common_expression::type_check::check_function; use databend_common_expression::types::number::UInt64Type; use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberScalar; use databend_common_expression::types::VectorColumn; use databend_common_expression::types::VectorDataType; use databend_common_expression::types::F32; use databend_common_expression::Column; +use databend_common_expression::ColumnRef; +use databend_common_expression::Constant; use databend_common_expression::DataBlock; +use databend_common_expression::Expr; use databend_common_expression::FromData; use databend_common_expression::RemoteExpr; +use databend_common_expression::Scalar; use databend_common_expression::TableDataType; use databend_common_expression::TableField; use databend_common_expression::TableSchemaRef; use databend_common_expression::TableSchemaRefExt; +use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_meta_app::schema::CreateOption; use databend_common_meta_app::schema::TableIndex; use databend_common_meta_app::schema::TableIndexType; @@ -675,46 +683,73 @@ async fn test_block_pruner() -> Result<()> { ]; let results = vec![ + // First query: cosine_distance with query_values1 vec![ vec![ - (0, 0, 0, 0.005022526), - (0, 0, 9, 0.05992174), - (0, 0, 1, 0.09289217), + VectorScoreResult::new(0, 0, 0, 0.005022526), + VectorScoreResult::new(0, 0, 9, 0.05992174), + VectorScoreResult::new(0, 0, 1, 0.09289217), + ], + vec![ + VectorScoreResult::new(1, 0, 9, 0.05186367), + VectorScoreResult::new(1, 0, 5, 0.07403374), ], - vec![(1, 0, 9, 0.05186367), (1, 0, 5, 0.07403374)], ], + // Second query: l1_distance with query_values1 vec![ - vec![(0, 0, 0, 0.0), (0, 0, 9, 0.84269863), (0, 0, 1, 1.0792456)], - vec![(0, 4, 2, 0.9375271)], - vec![(1, 0, 9, 0.7167929)], + vec![ + VectorScoreResult::new(0, 0, 0, 0.0), + VectorScoreResult::new(0, 0, 9, 0.84269863), + VectorScoreResult::new(0, 0, 1, 1.0792456), + ], + vec![VectorScoreResult::new(0, 4, 2, 0.9375271)], + vec![VectorScoreResult::new(1, 0, 9, 0.7167929)], ], - vec![vec![(0, 0, 0, 3.5187712), (0, 0, 9, 3.5518785)], vec![ - (1, 3, 6, 3.4702706), - (1, 3, 7, 3.5206928), - (1, 3, 1, 3.556445), - ]], 
+ // Third query: l2_distance with query_values1 + vec![ + vec![ + VectorScoreResult::new(0, 0, 0, 3.5187712), + VectorScoreResult::new(0, 0, 9, 3.5518785), + ], + vec![ + VectorScoreResult::new(1, 3, 6, 3.4702706), + VectorScoreResult::new(1, 3, 7, 3.5206928), + VectorScoreResult::new(1, 3, 1, 3.556445), + ], + ], + // Fourth query: cosine_distance with query_values2 vec![ - vec![(0, 1, 6, 0.18258381)], - vec![(0, 3, 8, 0.15948296)], - vec![(1, 1, 8, 0.008677483), (1, 1, 7, 0.21170044)], - vec![(1, 4, 8, 0.0657177)], + vec![VectorScoreResult::new(0, 1, 6, 0.18258381)], + vec![VectorScoreResult::new(0, 3, 8, 0.15948296)], + vec![ + VectorScoreResult::new(1, 1, 8, 0.008677483), + VectorScoreResult::new(1, 1, 7, 0.21170044), + ], + vec![VectorScoreResult::new(1, 4, 8, 0.0657177)], ], + // Fifth query: l1_distance with query_values2 vec![ - vec![(0, 1, 6, 0.7965471)], - vec![(0, 2, 7, 1.3045802)], - vec![(1, 1, 8, 0.0)], - vec![(1, 4, 8, 0.8538904), (1, 4, 7, 1.021619)], + vec![VectorScoreResult::new(0, 1, 6, 0.7965471)], + vec![VectorScoreResult::new(0, 2, 7, 1.3045802)], + vec![VectorScoreResult::new(1, 1, 8, 0.0)], + vec![ + VectorScoreResult::new(1, 4, 8, 0.8538904), + VectorScoreResult::new(1, 4, 7, 1.021619), + ], ], - vec![vec![(1, 1, 8, 3.4763064)], vec![ - (1, 3, 5, 3.4903116), - (1, 3, 9, 3.4926815), - (1, 3, 0, 3.527872), - (1, 3, 8, 3.560473), + // Sixth query: l2_distance with query_values2 + vec![vec![VectorScoreResult::new(1, 1, 8, 3.4763064)], vec![ + VectorScoreResult::new(1, 3, 5, 3.4903116), + VectorScoreResult::new(1, 3, 9, 3.4926815), + VectorScoreResult::new(1, 3, 0, 3.527872), + VectorScoreResult::new(1, 3, 8, 3.560473), ]], ]; let mut extras = Vec::new(); - for ((func_name, query_values), result) in query_values.into_iter().zip(results.into_iter()) { + for ((func_name, query_values), result) in + query_values.clone().into_iter().zip(results.into_iter()) + { let mut vector_index = vector_index.clone(); vector_index.func_name = func_name; vector_index.query_values = query_values; @@ -727,6 +762,223 @@ async fn test_block_pruner() -> Result<()> { extras.push((Some(extra), result)); } + // Add a filter to test filter pushdown + let filter_arg0_expr = Expr::ColumnRef(ColumnRef { + span: None, + id: "_vector_score".to_string(), + data_type: DataType::Number(NumberDataType::Float32), + display_name: "_vector_score".to_string(), + }); + let filter_arg1_expr = Expr::Constant(Constant { + span: None, + scalar: Scalar::Number(NumberScalar::Float32(F32::from(2.0))), + data_type: DataType::Number(NumberDataType::Float32), + }); + let filter = check_function( + None, + "gt", + &[], + &[filter_arg0_expr, filter_arg1_expr], + &BUILTIN_FUNCTIONS, + )?; + let inverted_filter = check_function(None, "not", &[], &[filter.clone()], &BUILTIN_FUNCTIONS)?; + + let filters = Filters { + filter: filter.as_remote_expr(), + inverted_filter: inverted_filter.as_remote_expr(), + }; + + let filter_results = vec![ + // First query: cosine_distance with filter and query_values1 + vec![], + // Second query: l1_distance with filter and query_values1 + vec![ + vec![ + VectorScoreResult::new(0, 0, 0, 0.0), + VectorScoreResult::new(0, 0, 1, 1.0792456), + VectorScoreResult::new(0, 0, 2, 3.7108307), + VectorScoreResult::new(0, 0, 3, 3.2820892), + VectorScoreResult::new(0, 0, 4, 2.6611536), + VectorScoreResult::new(0, 0, 5, 2.0845702), + VectorScoreResult::new(0, 0, 6, 3.6664782), + VectorScoreResult::new(0, 0, 7, 3.6369097), + VectorScoreResult::new(0, 0, 8, 4.361335), + VectorScoreResult::new(0, 0, 9, 
0.84269863), + ], + vec![ + VectorScoreResult::new(0, 1, 0, 1.9089664), + VectorScoreResult::new(0, 1, 1, 1.6205615), + VectorScoreResult::new(0, 1, 2, 2.6231122), + VectorScoreResult::new(0, 1, 3, 2.375908), + VectorScoreResult::new(0, 1, 4, 2.8565829), + VectorScoreResult::new(0, 1, 5, 3.26859), + VectorScoreResult::new(0, 1, 6, 2.3896415), + VectorScoreResult::new(0, 1, 7, 2.11497), + VectorScoreResult::new(0, 1, 8, 3.1175208), + VectorScoreResult::new(0, 1, 9, 1.9913678), + ], + vec![ + VectorScoreResult::new(0, 2, 0, 1.5041043), + VectorScoreResult::new(0, 2, 1, 3.1616886), + VectorScoreResult::new(0, 2, 2, 2.0873284), + VectorScoreResult::new(0, 2, 3, 4.6504445), + VectorScoreResult::new(0, 2, 4, 1.6268883), + VectorScoreResult::new(0, 2, 5, 1.9338483), + VectorScoreResult::new(0, 2, 6, 5.4485407), + VectorScoreResult::new(0, 2, 7, 3.7449126), + VectorScoreResult::new(0, 2, 8, 2.3789403), + VectorScoreResult::new(0, 2, 9, 2.5017245), + ], + vec![ + VectorScoreResult::new(0, 3, 0, 2.1560328), + VectorScoreResult::new(0, 3, 1, 3.256665), + VectorScoreResult::new(0, 3, 2, 2.0957243), + VectorScoreResult::new(0, 3, 3, 1.5981783), + VectorScoreResult::new(0, 3, 4, 3.4074366), + VectorScoreResult::new(0, 3, 5, 1.9751071), + VectorScoreResult::new(0, 3, 6, 3.151125), + VectorScoreResult::new(0, 3, 7, 3.0908165), + VectorScoreResult::new(0, 3, 8, 3.543131), + VectorScoreResult::new(0, 3, 9, 1.3117124), + ], + vec![ + VectorScoreResult::new(1, 0, 0, 2.1198769), + VectorScoreResult::new(1, 0, 1, 4.0567427), + VectorScoreResult::new(1, 0, 2, 2.8214188), + VectorScoreResult::new(1, 0, 3, 4.499019), + VectorScoreResult::new(1, 0, 4, 2.4858987), + VectorScoreResult::new(1, 0, 5, 1.1590694), + VectorScoreResult::new(1, 0, 6, 3.8737319), + VectorScoreResult::new(1, 0, 7, 2.074124), + VectorScoreResult::new(1, 0, 8, 4.8192883), + VectorScoreResult::new(1, 0, 9, 0.7167929), + ], + ], + // Third query: l2_distance with filter and query_values1 + vec![ + vec![ + VectorScoreResult::new(0, 0, 0, 3.5187712), + VectorScoreResult::new(0, 0, 1, 3.5688834), + VectorScoreResult::new(0, 0, 2, 4.158467), + VectorScoreResult::new(0, 0, 3, 3.8972623), + VectorScoreResult::new(0, 0, 4, 3.9402654), + VectorScoreResult::new(0, 0, 5, 3.8295133), + VectorScoreResult::new(0, 0, 6, 4.16049), + VectorScoreResult::new(0, 0, 7, 4.1032534), + VectorScoreResult::new(0, 0, 8, 4.2184787), + VectorScoreResult::new(0, 0, 9, 3.5518785), + ], + vec![ + VectorScoreResult::new(1, 3, 0, 3.9212408), + VectorScoreResult::new(1, 3, 1, 3.556445), + VectorScoreResult::new(1, 3, 2, 3.7241573), + VectorScoreResult::new(1, 3, 3, 4.1835504), + VectorScoreResult::new(1, 3, 4, 3.6078227), + VectorScoreResult::new(1, 3, 5, 3.5779395), + VectorScoreResult::new(1, 3, 6, 3.4702706), + VectorScoreResult::new(1, 3, 7, 3.5206928), + VectorScoreResult::new(1, 3, 8, 3.8251386), + VectorScoreResult::new(1, 3, 9, 3.616931), + ], + ], + // Fourth query: cosine_distance with filter and query_values2 + vec![], + // Fifth query: l1_distance with filter and query_values2 + vec![ + vec![ + VectorScoreResult::new(0, 2, 0, 2.2254603), + VectorScoreResult::new(0, 2, 1, 2.8700764), + VectorScoreResult::new(0, 2, 2, 2.9007723), + VectorScoreResult::new(0, 2, 3, 2.1487203), + VectorScoreResult::new(0, 2, 4, 1.3966682), + VectorScoreResult::new(0, 2, 5, 3.1463404), + VectorScoreResult::new(0, 2, 6, 2.9468164), + VectorScoreResult::new(0, 2, 7, 1.3045802), + VectorScoreResult::new(0, 2, 8, 2.0566323), + VectorScoreResult::new(0, 2, 9, 1.9645443), + ], + vec![ + 
VectorScoreResult::new(0, 4, 0, 3.383887), + VectorScoreResult::new(0, 4, 1, 2.329169), + VectorScoreResult::new(0, 4, 2, 2.7686348), + VectorScoreResult::new(0, 4, 3, 3.0909097), + VectorScoreResult::new(0, 4, 4, 2.2852223), + VectorScoreResult::new(0, 4, 5, 3.3545892), + VectorScoreResult::new(0, 4, 6, 2.9151235), + VectorScoreResult::new(0, 4, 7, 1.7139168), + VectorScoreResult::new(0, 4, 8, 2.0068939), + VectorScoreResult::new(0, 4, 9, 3.0762608), + ], + vec![ + VectorScoreResult::new(1, 0, 0, 2.0131204), + VectorScoreResult::new(1, 0, 1, 4.071994), + VectorScoreResult::new(1, 0, 2, 2.8366697), + VectorScoreResult::new(1, 0, 3, 2.3181388), + VectorScoreResult::new(1, 0, 4, 1.921615), + VectorScoreResult::new(1, 0, 5, 3.4619572), + VectorScoreResult::new(1, 0, 6, 3.0959353), + VectorScoreResult::new(1, 0, 7, 2.3943932), + VectorScoreResult::new(1, 0, 8, 2.9434261), + VectorScoreResult::new(1, 0, 9, 2.3486407), + ], + vec![ + VectorScoreResult::new(1, 3, 0, 2.011335), + VectorScoreResult::new(1, 3, 1, 2.4690871), + VectorScoreResult::new(1, 3, 2, 2.5800574), + VectorScoreResult::new(1, 3, 3, 2.5523148), + VectorScoreResult::new(1, 3, 4, 3.5649178), + VectorScoreResult::new(1, 3, 5, 2.1223052), + VectorScoreResult::new(1, 3, 6, 3.25975), + VectorScoreResult::new(1, 3, 7, 1.9281074), + VectorScoreResult::new(1, 3, 8, 2.4968297), + VectorScoreResult::new(1, 3, 9, 2.011335), + ], + ], + // Sixth query: l2_distance with filter and query_values2 + vec![ + vec![ + VectorScoreResult::new(1, 1, 0, 3.819309), + VectorScoreResult::new(1, 1, 1, 3.8105543), + VectorScoreResult::new(1, 1, 2, 3.8291273), + VectorScoreResult::new(1, 1, 3, 3.7389958), + VectorScoreResult::new(1, 1, 4, 3.5913217), + VectorScoreResult::new(1, 1, 5, 3.870244), + VectorScoreResult::new(1, 1, 6, 3.7941854), + VectorScoreResult::new(1, 1, 7, 3.585766), + VectorScoreResult::new(1, 1, 8, 3.4763064), + VectorScoreResult::new(1, 1, 9, 3.709784), + ], + vec![ + VectorScoreResult::new(1, 3, 0, 3.527872), + VectorScoreResult::new(1, 3, 1, 3.5928586), + VectorScoreResult::new(1, 3, 2, 3.5736349), + VectorScoreResult::new(1, 3, 3, 3.6239994), + VectorScoreResult::new(1, 3, 4, 3.8320742), + VectorScoreResult::new(1, 3, 5, 3.4903116), + VectorScoreResult::new(1, 3, 6, 3.70468), + VectorScoreResult::new(1, 3, 7, 3.586185), + VectorScoreResult::new(1, 3, 8, 3.560473), + VectorScoreResult::new(1, 3, 9, 3.4926815), + ], + ], + ]; + + for ((func_name, query_values), result) in + query_values.into_iter().zip(filter_results.into_iter()) + { + let mut vector_index = vector_index.clone(); + vector_index.func_name = func_name; + vector_index.query_values = query_values; + let extra = PushDownInfo { + limit: Some(5), + filters: Some(filters.clone()), + order_by: vec![(orderby_expr.clone(), true, false)], + vector_index: Some(vector_index), + ..Default::default() + }; + extras.push((Some(extra), result)); + } + for (extra, expected_results) in extras { let block_metas = apply_block_pruning( snapshot.clone(), @@ -737,6 +989,7 @@ async fn test_block_pruner() -> Result<()> { fuse_table.bloom_index_cols(), ) .await?; + assert_eq!(block_metas.len(), expected_results.len()); for ((block_meta_index, _), expected_scores) in block_metas.iter().zip(expected_results.iter()) @@ -745,13 +998,32 @@ async fn test_block_pruner() -> Result<()> { let vector_scores = block_meta_index.vector_scores.clone().unwrap(); assert_eq!(vector_scores.len(), expected_scores.len()); for (vector_score, expected_score) in vector_scores.iter().zip(expected_scores) { - 
assert_eq!(block_meta_index.segment_idx, expected_score.0); - assert_eq!(block_meta_index.block_idx, expected_score.1); - assert_eq!(vector_score.0, expected_score.2); - assert_eq!(vector_score.1, expected_score.3); + assert_eq!(block_meta_index.segment_idx, expected_score.segment_idx); + assert_eq!(block_meta_index.block_idx, expected_score.block_idx); + assert_eq!(vector_score.0, expected_score.vector_idx); + assert_eq!(vector_score.1, expected_score.score); } } } Ok(()) } + +#[derive(Debug, Clone)] +struct VectorScoreResult { + segment_idx: usize, + block_idx: usize, + vector_idx: usize, + score: f32, +} + +impl VectorScoreResult { + fn new(segment_idx: usize, block_idx: usize, vector_idx: usize, score: f32) -> Self { + Self { + segment_idx, + block_idx, + vector_idx, + score, + } + } +} diff --git a/src/query/sql/src/planner/optimizer/optimizers/rule/factory.rs b/src/query/sql/src/planner/optimizer/optimizers/rule/factory.rs index e9f0efde8a9f8..3eaf189d6a6d9 100644 --- a/src/query/sql/src/planner/optimizer/optimizers/rule/factory.rs +++ b/src/query/sql/src/planner/optimizer/optimizers/rule/factory.rs @@ -52,6 +52,7 @@ use crate::optimizer::optimizers::rule::RulePushDownLimitWindow; use crate::optimizer::optimizers::rule::RulePushDownPrewhere; use crate::optimizer::optimizers::rule::RulePushDownRankLimitAggregate; use crate::optimizer::optimizers::rule::RulePushDownSortEvalScalar; +use crate::optimizer::optimizers::rule::RulePushDownSortFilterScan; use crate::optimizer::optimizers::rule::RulePushDownSortScan; use crate::optimizer::optimizers::rule::RuleSemiToInnerJoin; use crate::optimizer::optimizers::rule::RuleSplitAggregate; @@ -79,6 +80,7 @@ impl RuleFactory { RuleID::PushDownLimitUnion => Ok(Box::new(RulePushDownLimitUnion::new())), RuleID::PushDownLimitScan => Ok(Box::new(RulePushDownLimitScan::new())), RuleID::PushDownSortScan => Ok(Box::new(RulePushDownSortScan::new())), + RuleID::PushDownSortFilterScan => Ok(Box::new(RulePushDownSortFilterScan::new())), RuleID::PushDownSortEvalScalar => { Ok(Box::new(RulePushDownSortEvalScalar::new(metadata))) } diff --git a/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/mod.rs b/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/mod.rs index 1d1cc63a6f2c6..efb64999d52f5 100644 --- a/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/mod.rs +++ b/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/mod.rs @@ -26,6 +26,7 @@ mod rule_push_down_filter_window; mod rule_push_down_filter_window_top_n; mod rule_push_down_prewhere; mod rule_push_down_sort_expression; +mod rule_push_down_sort_filter_scan; mod rule_push_down_sort_scan; pub use rule_eliminate_filter::RuleEliminateFilter; @@ -42,4 +43,5 @@ pub use rule_push_down_filter_window::RulePushDownFilterWindow; pub use rule_push_down_filter_window_top_n::RulePushDownFilterWindowTopN; pub use rule_push_down_prewhere::RulePushDownPrewhere; pub use rule_push_down_sort_expression::RulePushDownSortEvalScalar; +pub use rule_push_down_sort_filter_scan::RulePushDownSortFilterScan; pub use rule_push_down_sort_scan::RulePushDownSortScan; diff --git a/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/rule_push_down_prewhere.rs b/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/rule_push_down_prewhere.rs index 8a5dd9c3d1c00..eba60ad5b6b10 100644 --- a/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/rule_push_down_prewhere.rs +++ 
b/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/rule_push_down_prewhere.rs @@ -38,6 +38,14 @@ use crate::IndexType; use crate::MetadataRef; use crate::Visibility; +/// Input: Filter +/// \ +/// Scan +/// +/// Output: +/// Filter +/// \ +/// Scan(padding prewhere) pub struct RulePushDownPrewhere { id: RuleID, matchers: Vec, diff --git a/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/rule_push_down_sort_filter_scan.rs b/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/rule_push_down_sort_filter_scan.rs new file mode 100644 index 0000000000000..0e59f5a9ef477 --- /dev/null +++ b/src/query/sql/src/planner/optimizer/optimizers/rule/filter_rules/rule_push_down_sort_filter_scan.rs @@ -0,0 +1,162 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_exception::Result; + +use crate::optimizer::ir::Matcher; +use crate::optimizer::ir::SExpr; +use crate::optimizer::optimizers::rule::Rule; +use crate::optimizer::optimizers::rule::RuleID; +use crate::optimizer::optimizers::rule::TransformResult; +use crate::plans::Filter; +use crate::plans::RelOp; +use crate::plans::RelOperator; +use crate::plans::Scan; +use crate::plans::Sort; + +/// Input: +/// (1) Sort +/// \ +/// Filter +/// \ +/// Scan +/// (2) Sort +/// \ +/// EvalScalar +/// \ +/// Filter +/// \ +/// Scan +/// +/// Output: +/// (1) Sort +/// \ +/// Filter +/// \ +/// Scan(padding order_by and limit) +/// (2) Sort +/// \ +/// EvalScalar +/// \ +/// Filter +/// \ +/// Scan(padding order_by and limit) +pub struct RulePushDownSortFilterScan { + id: RuleID, + matchers: Vec, +} + +impl RulePushDownSortFilterScan { + pub fn new() -> Self { + Self { + id: RuleID::PushDownSortFilterScan, + matchers: vec![ + Matcher::MatchOp { + op_type: RelOp::Sort, + children: vec![Matcher::MatchOp { + op_type: RelOp::Filter, + children: vec![Matcher::MatchOp { + op_type: RelOp::Scan, + children: vec![], + }], + }], + }, + Matcher::MatchOp { + op_type: RelOp::Sort, + children: vec![Matcher::MatchOp { + op_type: RelOp::EvalScalar, + children: vec![Matcher::MatchOp { + op_type: RelOp::Filter, + children: vec![Matcher::MatchOp { + op_type: RelOp::Scan, + children: vec![], + }], + }], + }], + }, + ], + } + } +} + +impl Rule for RulePushDownSortFilterScan { + fn id(&self) -> RuleID { + self.id + } + + fn apply(&self, s_expr: &SExpr, state: &mut TransformResult) -> Result<()> { + let sort: Sort = s_expr.plan().clone().try_into()?; + let child = s_expr.child(0)?; + let (eval_scalar, filter, mut scan) = match child.plan() { + RelOperator::Filter(filter) => { + let grand_child = child.child(0)?; + let scan: Scan = grand_child.plan().clone().try_into()?; + (None, filter.clone(), scan) + } + RelOperator::EvalScalar(eval_scalar) => { + let child = child.child(0)?; + let filter: Filter = child.plan().clone().try_into()?; + let grand_child = child.child(0)?; + let scan: Scan = grand_child.plan().clone().try_into()?; + (Some(eval_scalar.clone()), 
filter, scan) + } + _ => unreachable!(), + }; + + // The following conditions must be met to push down filter and sort for vector index: + // 1. Scan must contain `vector_index`, because the pushed down sort and filter are only used for vector index pruning. + // 2. The number of `push_down_predicates` in Scan must be the same as the number of `predicates` + // in Filter to ensure that all filter conditions are pushed down. + // (Filter `predicates` has been pushed down in `RulePushDownFilterScan` rule.) + // 3. Sort must have limit in order to prune unused blocks. + let push_down_predicates = scan.push_down_predicates.clone().unwrap_or_default(); + if scan.vector_index.is_none() + || push_down_predicates.len() != filter.predicates.len() + || sort.limit.is_none() + { + return Ok(()); + } + + scan.order_by = Some(sort.items); + scan.limit = sort.limit; + + let new_scan = SExpr::create_leaf(Arc::new(RelOperator::Scan(scan))); + + let mut result = if eval_scalar.is_some() { + let grandchild = child.child(0)?; + let new_filter = grandchild.replace_children(vec![Arc::new(new_scan)]); + let new_eval_scalar = child.replace_children(vec![Arc::new(new_filter)]); + s_expr.replace_children(vec![Arc::new(new_eval_scalar)]) + } else { + let new_filter = child.replace_children(vec![Arc::new(new_scan)]); + s_expr.replace_children(vec![Arc::new(new_filter)]) + }; + + result.set_applied_rule(&self.id); + state.add_result(result); + Ok(()) + } + + fn matchers(&self) -> &[Matcher] { + &self.matchers + } +} + +impl Default for RulePushDownSortFilterScan { + fn default() -> Self { + Self::new() + } +} diff --git a/src/query/sql/src/planner/optimizer/optimizers/rule/rule.rs b/src/query/sql/src/planner/optimizer/optimizers/rule/rule.rs index eabd76d1e3402..a42c0c74470e3 100644 --- a/src/query/sql/src/planner/optimizer/optimizers/rule/rule.rs +++ b/src/query/sql/src/planner/optimizer/optimizers/rule/rule.rs @@ -58,6 +58,7 @@ pub static DEFAULT_REWRITE_RULES: LazyLock<Vec<RuleID>> = LazyLock::new(|| { RuleID::PushDownFilterScan, RuleID::PushDownPrewhere, /* PushDownPrwhere should be after all rules except PushDownFilterScan */ RuleID::PushDownSortScan, // PushDownSortScan should be after PushDownPrewhere + RuleID::PushDownSortFilterScan, // PushDownSortFilterScan should be after PushDownFilterScan RuleID::GroupingSetsToUnion, ] }); @@ -107,6 +108,7 @@ pub enum RuleID { PushDownLimitScan, PushDownSortEvalScalar, PushDownSortScan, + PushDownSortFilterScan, SemiToInnerJoin, EliminateEvalScalar, EliminateFilter, @@ -148,6 +150,7 @@ impl Display for RuleID { RuleID::PushDownFilterAggregate => write!(f, "PushDownFilterAggregate"), RuleID::PushDownLimitScan => write!(f, "PushDownLimitScan"), RuleID::PushDownSortScan => write!(f, "PushDownSortScan"), + RuleID::PushDownSortFilterScan => write!(f, "PushDownSortFilterScan"), RuleID::PushDownSortEvalScalar => write!(f, "PushDownSortEvalScalar"), RuleID::PushDownLimitWindow => write!(f, "PushDownLimitWindow"), RuleID::PushDownFilterWindow => write!(f, "PushDownFilterWindow"), diff --git a/src/query/storages/fuse/src/pruning/vector_index_pruner.rs b/src/query/storages/fuse/src/pruning/vector_index_pruner.rs index 0f1d8273d376d..08034b6c904c8 100644 --- a/src/query/storages/fuse/src/pruning/vector_index_pruner.rs +++ b/src/query/storages/fuse/src/pruning/vector_index_pruner.rs @@ -25,10 +25,19 @@ use databend_common_catalog::plan::Filters; use databend_common_catalog::plan::VectorIndexInfo; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::types::Buffer;
+use databend_common_expression::types::NumberColumn; use databend_common_expression::types::F32; +use databend_common_expression::BlockEntry; +use databend_common_expression::Column; +use databend_common_expression::DataBlock; +use databend_common_expression::Evaluator; +use databend_common_expression::Expr; +use databend_common_expression::FunctionContext; use databend_common_expression::RemoteExpr; use databend_common_expression::TableSchemaRef; use databend_common_expression::VECTOR_SCORE_COL_NAME; +use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_metrics::storage::metrics_inc_block_vector_index_pruning_milliseconds; use databend_common_metrics::storage::metrics_inc_blocks_vector_index_pruning_after; use databend_common_metrics::storage::metrics_inc_blocks_vector_index_pruning_before; @@ -36,6 +45,7 @@ use databend_common_metrics::storage::metrics_inc_bytes_block_vector_index_pruni use databend_common_metrics::storage::metrics_inc_bytes_block_vector_index_pruning_before; use databend_storages_common_index::DistanceType; use databend_storages_common_index::FixedLengthPriorityQueue; +use databend_storages_common_index::ScoredPointOffset; use databend_storages_common_io::ReadSettings; use databend_storages_common_pruner::BlockMetaIndex; use databend_storages_common_table_meta::meta::BlockMeta; @@ -49,15 +59,23 @@ type VectorPruningFutureReturn = Pin<Box<dyn Future<Output = Result<VectorPruneResult>> + Send>>; type VectorPruningFuture = Box<dyn FnOnce(OwnedSemaphorePermit) -> VectorPruningFutureReturn + Send + 'static>; +#[derive(Clone)] +struct VectorTopNParam { + has_filter: bool, + filter_expr: Option<Expr>, + asc: bool, + limit: usize, +} + /// Vector index pruner. #[derive(Clone)] pub struct VectorIndexPruner { + func_ctx: FunctionContext, pruning_ctx: Arc<PruningContext>, _schema: TableSchemaRef, vector_index: VectorIndexInfo, - filters: Option<Filters>, - sort: Vec<(RemoteExpr, bool, bool)>, - limit: Option<usize>, + vector_reader: VectorIndexReader, + vector_topn_param: Option<VectorTopNParam>, } impl VectorIndexPruner { pub fn new( @@ -69,154 +87,150 @@ impl VectorIndexPruner { sort: Vec<(RemoteExpr, bool, bool)>, limit: Option<usize>, ) -> Result<Self> { - Ok(Self { - pruning_ctx, - _schema: schema, - vector_index, - filters, - sort, - limit, - }) - } -} + let func_ctx = pruning_ctx.ctx.get_function_context()?; -impl VectorIndexPruner { - pub async fn prune( - &self, - metas: Vec<(BlockMetaIndex, Arc<BlockMeta>)>, - ) -> Result<Vec<(BlockMetaIndex, Arc<BlockMeta>)>> { - let settings = ReadSettings::from_ctx(&self.pruning_ctx.ctx)?; - let distance_type = match self.vector_index.func_name.as_str() { + let settings = ReadSettings::from_ctx(&pruning_ctx.ctx)?; + let distance_type = match vector_index.func_name.as_str() { + "cosine_distance" => DistanceType::Dot, "l1_distance" => DistanceType::L1, "l2_distance" => DistanceType::L2, _ => unreachable!(), }; let columns = vec![ - format!( - "{}-{}_graph_links", - self.vector_index.column_id, distance_type - ), - format!( - "{}-{}_graph_data", - self.vector_index.column_id, distance_type - ), + format!("{}-{}_graph_links", vector_index.column_id, distance_type), + format!("{}-{}_graph_data", vector_index.column_id, distance_type), format!( "{}-{}_encoded_u8_meta", - self.vector_index.column_id, distance_type + vector_index.column_id, distance_type ), format!( "{}-{}_encoded_u8_data", - self.vector_index.column_id, distance_type + vector_index.column_id, distance_type ), ]; - let query_values = unsafe { - std::mem::transmute::<Vec<F32>, Vec<f32>>(self.vector_index.query_values.clone()) - }; + let query_values = + unsafe { std::mem::transmute::<Vec<F32>, Vec<f32>>(vector_index.query_values.clone()) }; let vector_reader = VectorIndexReader::create( -
pruning_ctx.dal.clone(), settings, distance_type, columns, query_values, ); - // @TODO support filters - if self.filters.is_none() && !self.sort.is_empty() && self.limit.is_some() { - let (sort, asc, _nulls_first) = &self.sort[0]; - if let RemoteExpr::ColumnRef { id, .. } = sort { - if id == VECTOR_SCORE_COL_NAME && *asc { - let limit = self.limit.unwrap(); - return self - .vector_index_topn_prune(vector_reader, limit, metas) - .await; + // If the filter only has the vector score column, we can filter the scores. + let filter_expr = if let Some(filters) = &filters { + let filter = filters.filter.as_expr(&BUILTIN_FUNCTIONS); + let column_refs = filter.column_refs(); + if column_refs.len() == 1 && column_refs.contains_key(VECTOR_SCORE_COL_NAME) { + let filter_expr = filter.project_column_ref(|_| 0); + Some(filter_expr) + } else { + None + } + } else { + None + }; + + let mut vector_topn_param = None; + // If the first sort expr is the vector score column and has the limit value, + // we can do vector TopN prune to filter out the blocks. + if !sort.is_empty() && limit.is_some() { + let (sort_expr, asc, _nulls_first) = &sort[0]; + if let RemoteExpr::ColumnRef { id, .. } = sort_expr { + if id == VECTOR_SCORE_COL_NAME { + let limit = limit.unwrap(); + vector_topn_param = Some(VectorTopNParam { + has_filter: filters.is_some(), + filter_expr, + asc: *asc, + limit, + }); } } } - self.vector_index_prune(vector_reader, metas).await + Ok(Self { + func_ctx, + pruning_ctx, + _schema: schema, + vector_index, + vector_reader, + vector_topn_param, + }) } +} - async fn vector_index_topn_prune( +impl VectorIndexPruner { + pub async fn prune( &self, - vector_reader: VectorIndexReader, - limit: usize, metas: Vec<(BlockMetaIndex, Arc)>, ) -> Result)>> { - let pruning_runtime = &self.pruning_ctx.pruning_runtime; - let pruning_semaphore = &self.pruning_ctx.pruning_semaphore; + if let Some(param) = &self.vector_topn_param { + let start = Instant::now(); + // Perf. + { + let block_size = metas.iter().map(|(_, m)| m.block_size).sum(); + metrics_inc_blocks_vector_index_pruning_before(metas.len() as u64); + metrics_inc_bytes_block_vector_index_pruning_before(block_size); + self.pruning_ctx + .pruning_stats + .set_blocks_vector_index_pruning_before(metas.len() as u64); + } + // If there are no filter conditions and sort is in ascending order, + // we can use the HNSW index to get the results. + // Otherwise, we need to calculate all the scores and then filter them + // by conditions or sort them in descending order to get the results. + let pruned_metas = if !param.has_filter && param.asc { + self.vector_index_hnsw_topn_prune(param.limit, metas) + .await? + } else { + self.vector_index_topn_prune(¶m.filter_expr, param.asc, param.limit, metas) + .await? + }; + + let elapsed = start.elapsed().as_millis() as u64; + // Perf. + { + let block_size = pruned_metas.iter().map(|(_, m)| m.block_size).sum(); + metrics_inc_blocks_vector_index_pruning_after(pruned_metas.len() as u64); + metrics_inc_bytes_block_vector_index_pruning_after(block_size); + self.pruning_ctx + .pruning_stats + .set_blocks_vector_index_pruning_after(pruned_metas.len() as u64); + metrics_inc_block_vector_index_pruning_milliseconds(elapsed); + } + if !param.has_filter && param.asc { + info!("[FUSE-PRUNER] Vector index hnsw topn prune elapsed: {elapsed}"); + } else { + info!("[FUSE-PRUNER] Vector index calculate score topn prune elapsed: {elapsed}"); + } - let start = Instant::now(); - // Perf. 
- { - let block_size = metas.iter().map(|(_, m)| m.block_size).sum(); - metrics_inc_blocks_vector_index_pruning_before(metas.len() as u64); - metrics_inc_bytes_block_vector_index_pruning_before(block_size); - self.pruning_ctx - .pruning_stats - .set_blocks_vector_index_pruning_before(metas.len() as u64); + return Ok(pruned_metas); } - let mut block_meta_indexes = metas.into_iter().enumerate(); - let pruning_tasks = std::iter::from_fn(move || { - block_meta_indexes - .next() - .map(|(index, (block_meta_index, block_meta))| { - let vector_reader = vector_reader.clone(); - let index_name = self.vector_index.index_name.clone(); - - let v: VectorPruningFuture = Box::new(move |permit: OwnedSemaphorePermit| { - Box::pin(async move { - let _permit = permit; - - let Some(location) = &block_meta.vector_index_location else { - return Err(ErrorCode::StorageUnavailable(format!( - "vector index {} file don't exist, need refresh", - index_name - ))); - }; - - let row_count = block_meta.row_count as usize; - let score_offsets = - vector_reader.prune(limit, row_count, &location.0).await?; - - let mut vector_scores = Vec::with_capacity(score_offsets.len()); - for score_offset in score_offsets { - let vector_score = VectorScore { - index, - row_idx: score_offset.idx, - score: F32::from(score_offset.score), - }; - vector_scores.push(vector_score); - } - - Ok(VectorPruneResult { - block_idx: index, - scores: vector_scores, - block_meta_index, - block_meta, - }) - }) - }); - v - }) - }); + // Unable to do prune, fallback to only calculating the score + self.vector_index_scores(metas).await + } - let join_handlers = pruning_runtime - .try_spawn_batch_with_owned_semaphore(pruning_semaphore.clone(), pruning_tasks) + async fn vector_index_hnsw_topn_prune( + &self, + limit: usize, + metas: Vec<(BlockMetaIndex, Arc)>, + ) -> Result)>> { + let results = self + .process_vector_pruning_tasks(metas, move |vector_reader, row_count, location| { + let limit = limit; + async move { vector_reader.prune(limit, row_count, &location).await } + }) .await?; - let joint = future::try_join_all(join_handlers) - .await - .map_err(|e| ErrorCode::StorageOther(format!("vector topn pruning failure, {}", e)))?; - let mut top_queue = FixedLengthPriorityQueue::new(limit); - let mut vector_prune_result_map = HashMap::with_capacity(joint.len()); - for vector_prune_result in joint { - let vector_prune_result = vector_prune_result?; - + let len = results.len(); + let mut vector_prune_result_map = HashMap::with_capacity(len); + for vector_prune_result in results { for vector_score in &vector_prune_result.scores { top_queue.push(vector_score.clone()); } @@ -227,7 +241,6 @@ impl VectorIndexPruner { let top_indexes: HashSet = top_scores.iter().map(|s| s.index).collect(); let mut pruned_metas = Vec::with_capacity(top_indexes.len()); - let len = vector_prune_result_map.len(); for index in 0..len { if !top_indexes.contains(&index) { continue; @@ -246,39 +259,164 @@ impl VectorIndexPruner { pruned_metas.push((block_meta_index, vector_prune_result.block_meta)); } + Ok(pruned_metas) + } + + async fn vector_index_topn_prune( + &self, + filter_expr: &Option, + asc: bool, + limit: usize, + metas: Vec<(BlockMetaIndex, Arc)>, + ) -> Result)>> { + let results = self + .process_vector_pruning_tasks( + metas, + move |vector_reader, row_count, location| async move { + vector_reader.generate_scores(row_count, &location).await + }, + ) + .await?; + + let mut top_queue = FixedLengthPriorityQueue::new(limit); + let len = results.len(); + let mut 
vector_prune_result_map = HashMap::with_capacity(len); + for vector_prune_result in results { + if let Some(filter_expr) = filter_expr { + // If has filter expr, use scores to build a block and do filtering. + let num_rows = vector_prune_result.block_meta.row_count as usize; + let mut builder = Vec::with_capacity(num_rows); + for score in &vector_prune_result.scores { + builder.push(F32::from(score.score)); + } + let column = Column::Number(NumberColumn::Float32(Buffer::from(builder))); + let block = DataBlock::new(vec![BlockEntry::from(column)], num_rows); + let evaluator = Evaluator::new(&block, &self.func_ctx, &BUILTIN_FUNCTIONS); + let res = evaluator.run(filter_expr)?; + let res_column = res.into_full_column(filter_expr.data_type(), num_rows); + let res_column = res_column.remove_nullable(); + let bitmap = res_column.as_boolean().unwrap(); + // All the scores do not meet the conditions, ignore this block. + if bitmap.null_count() == num_rows { + continue; + } + + if !asc { + for (i, vector_score) in vector_prune_result.scores.iter().enumerate() { + if bitmap.get_bit(i) { + // If asc is false, we want to keep the largest scores, + // modify the score to reverse the ordering + let modified_score = vector_score.negative_score(); + top_queue.push(modified_score); + } + } + } else { + for (i, vector_score) in vector_prune_result.scores.iter().enumerate() { + if bitmap.get_bit(i) { + top_queue.push(vector_score.clone()); + } + } + } + } else if !asc { + for vector_score in vector_prune_result.scores.iter() { + let modified_score = vector_score.negative_score(); + top_queue.push(modified_score); + } + } else { + for vector_score in vector_prune_result.scores.iter() { + top_queue.push(vector_score.clone()); + } + } + + vector_prune_result_map.insert(vector_prune_result.block_idx, vector_prune_result); + } + + let top_scores = top_queue.into_sorted_vec(); + let top_indexes: HashSet = top_scores.iter().map(|s| s.index).collect(); + + let mut pruned_metas = Vec::with_capacity(top_indexes.len()); + for index in 0..len { + if !top_indexes.contains(&index) { + continue; + } + let vector_prune_result = vector_prune_result_map.remove(&index).unwrap(); + + let mut vector_scores = Vec::with_capacity(vector_prune_result.scores.len()); + for vector_score in &vector_prune_result.scores { + vector_scores.push((vector_score.row_idx as usize, vector_score.score)); + } + let mut block_meta_index = vector_prune_result.block_meta_index; + block_meta_index.vector_scores = Some(vector_scores); + + pruned_metas.push((block_meta_index, vector_prune_result.block_meta)); + } + + Ok(pruned_metas) + } + + async fn vector_index_scores( + &self, + metas: Vec<(BlockMetaIndex, Arc)>, + ) -> Result)>> { + let start = Instant::now(); + + let results = self + .process_vector_pruning_tasks(metas, |vector_reader, row_count, location| async move { + vector_reader.generate_scores(row_count, &location).await + }) + .await?; + + let mut vector_prune_result_map = HashMap::with_capacity(results.len()); + for vector_prune_result in results { + vector_prune_result_map.insert(vector_prune_result.block_idx, vector_prune_result); + } + + let len = vector_prune_result_map.len(); + let mut new_metas = Vec::with_capacity(len); + for index in 0..len { + let vector_prune_result = vector_prune_result_map.remove(&index).unwrap(); + + let mut vector_scores = Vec::with_capacity(vector_prune_result.scores.len()); + for vector_score in &vector_prune_result.scores { + vector_scores.push((vector_score.row_idx as usize, vector_score.score)); + } + let 
mut block_meta_index = vector_prune_result.block_meta_index; + block_meta_index.vector_scores = Some(vector_scores); + + new_metas.push((block_meta_index, vector_prune_result.block_meta)); + } + let elapsed = start.elapsed().as_millis() as u64; // Perf. { - let block_size = pruned_metas.iter().map(|(_, m)| m.block_size).sum(); - metrics_inc_blocks_vector_index_pruning_after(pruned_metas.len() as u64); - metrics_inc_bytes_block_vector_index_pruning_after(block_size); - self.pruning_ctx - .pruning_stats - .set_blocks_vector_index_pruning_after(pruned_metas.len() as u64); metrics_inc_block_vector_index_pruning_milliseconds(elapsed); } - info!("[FUSE-PRUNER] Vector index topn prune elapsed: {elapsed}"); + info!("[FUSE-PRUNER] Vector index calculate score elapsed: {elapsed}"); - Ok(pruned_metas) + Ok(new_metas) } - async fn vector_index_prune( + // Helper function to process vector pruning tasks with different vector reader operations + async fn process_vector_pruning_tasks( &self, - vector_reader: VectorIndexReader, metas: Vec<(BlockMetaIndex, Arc)>, - ) -> Result)>> { - // can't use vector index topn to prune, only generate vector scores. + vector_reader_op: F, + ) -> Result> + where + F: Fn(VectorIndexReader, usize, String) -> Fut + Clone + Send + 'static, + Fut: Future>> + Send, + { let pruning_runtime = &self.pruning_ctx.pruning_runtime; let pruning_semaphore = &self.pruning_ctx.pruning_semaphore; - let start = Instant::now(); let mut block_meta_indexes = metas.into_iter().enumerate(); let pruning_tasks = std::iter::from_fn(move || { block_meta_indexes .next() .map(|(index, (block_meta_index, block_meta))| { - let vector_reader = vector_reader.clone(); + let vector_reader = self.vector_reader.clone(); let index_name = self.vector_index.index_name.clone(); + let vector_reader_op = vector_reader_op.clone(); let v: VectorPruningFuture = Box::new(move |permit: OwnedSemaphorePermit| { Box::pin(async move { @@ -290,10 +428,11 @@ impl VectorIndexPruner { index_name ))); }; + let row_count = block_meta.row_count as usize; - let score_offsets = vector_reader - .generate_scores(row_count, &location.0) - .await?; + let score_offsets = + vector_reader_op(vector_reader, row_count, location.0.clone()) + .await?; let mut vector_scores = Vec::with_capacity(score_offsets.len()); for score_offset in score_offsets { @@ -323,41 +462,18 @@ impl VectorIndexPruner { let joint = future::try_join_all(join_handlers) .await - .map_err(|e| ErrorCode::StorageOther(format!("vector pruning failure, {}", e)))?; + .map_err(|e| ErrorCode::StorageOther(format!("vector pruning failure: {}", e)))?; - let mut vector_prune_result_map = HashMap::with_capacity(joint.len()); - for vector_prune_result in joint { - let vector_prune_result = vector_prune_result?; - vector_prune_result_map.insert(vector_prune_result.block_idx, vector_prune_result); - } - - let len = vector_prune_result_map.len(); - let mut new_metas = Vec::with_capacity(len); - for index in 0..len { - let vector_prune_result = vector_prune_result_map.remove(&index).unwrap(); - let mut vector_scores = - Vec::with_capacity(vector_prune_result.block_meta.row_count as usize); - for score in &vector_prune_result.scores { - vector_scores.push((score.row_idx as usize, score.score)); - } - let mut block_meta_index = vector_prune_result.block_meta_index; - block_meta_index.vector_scores = Some(vector_scores); - - new_metas.push((block_meta_index, vector_prune_result.block_meta)); + let mut results = Vec::with_capacity(joint.len()); + for result in joint { + 
results.push(result?); } - let elapsed = start.elapsed().as_millis() as u64; - // Perf. - { - metrics_inc_block_vector_index_pruning_milliseconds(elapsed); - } - info!("[FUSE-PRUNER] Vector index prune elapsed: {elapsed}"); - - Ok(new_metas) + Ok(results) } } -// result of block pruning +// result of vector index block pruning struct VectorPruneResult { // the block index in segment block_idx: usize, @@ -385,3 +501,14 @@ impl PartialOrd for VectorScore { Some(self.cmp(other)) } } + +impl VectorScore { + // Create a modified vector score with negated score to reverse ordering + fn negative_score(&self) -> Self { + Self { + index: self.index, + row_idx: self.row_idx, + score: -self.score, + } + } +} diff --git a/tests/sqllogictests/suites/ee/09_ee_vector_index/09_0000_vector_index_base.test b/tests/sqllogictests/suites/ee/09_ee_vector_index/09_0000_vector_index_base.test index 829096a84b9bc..e7561c114a0a9 100644 --- a/tests/sqllogictests/suites/ee/09_ee_vector_index/09_0000_vector_index_base.test +++ b/tests/sqllogictests/suites/ee/09_ee_vector_index/09_0000_vector_index_base.test @@ -131,6 +131,64 @@ RowFetch ├── push downs: [filters: [], limit: 5] └── estimated rows: 16.00 +query T +EXPLAIN SELECT id, cosine_distance(embedding, [0.50515236, 0.8561939, 0.87169914, 0.55843271, 0.73689797, 0.49985862, 0.64527255, 0.29313098]::vector(8)) AS similarity FROM t ORDER BY similarity DESC LIMIT 3; +---- +RowFetch +├── output columns: [t._vector_score (#2), t._row_id (#3), t.id (#0)] +├── columns to fetch: [id] +├── estimated rows: 3.00 +└── Limit + ├── output columns: [t._vector_score (#2), t._row_id (#3)] + ├── limit: 3 + ├── offset: 0 + ├── estimated rows: 3.00 + └── Sort(Single) + ├── output columns: [t._vector_score (#2), t._row_id (#3)] + ├── sort keys: [_vector_score DESC NULLS LAST] + ├── estimated rows: 16.00 + └── TableScan + ├── table: default.test_vector_index.t + ├── output columns: [_vector_score (#2), _row_id (#3)] + ├── read rows: 8 + ├── read size: 0 + ├── partitions total: 4 + ├── partitions scanned: 2 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [], limit: 3] + └── estimated rows: 16.00 + +query T +EXPLAIN SELECT id, cosine_distance(embedding, [0.50515236, 0.8561939, 0.87169914, 0.55843271, 0.73689797, 0.49985862, 0.64527255, 0.29313098]::vector(8)) AS similarity FROM t WHERE similarity > 0.1 ORDER BY similarity ASC LIMIT 3; +---- +RowFetch +├── output columns: [t._vector_score (#2), t._row_id (#3), t.id (#0)] +├── columns to fetch: [id] +├── estimated rows: 0.01 +└── Limit + ├── output columns: [t._vector_score (#2), t._row_id (#3)] + ├── limit: 3 + ├── offset: 0 + ├── estimated rows: 0.01 + └── Sort(Single) + ├── output columns: [t._vector_score (#2), t._row_id (#3)] + ├── sort keys: [_vector_score ASC NULLS LAST] + ├── estimated rows: 0.01 + └── Filter + ├── output columns: [t._vector_score (#2), t._row_id (#3)] + ├── filters: [t._vector_score (#2) > 0.1] + ├── estimated rows: 0.01 + └── TableScan + ├── table: default.test_vector_index.t + ├── output columns: [_vector_score (#2), _row_id (#3)] + ├── read rows: 12 + ├── read size: 0 + ├── partitions total: 4 + ├── partitions scanned: 3 + ├── pruning stats: [segments: , blocks: ] + ├── push downs: [filters: [t._vector_score (#2) > 0.1], limit: 3] + └── estimated rows: 16.00 + query IF SELECT id, cosine_distance(embedding, [0.50515236, 0.8561939, 0.87169914, 0.55843271, 0.73689797, 0.49985862, 0.64527255, 0.29313098]::vector(8)) AS similarity FROM t ORDER BY similarity ASC LIMIT 5; ---- @@ -241,24 +299,32 
@@ SELECT id, l2_distance(embedding, [0.02559146, 0.38549544, 0.77889671, 0.3159103 query IF -SELECT id, cosine_distance(embedding, [0.50515236, 0.8561939, 0.87169914, 0.55843271, 0.73689797, 0.49985862, 0.64527255, 0.29313098]::vector(8)) AS similarity FROM t ORDER BY similarity DESC; +SELECT id, cosine_distance(embedding, [0.50515236, 0.8561939, 0.87169914, 0.55843271, 0.73689797, 0.49985862, 0.64527255, 0.29313098]::vector(8)) AS similarity FROM t ORDER BY similarity DESC LIMIT 3; ---- 9 0.2568838 16 0.25626028 13 0.24121934 -2 0.2268933 -14 0.21996021 -5 0.17328858 -4 0.16786504 -6 0.1645267 -7 0.15616316 -15 0.150944 -3 0.14645952 -8 0.14554787 + +query IF +SELECT id, cosine_distance(embedding, [0.50515236, 0.8561939, 0.87169914, 0.55843271, 0.73689797, 0.49985862, 0.64527255, 0.29313098]::vector(8)) AS similarity FROM t1 ORDER BY similarity DESC LIMIT 3; +---- +9 0.286018 +16 0.24571353 +13 0.23111635 + +query IF +SELECT id, cosine_distance(embedding, [0.50515236, 0.8561939, 0.87169914, 0.55843271, 0.73689797, 0.49985862, 0.64527255, 0.29313098]::vector(8)) AS similarity FROM t WHERE similarity > 0.1 ORDER BY similarity ASC LIMIT 3; +---- 11 0.14048636 -12 0.060161233 -10 0.033747792 -1 0.009774268 +8 0.14554787 +3 0.14645952 + +query IF +SELECT id, cosine_distance(embedding, [0.50515236, 0.8561939, 0.87169914, 0.55843271, 0.73689797, 0.49985862, 0.64527255, 0.29313098]::vector(8)) AS similarity FROM t1 WHERE similarity > 0.1 ORDER BY similarity ASC LIMIT 3; +---- +8 0.13477594 +3 0.13801938 +15 0.13881427 statement ok use default
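Illustration (not part of the patch): the descending-order path in `vector_index_topn_prune` pushes `VectorScore::negative_score()` values into the top-N queue, so the same "keep the N best" machinery used for ascending distances also serves `ORDER BY similarity DESC`. Below is a minimal, self-contained sketch of that score-negation trick. It is an assumption-laden stand-in: it sorts a plain `Vec<f32>` instead of using the internal `FixedLengthPriorityQueue`, and the names `top_n_smallest` and `top_n_largest` are hypothetical.

// Sketch of the score-negation trick: reuse a "keep the N smallest" routine
// for descending order by negating scores on the way in and negating them
// back on the way out, mirroring what VectorScore::negative_score enables.
fn top_n_smallest(mut scores: Vec<f32>, n: usize) -> Vec<f32> {
    // Ascending path: the smallest distances are the best matches.
    scores.sort_by(f32::total_cmp);
    scores.truncate(n);
    scores
}

fn top_n_largest(scores: Vec<f32>, n: usize) -> Vec<f32> {
    // Descending path: negate, keep the smallest, then negate back.
    let negated: Vec<f32> = scores.into_iter().map(|s| -s).collect();
    top_n_smallest(negated, n).into_iter().map(|s| -s).collect()
}

fn main() {
    let scores = vec![0.3, 2.5, 1.7, 0.9, 3.1];
    assert_eq!(top_n_smallest(scores.clone(), 2), vec![0.3, 0.9]);
    assert_eq!(top_n_largest(scores, 2), vec![3.1, 2.5]);
}

Because only the ordering needs to flip, `negative_score` in the patch negates the `score` field and leaves `index` and `row_idx` untouched, so the pruner can still map the retained entries back to their blocks and rows.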