diff --git a/be/src/vec/functions/function_search.cpp b/be/src/vec/functions/function_search.cpp index 6fd7da39208f8f..4a4a397e8d80b7 100644 --- a/be/src/vec/functions/function_search.cpp +++ b/be/src/vec/functions/function_search.cpp @@ -317,7 +317,8 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( // Aligned with FE QsClauseType enum - uses enum.name() as clause_type FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category( const std::string& clause_type) const { - if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT") { + if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" || + clause_type == "OCCUR_BOOLEAN") { return ClauseTypeCategory::COMPOUND; } else if (clause_type == "TERM" || clause_type == "PREFIX" || clause_type == "WILDCARD" || clause_type == "REGEXP" || clause_type == "RANGE" || clause_type == "LIST" || @@ -377,6 +378,7 @@ InvertedIndexQueryType FunctionSearch::clause_type_to_query_type( {"AND", InvertedIndexQueryType::BOOLEAN_QUERY}, {"OR", InvertedIndexQueryType::BOOLEAN_QUERY}, {"NOT", InvertedIndexQueryType::BOOLEAN_QUERY}, + {"OCCUR_BOOLEAN", InvertedIndexQueryType::BOOLEAN_QUERY}, // Non-tokenized queries (exact matching, pattern matching) {"TERM", InvertedIndexQueryType::EQUAL_QUERY}, @@ -406,6 +408,20 @@ InvertedIndexQueryType FunctionSearch::clause_type_to_query_type( return InvertedIndexQueryType::EQUAL_QUERY; } +// Map Thrift TSearchOccur to query_v2::Occur +static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) { + switch (thrift_occur) { + case TSearchOccur::MUST: + return query_v2::Occur::MUST; + case TSearchOccur::SHOULD: + return query_v2::Occur::SHOULD; + case TSearchOccur::MUST_NOT: + return query_v2::Occur::MUST_NOT; + default: + return query_v2::Occur::MUST; + } +} + Status FunctionSearch::build_query_recursive(const TSearchClause& clause, const std::shared_ptr& context, FieldReaderResolver& resolver, @@ -418,6 +434,38 @@ Status 
FunctionSearch::build_query_recursive(const TSearchClause& clause, } const std::string& clause_type = clause.clause_type; + + // Handle OCCUR_BOOLEAN - Lucene-style boolean query with MUST/SHOULD/MUST_NOT + if (clause_type == "OCCUR_BOOLEAN") { + auto builder = segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder(); + + // Set minimum_should_match if specified + if (clause.__isset.minimum_should_match) { + builder->set_minimum_number_should_match(clause.minimum_should_match); + } + + if (clause.__isset.children) { + for (const auto& child_clause : clause.children) { + query_v2::QueryPtr child_query; + std::string child_binding_key; + RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, + &child_binding_key)); + + // Determine occur type from child clause + query_v2::Occur occur = query_v2::Occur::MUST; // default + if (child_clause.__isset.occur) { + occur = map_thrift_occur(child_clause.occur); + } + + builder->add(child_query, occur); + } + } + + *out = builder->build(); + return Status::OK(); + } + + // Handle standard boolean operators (AND/OR/NOT) if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT") { query_v2::OperatorType op = query_v2::OperatorType::OP_AND; if (clause_type == "OR") { diff --git a/be/test/vec/function/function_search_test.cpp b/be/test/vec/function/function_search_test.cpp index a4f53068d6f781..64b64b0d667b19 100644 --- a/be/test/vec/function/function_search_test.cpp +++ b/be/test/vec/function/function_search_test.cpp @@ -1760,4 +1760,445 @@ TEST_F(FunctionSearchTest, TestMultiPhraseQueryCase) { ASSERT_NE(multi_phrase_weight, nullptr); } +// ============== Lucene Mode (OCCUR_BOOLEAN) Tests ============== + +TEST_F(FunctionSearchTest, TestOccurBooleanClauseTypeCategory) { + // Test that OCCUR_BOOLEAN is classified as COMPOUND + EXPECT_EQ(FunctionSearch::ClauseTypeCategory::COMPOUND, + function_search->get_clause_type_category("OCCUR_BOOLEAN")); +} + 
+TEST_F(FunctionSearchTest, TestOccurBooleanQueryType) { + // Test that OCCUR_BOOLEAN maps to BOOLEAN_QUERY + EXPECT_EQ(segment_v2::InvertedIndexQueryType::BOOLEAN_QUERY, + function_search->clause_type_to_query_type("OCCUR_BOOLEAN")); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanSearchParam) { + // Test creating OCCUR_BOOLEAN search param (Lucene mode) + TSearchParam searchParam; + searchParam.original_dsl = "field:a AND field:b OR field:c"; + + // Create child clauses with occur types + TSearchClause mustClause1; + mustClause1.clause_type = "TERM"; + mustClause1.field_name = "field"; + mustClause1.value = "a"; + mustClause1.__isset.field_name = true; + mustClause1.__isset.value = true; + mustClause1.occur = TSearchOccur::MUST; + mustClause1.__isset.occur = true; + + TSearchClause mustClause2; + mustClause2.clause_type = "TERM"; + mustClause2.field_name = "field"; + mustClause2.value = "b"; + mustClause2.__isset.field_name = true; + mustClause2.__isset.value = true; + mustClause2.occur = TSearchOccur::MUST; + mustClause2.__isset.occur = true; + + // Create root OCCUR_BOOLEAN clause + TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {mustClause1, mustClause2}; + rootClause.__isset.children = true; + rootClause.minimum_should_match = 0; + rootClause.__isset.minimum_should_match = true; + searchParam.root = rootClause; + + // Verify structure + EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type); + EXPECT_EQ(2, searchParam.root.children.size()); + EXPECT_EQ(TSearchOccur::MUST, searchParam.root.children[0].occur); + EXPECT_EQ(TSearchOccur::MUST, searchParam.root.children[1].occur); + EXPECT_EQ(0, searchParam.root.minimum_should_match); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanWithMustNotClause) { + // Test OCCUR_BOOLEAN with MUST_NOT (NOT operator in Lucene mode) + TSearchParam searchParam; + searchParam.original_dsl = "NOT field:a"; + + TSearchClause mustNotClause; + mustNotClause.clause_type = "TERM"; + 
mustNotClause.field_name = "field"; + mustNotClause.value = "a"; + mustNotClause.__isset.field_name = true; + mustNotClause.__isset.value = true; + mustNotClause.occur = TSearchOccur::MUST_NOT; + mustNotClause.__isset.occur = true; + + TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {mustNotClause}; + rootClause.__isset.children = true; + searchParam.root = rootClause; + + // Verify structure + EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type); + EXPECT_EQ(1, searchParam.root.children.size()); + EXPECT_EQ(TSearchOccur::MUST_NOT, searchParam.root.children[0].occur); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanWithShouldClauses) { + // Test OCCUR_BOOLEAN with SHOULD clauses (OR in Lucene mode) + TSearchParam searchParam; + searchParam.original_dsl = "field:a OR field:b"; + + TSearchClause shouldClause1; + shouldClause1.clause_type = "TERM"; + shouldClause1.field_name = "field"; + shouldClause1.value = "a"; + shouldClause1.__isset.field_name = true; + shouldClause1.__isset.value = true; + shouldClause1.occur = TSearchOccur::SHOULD; + shouldClause1.__isset.occur = true; + + TSearchClause shouldClause2; + shouldClause2.clause_type = "TERM"; + shouldClause2.field_name = "field"; + shouldClause2.value = "b"; + shouldClause2.__isset.field_name = true; + shouldClause2.__isset.value = true; + shouldClause2.occur = TSearchOccur::SHOULD; + shouldClause2.__isset.occur = true; + + TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {shouldClause1, shouldClause2}; + rootClause.__isset.children = true; + rootClause.minimum_should_match = 1; + rootClause.__isset.minimum_should_match = true; + searchParam.root = rootClause; + + // Verify structure + EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type); + EXPECT_EQ(2, searchParam.root.children.size()); + EXPECT_EQ(TSearchOccur::SHOULD, searchParam.root.children[0].occur); + EXPECT_EQ(TSearchOccur::SHOULD, 
searchParam.root.children[1].occur); + EXPECT_EQ(1, searchParam.root.minimum_should_match); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanMixedOccurTypes) { + // Test OCCUR_BOOLEAN with mixed MUST, SHOULD, MUST_NOT (complex Lucene query) + // Example: +a +b c -d (a AND b, c is optional, NOT d) + TSearchParam searchParam; + searchParam.original_dsl = "field:a AND field:b OR field:c NOT field:d"; + + TSearchClause mustClause1; + mustClause1.clause_type = "TERM"; + mustClause1.field_name = "field"; + mustClause1.value = "a"; + mustClause1.__isset.field_name = true; + mustClause1.__isset.value = true; + mustClause1.occur = TSearchOccur::MUST; + mustClause1.__isset.occur = true; + + TSearchClause mustClause2; + mustClause2.clause_type = "TERM"; + mustClause2.field_name = "field"; + mustClause2.value = "b"; + mustClause2.__isset.field_name = true; + mustClause2.__isset.value = true; + mustClause2.occur = TSearchOccur::MUST; + mustClause2.__isset.occur = true; + + TSearchClause shouldClause; + shouldClause.clause_type = "TERM"; + shouldClause.field_name = "field"; + shouldClause.value = "c"; + shouldClause.__isset.field_name = true; + shouldClause.__isset.value = true; + shouldClause.occur = TSearchOccur::SHOULD; + shouldClause.__isset.occur = true; + + TSearchClause mustNotClause; + mustNotClause.clause_type = "TERM"; + mustNotClause.field_name = "field"; + mustNotClause.value = "d"; + mustNotClause.__isset.field_name = true; + mustNotClause.__isset.value = true; + mustNotClause.occur = TSearchOccur::MUST_NOT; + mustNotClause.__isset.occur = true; + + TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {mustClause1, mustClause2, shouldClause, mustNotClause}; + rootClause.__isset.children = true; + rootClause.minimum_should_match = 0; + rootClause.__isset.minimum_should_match = true; + searchParam.root = rootClause; + + // Verify structure + EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type); + EXPECT_EQ(4, 
searchParam.root.children.size()); + EXPECT_EQ(TSearchOccur::MUST, searchParam.root.children[0].occur); + EXPECT_EQ(TSearchOccur::MUST, searchParam.root.children[1].occur); + EXPECT_EQ(TSearchOccur::SHOULD, searchParam.root.children[2].occur); + EXPECT_EQ(TSearchOccur::MUST_NOT, searchParam.root.children[3].occur); + EXPECT_EQ(0, searchParam.root.minimum_should_match); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanMinimumShouldMatchZero) { + // Test that SHOULD clauses are effectively ignored when minimum_should_match=0 + // and MUST clauses exist + TSearchParam searchParam; + searchParam.original_dsl = "field:a AND field:b OR field:c"; + + TSearchClause mustClause1; + mustClause1.clause_type = "TERM"; + mustClause1.field_name = "field"; + mustClause1.value = "a"; + mustClause1.__isset.field_name = true; + mustClause1.__isset.value = true; + mustClause1.occur = TSearchOccur::MUST; + mustClause1.__isset.occur = true; + + TSearchClause mustClause2; + mustClause2.clause_type = "TERM"; + mustClause2.field_name = "field"; + mustClause2.value = "b"; + mustClause2.__isset.field_name = true; + mustClause2.__isset.value = true; + mustClause2.occur = TSearchOccur::MUST; + mustClause2.__isset.occur = true; + + // Note: In Lucene mode with minimum_should_match=0 and MUST clauses, + // SHOULD clauses are filtered out during FE parsing. + // So only MUST clauses should be present. 
+ TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {mustClause1, mustClause2}; + rootClause.__isset.children = true; + rootClause.minimum_should_match = 0; + rootClause.__isset.minimum_should_match = true; + searchParam.root = rootClause; + + // Verify structure + EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type); + EXPECT_EQ(2, searchParam.root.children.size()); + EXPECT_EQ(0, searchParam.root.minimum_should_match); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanMinimumShouldMatchOne) { + // Test that at least one SHOULD clause must match when minimum_should_match=1 + TSearchParam searchParam; + searchParam.original_dsl = "field:a OR field:b OR field:c"; + + TSearchClause shouldClause1; + shouldClause1.clause_type = "TERM"; + shouldClause1.field_name = "field"; + shouldClause1.value = "a"; + shouldClause1.__isset.field_name = true; + shouldClause1.__isset.value = true; + shouldClause1.occur = TSearchOccur::SHOULD; + shouldClause1.__isset.occur = true; + + TSearchClause shouldClause2; + shouldClause2.clause_type = "TERM"; + shouldClause2.field_name = "field"; + shouldClause2.value = "b"; + shouldClause2.__isset.field_name = true; + shouldClause2.__isset.value = true; + shouldClause2.occur = TSearchOccur::SHOULD; + shouldClause2.__isset.occur = true; + + TSearchClause shouldClause3; + shouldClause3.clause_type = "TERM"; + shouldClause3.field_name = "field"; + shouldClause3.value = "c"; + shouldClause3.__isset.field_name = true; + shouldClause3.__isset.value = true; + shouldClause3.occur = TSearchOccur::SHOULD; + shouldClause3.__isset.occur = true; + + TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {shouldClause1, shouldClause2, shouldClause3}; + rootClause.__isset.children = true; + rootClause.minimum_should_match = 1; + rootClause.__isset.minimum_should_match = true; + searchParam.root = rootClause; + + // Verify structure + EXPECT_EQ("OCCUR_BOOLEAN", 
searchParam.root.clause_type); + EXPECT_EQ(3, searchParam.root.children.size()); + EXPECT_EQ(1, searchParam.root.minimum_should_match); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanAnalyzeFieldQueryType) { + // Test field query type analysis for OCCUR_BOOLEAN + TSearchClause mustClause; + mustClause.clause_type = "TERM"; + mustClause.field_name = "title"; + mustClause.value = "hello"; + mustClause.__isset.field_name = true; + mustClause.__isset.value = true; + mustClause.occur = TSearchOccur::MUST; + mustClause.__isset.occur = true; + + TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {mustClause}; + rootClause.__isset.children = true; + + // Test field-specific query type analysis + auto title_query_type = function_search->analyze_field_query_type("title", rootClause); + EXPECT_EQ(segment_v2::InvertedIndexQueryType::EQUAL_QUERY, title_query_type); + + // Test field not in query + auto other_query_type = function_search->analyze_field_query_type("other_field", rootClause); + EXPECT_EQ(segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY, other_query_type); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanWithPhraseQuery) { + // Test OCCUR_BOOLEAN with PHRASE child clause + TSearchParam searchParam; + searchParam.original_dsl = "content:\"machine learning\" AND title:hello"; + + TSearchClause phraseClause; + phraseClause.clause_type = "PHRASE"; + phraseClause.field_name = "content"; + phraseClause.value = "machine learning"; + phraseClause.__isset.field_name = true; + phraseClause.__isset.value = true; + phraseClause.occur = TSearchOccur::MUST; + phraseClause.__isset.occur = true; + + TSearchClause termClause; + termClause.clause_type = "TERM"; + termClause.field_name = "title"; + termClause.value = "hello"; + termClause.__isset.field_name = true; + termClause.__isset.value = true; + termClause.occur = TSearchOccur::MUST; + termClause.__isset.occur = true; + + TSearchClause rootClause; + rootClause.clause_type = 
"OCCUR_BOOLEAN"; + rootClause.children = {phraseClause, termClause}; + rootClause.__isset.children = true; + searchParam.root = rootClause; + + // Verify structure + EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type); + EXPECT_EQ(2, searchParam.root.children.size()); + EXPECT_EQ("PHRASE", searchParam.root.children[0].clause_type); + EXPECT_EQ("TERM", searchParam.root.children[1].clause_type); + + // Test field-specific query type analysis + auto content_query_type = + function_search->analyze_field_query_type("content", searchParam.root); + EXPECT_EQ(segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, content_query_type); + + auto title_query_type = function_search->analyze_field_query_type("title", searchParam.root); + EXPECT_EQ(segment_v2::InvertedIndexQueryType::EQUAL_QUERY, title_query_type); +} + +TEST_F(FunctionSearchTest, TestOccurBooleanNestedQuery) { + // Test nested OCCUR_BOOLEAN query + TSearchParam searchParam; + searchParam.original_dsl = "(field:a AND field:b) OR field:c"; + + TSearchClause innerMust1; + innerMust1.clause_type = "TERM"; + innerMust1.field_name = "field"; + innerMust1.value = "a"; + innerMust1.__isset.field_name = true; + innerMust1.__isset.value = true; + innerMust1.occur = TSearchOccur::MUST; + innerMust1.__isset.occur = true; + + TSearchClause innerMust2; + innerMust2.clause_type = "TERM"; + innerMust2.field_name = "field"; + innerMust2.value = "b"; + innerMust2.__isset.field_name = true; + innerMust2.__isset.value = true; + innerMust2.occur = TSearchOccur::MUST; + innerMust2.__isset.occur = true; + + TSearchClause innerOccurBoolean; + innerOccurBoolean.clause_type = "OCCUR_BOOLEAN"; + innerOccurBoolean.children = {innerMust1, innerMust2}; + innerOccurBoolean.__isset.children = true; + innerOccurBoolean.occur = TSearchOccur::SHOULD; + innerOccurBoolean.__isset.occur = true; + + TSearchClause shouldClause; + shouldClause.clause_type = "TERM"; + shouldClause.field_name = "field"; + shouldClause.value = "c"; + 
shouldClause.__isset.field_name = true; + shouldClause.__isset.value = true; + shouldClause.occur = TSearchOccur::SHOULD; + shouldClause.__isset.occur = true; + + TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {innerOccurBoolean, shouldClause}; + rootClause.__isset.children = true; + rootClause.minimum_should_match = 1; + rootClause.__isset.minimum_should_match = true; + searchParam.root = rootClause; + + // Verify structure + EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.clause_type); + EXPECT_EQ(2, searchParam.root.children.size()); + EXPECT_EQ("OCCUR_BOOLEAN", searchParam.root.children[0].clause_type); + EXPECT_EQ("TERM", searchParam.root.children[1].clause_type); + EXPECT_EQ(1, searchParam.root.minimum_should_match); +} + +TEST_F(FunctionSearchTest, TestEvaluateInvertedIndexWithOccurBoolean) { + // Test evaluate_inverted_index_with_search_param with OCCUR_BOOLEAN + TSearchParam search_param; + search_param.original_dsl = "title:hello AND content:world"; + + TSearchClause mustClause1; + mustClause1.clause_type = "TERM"; + mustClause1.field_name = "title"; + mustClause1.value = "hello"; + mustClause1.__isset.field_name = true; + mustClause1.__isset.value = true; + mustClause1.occur = TSearchOccur::MUST; + mustClause1.__isset.occur = true; + + TSearchClause mustClause2; + mustClause2.clause_type = "TERM"; + mustClause2.field_name = "content"; + mustClause2.value = "world"; + mustClause2.__isset.field_name = true; + mustClause2.__isset.value = true; + mustClause2.occur = TSearchOccur::MUST; + mustClause2.__isset.occur = true; + + TSearchClause rootClause; + rootClause.clause_type = "OCCUR_BOOLEAN"; + rootClause.children = {mustClause1, mustClause2}; + rootClause.__isset.children = true; + rootClause.minimum_should_match = 0; + rootClause.__isset.minimum_should_match = true; + search_param.root = rootClause; + + std::unordered_map data_types; + std::unordered_map iterators; + + // No real iterators - will fail but tests 
the code path + data_types["title"] = {"title", nullptr}; + data_types["content"] = {"content", nullptr}; + iterators["title"] = nullptr; + iterators["content"] = nullptr; + + uint32_t num_rows = 100; + InvertedIndexResultBitmap bitmap_result; + + auto status = function_search->evaluate_inverted_index_with_search_param( + search_param, data_types, iterators, num_rows, bitmap_result); + // Will return OK because root_query is nullptr (all child queries fail) + // EXPECT_TRUE(status.ok()); + EXPECT_TRUE(status.is()); +} + } // namespace doris::vectorized diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java index 7ed0fb4b5d2af8..8440d70b334750 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java @@ -26,6 +26,7 @@ import org.apache.doris.thrift.TExprNodeType; import org.apache.doris.thrift.TSearchClause; import org.apache.doris.thrift.TSearchFieldBinding; +import org.apache.doris.thrift.TSearchOccur; import org.apache.doris.thrift.TSearchParam; import org.apache.logging.log4j.LogManager; @@ -313,6 +314,16 @@ private TSearchClause convertQsNodeToThrift( clause.setValue(node.value); } + // Convert occur type for Lucene-style boolean queries + if (node.occur != null) { + clause.setOccur(convertQsOccurToThrift(node.occur)); + } + + // Convert minimum_should_match for OCCUR_BOOLEAN + if (node.minimumShouldMatch != null) { + clause.setMinimumShouldMatch(node.minimumShouldMatch); + } + if (node.children != null && !node.children.isEmpty()) { List childClauses = new ArrayList<>(); for (SearchDslParser.QsNode child : node.children) { @@ -324,6 +335,19 @@ private TSearchClause convertQsNodeToThrift( return clause; } + private TSearchOccur convertQsOccurToThrift(SearchDslParser.QsOccur occur) { + switch (occur) { + case MUST: + return TSearchOccur.MUST; + case SHOULD: 
+ return TSearchOccur.SHOULD; + case MUST_NOT: + return TSearchOccur.MUST_NOT; + default: + return TSearchOccur.MUST; + } + } + // Getters public String getDslString() { return dslString; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Search.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Search.java index f89af38cc22208..3a98a6cbf052a8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Search.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Search.java @@ -34,10 +34,17 @@ * ScalarFunction 'search' - simplified architecture similar to MultiMatch. * Handles DSL parsing and generates SearchPredicate during translation. *

- * Supports 1-3 parameters: - * - search(dsl_string): Traditional usage - * - search(dsl_string, default_field): Simplified syntax with default field - * - search(dsl_string, default_field, default_operator): Full control over expansion + * Supports 1-2 parameters: + * - search(dsl_string): Traditional usage with field specified in DSL + * - search(dsl_string, options): With JSON options for configuration + *

+ * Options parameter (JSON format): + * - default_field: default field name when DSL doesn't specify field + * - default_operator: "and" or "or" for multi-term queries (default: "and") + * - mode: "standard" (default) or "lucene" (ES/Lucene-style boolean parsing) + * - minimum_should_match: integer for Lucene mode (default: 0 for filter context) + *

+ * Example options: '{"default_field":"title","mode":"lucene","minimum_should_match":0}' */ public class Search extends ScalarFunction implements ExplicitlyCastableSignature, AlwaysNotNullable { @@ -45,11 +52,8 @@ public class Search extends ScalarFunction public static final List SIGNATURES = ImmutableList.of( // Original signature: search(dsl_string) FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE), - // With default field: search(dsl_string, default_field) - FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE, StringType.INSTANCE), - // With default field and operator: search(dsl_string, default_field, default_operator) - FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE, StringType.INSTANCE, - StringType.INSTANCE) + // With options: search(dsl_string, options) + FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE, StringType.INSTANCE) ); public Search(Expression... varArgs) { @@ -62,8 +66,8 @@ private Search(ScalarFunctionParams functionParams) { @Override public Search withChildren(List children) { - Preconditions.checkArgument(children.size() >= 1 && children.size() <= 3, - "search() requires 1-3 arguments"); + Preconditions.checkArgument(children.size() >= 1 && children.size() <= 2, + "search() requires 1-2 arguments"); return new Search(getFunctionParams(children)); } @@ -89,31 +93,23 @@ public String getDslString() { } /** - * Get default field from second argument (optional) + * Get options JSON string from second argument (optional). 
+ * Options is a JSON string containing all configuration: + * - default_field: default field name when DSL doesn't specify field + * - default_operator: "and" or "or" for multi-term queries + * - mode: "standard" or "lucene" + * - minimum_should_match: integer for Lucene mode + * Example: '{"default_field":"title","mode":"lucene","minimum_should_match":0}' */ - public String getDefaultField() { + public String getOptionsJson() { if (children().size() < 2) { return null; } - Expression fieldArg = child(1); - if (fieldArg instanceof StringLikeLiteral) { - return ((StringLikeLiteral) fieldArg).getStringValue(); - } - return fieldArg.toString(); - } - - /** - * Get default operator from third argument (optional) - */ - public String getDefaultOperator() { - if (children().size() < 3) { - return null; - } - Expression operatorArg = child(2); - if (operatorArg instanceof StringLikeLiteral) { - return ((StringLikeLiteral) operatorArg).getStringValue(); + Expression optionsArg = child(1); + if (optionsArg instanceof StringLikeLiteral) { + return ((StringLikeLiteral) optionsArg).getStringValue(); } - return operatorArg.toString(); + return optionsArg.toString(); } /** @@ -122,7 +118,7 @@ public String getDefaultOperator() { */ public SearchDslParser.QsPlan getQsPlan() { // Lazy evaluation will be handled in SearchPredicate - return SearchDslParser.parseDsl(getDslString(), getDefaultField(), getDefaultOperator()); + return SearchDslParser.parseDsl(getDslString(), getOptionsJson()); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java index 8dfd9febb68536..b4c880546a700c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java @@ -61,11 +61,42 @@ public class SearchDslParser { * Parse DSL string and return intermediate representation */ public static QsPlan parseDsl(String dsl) { - return parseDsl(dsl, null, null); + return parseDsl(dsl, (String) null); } /** - * Parse DSL string with default field and operator support + * Parse DSL string with JSON options support. + * This is the primary method for the new 2-parameter search function signature. + * + * @param dsl DSL query string + * @param optionsJson JSON options string containing all configuration: + * - default_field: default field name when DSL doesn't specify field + * - default_operator: "and" or "or" for multi-term queries + * - mode: "standard" or "lucene" + * - minimum_should_match: integer for Lucene mode + * Example: '{"default_field":"title","mode":"lucene","minimum_should_match":0}' + * @return Parsed QsPlan + */ + public static QsPlan parseDsl(String dsl, String optionsJson) { + // Parse options from JSON + SearchOptions searchOptions = parseOptions(optionsJson); + + // Extract default_field and default_operator from options + String defaultField = searchOptions.getDefaultField(); + String defaultOperator = searchOptions.getDefaultOperator(); + + // Use Lucene mode parser if specified + if (searchOptions.isLuceneMode()) { + return parseDslLuceneMode(dsl, defaultField, defaultOperator, searchOptions); + } + + // Standard mode parsing + return parseDslStandardMode(dsl, defaultField, defaultOperator); + } + + /** + * Parse DSL string with default field and operator support (legacy method). + * Kept for backward compatibility. 
* * @param dsl DSL query string * @param defaultField Default field name when DSL doesn't specify field (optional) @@ -73,6 +104,13 @@ public static QsPlan parseDsl(String dsl) { * @return Parsed QsPlan */ public static QsPlan parseDsl(String dsl, String defaultField, String defaultOperator) { + return parseDslStandardMode(dsl, defaultField, defaultOperator); + } + + /** + * Standard mode parsing (original behavior) + */ + private static QsPlan parseDslStandardMode(String dsl, String defaultField, String defaultOperator) { if (dsl == null || dsl.trim().isEmpty()) { return new QsPlan(new QsNode(QsClauseType.TERM, "error", "empty_dsl"), new ArrayList<>()); } @@ -204,14 +242,19 @@ private static String expandSimplifiedDsl(String dsl, String defaultField, Strin } /** - * Check if DSL contains field references (has colon not in quoted strings) + * Check if DSL contains field references (has colon not in quoted strings or escaped) */ private static boolean containsFieldReference(String dsl) { boolean inQuotes = false; boolean inRegex = false; for (int i = 0; i < dsl.length(); i++) { char c = dsl.charAt(i); - if (c == '"' && (i == 0 || dsl.charAt(i - 1) != '\\')) { + // Handle escape sequences - skip the escaped character + if (c == '\\' && i + 1 < dsl.length()) { + i++; // Skip next character (it's escaped) + continue; + } + if (c == '"') { inQuotes = !inQuotes; } else if (c == '/' && !inQuotes) { inRegex = !inRegex; @@ -247,6 +290,7 @@ private static boolean containsExplicitOperators(String dsl) { /** * Add field prefix to expressions with explicit operators * Example: "foo AND bar" → "field:foo AND field:bar" + * Handles escape sequences properly (e.g., "First\ Value" stays as single term) */ private static String addFieldPrefixToOperatorExpression(String dsl, String defaultField) { StringBuilder result = new StringBuilder(); @@ -254,7 +298,7 @@ private static String addFieldPrefixToOperatorExpression(String dsl, String defa int i = 0; while (i < dsl.length()) { - // 
Skip whitespace + // Skip whitespace (but not escaped whitespace) while (i < dsl.length() && Character.isWhitespace(dsl.charAt(i))) { i++; } @@ -262,6 +306,14 @@ private static String addFieldPrefixToOperatorExpression(String dsl, String defa break; } + // Handle escape sequences - include both backslash and next char + if (dsl.charAt(i) == '\\' && i + 1 < dsl.length()) { + currentTerm.append(dsl.charAt(i)); + currentTerm.append(dsl.charAt(i + 1)); + i += 2; + continue; + } + // Try to match operators String remaining = dsl.substring(i); String upperRemaining = remaining.toUpperCase(); @@ -333,7 +385,7 @@ private static String addFieldPrefixToOperatorExpression(String dsl, String defa } /** - * Tokenize DSL into terms (split by whitespace, respecting quotes and functions) + * Tokenize DSL into terms (split by whitespace, respecting quotes, escapes, and functions) */ private static List tokenizeDsl(String dsl) { List terms = new ArrayList<>(); @@ -358,8 +410,13 @@ private static List tokenizeDsl(String dsl) { inParens = false; } currentTerm.append(c); + } else if (c == '\\' && i + 1 < dsl.length()) { + // Escape sequence - include both backslash and next char in term + currentTerm.append(c); + currentTerm.append(dsl.charAt(i + 1)); + i++; // Skip next character } else if (Character.isWhitespace(c) && !inQuotes && !inParens) { - // End of term + // End of term (only if not escaped - handled above) if (currentTerm.length() > 0) { terms.add(currentTerm.toString()); currentTerm = new StringBuilder(); @@ -379,6 +436,7 @@ private static List tokenizeDsl(String dsl) { /** * Check if a term contains wildcard characters (* or ?) + * Escaped wildcards (\* or \?) are not counted. 
*/ private static boolean containsWildcard(String term) { // Ignore wildcards in quoted strings or regex @@ -388,26 +446,48 @@ private static boolean containsWildcard(String term) { if (term.startsWith("/") && term.endsWith("/")) { return false; } - return term.contains("*") || term.contains("?"); + // Check for unescaped wildcards + for (int i = 0; i < term.length(); i++) { + char c = term.charAt(i); + if (c == '\\' && i + 1 < term.length()) { + // Skip escaped character + i++; + continue; + } + if (c == '*' || c == '?') { + return true; + } + } + return false; } /** * Clause types supported */ public enum QsClauseType { - TERM, // field:value - PHRASE, // field:"phrase search" - PREFIX, // field:prefix* - WILDCARD, // field:*wild*card* - REGEXP, // field:/pattern/ - RANGE, // field:[1 TO 10] or field:{1 TO 10} - LIST, // field:IN(value1 value2) - ANY, // field:ANY(value) - any match - ALL, // field:ALL(value) - all match - EXACT, // field:EXACT(value) - exact match without tokenization - AND, // clause1 AND clause2 - OR, // clause1 OR clause2 - NOT // NOT clause + TERM, // field:value + PHRASE, // field:"phrase search" + PREFIX, // field:prefix* + WILDCARD, // field:*wild*card* + REGEXP, // field:/pattern/ + RANGE, // field:[1 TO 10] or field:{1 TO 10} + LIST, // field:IN(value1 value2) + ANY, // field:ANY(value) - any match + ALL, // field:ALL(value) - all match + EXACT, // field:EXACT(value) - exact match without tokenization + AND, // clause1 AND clause2 (standard boolean algebra) + OR, // clause1 OR clause2 (standard boolean algebra) + NOT, // NOT clause (standard boolean algebra) + OCCUR_BOOLEAN // Lucene-style boolean query with MUST/SHOULD/MUST_NOT + } + + /** + * Occur type for Lucene-style boolean queries + */ + public enum QsOccur { + MUST, // Term must appear (equivalent to +term) + SHOULD, // Term should appear (optional) + MUST_NOT // Term must not appear (equivalent to -term) } /** @@ -601,15 +681,15 @@ public QsNode 
visitSearchValue(SearchParser.SearchValueContext ctx) { } private QsNode createTermNode(String fieldName, String value) { - return new QsNode(QsClauseType.TERM, fieldName, value); + return new QsNode(QsClauseType.TERM, fieldName, unescapeTermValue(value)); } private QsNode createPrefixNode(String fieldName, String value) { - return new QsNode(QsClauseType.PREFIX, fieldName, value); + return new QsNode(QsClauseType.PREFIX, fieldName, unescapeTermValue(value)); } private QsNode createWildcardNode(String fieldName, String value) { - return new QsNode(QsClauseType.WILDCARD, fieldName, value); + return new QsNode(QsClauseType.WILDCARD, fieldName, unescapeTermValue(value)); } private QsNode createRegexpNode(String fieldName, String regexpText) { @@ -780,15 +860,35 @@ public static class QsNode { @JsonProperty("children") public List children; + @JsonProperty("occur") + public QsOccur occur; + + @JsonProperty("minimumShouldMatch") + public Integer minimumShouldMatch; + + /** + * Constructor for JSON deserialization + * + * @param type the clause type + * @param field the field name + * @param value the field value + * @param children the child nodes + * @param occur the occurrence type + * @param minimumShouldMatch the minimum should match value + */ @JsonCreator public QsNode(@JsonProperty("type") QsClauseType type, @JsonProperty("field") String field, @JsonProperty("value") String value, - @JsonProperty("children") List children) { + @JsonProperty("children") List children, + @JsonProperty("occur") QsOccur occur, + @JsonProperty("minimumShouldMatch") Integer minimumShouldMatch) { this.type = type; this.field = field; this.value = value; this.children = children != null ? children : new ArrayList<>(); + this.occur = occur; + this.minimumShouldMatch = minimumShouldMatch; } public QsNode(QsClauseType type, String field, String value) { @@ -803,9 +903,20 @@ public QsNode(QsClauseType type, List children) { this.children = children != null ? 
children : new ArrayList<>(); } + public QsNode(QsClauseType type, List children, Integer minimumShouldMatch) { + this.type = type; + this.children = children != null ? children : new ArrayList<>(); + this.minimumShouldMatch = minimumShouldMatch; + } + + public QsNode withOccur(QsOccur occur) { + this.occur = occur; + return this; + } + @Override public int hashCode() { - return Objects.hash(type, field, value, children); + return Objects.hash(type, field, value, children, occur, minimumShouldMatch); } @Override @@ -820,7 +931,9 @@ public boolean equals(Object o) { return type == qsNode.type && Objects.equals(field, qsNode.field) && Objects.equals(value, qsNode.value) - && Objects.equals(children, qsNode.children); + && Objects.equals(children, qsNode.children) + && occur == qsNode.occur + && Objects.equals(minimumShouldMatch, qsNode.minimumShouldMatch); } } @@ -859,4 +972,598 @@ public boolean equals(Object o) { && Objects.equals(fieldName, that.fieldName); } } + + /** + * Search options parsed from JSON. 
+ * Supports all configuration in a single JSON object: + * - default_field: default field name when DSL doesn't specify field + * - default_operator: "and" or "or" for multi-term queries (default: "or") + * - mode: "standard" (default) or "lucene" (ES/Lucene-style boolean parsing) + * - minimum_should_match: integer for Lucene mode (default: 0 for filter context) + */ + public static class SearchOptions { + private String defaultField = null; + private String defaultOperator = null; + private String mode = "standard"; + private Integer minimumShouldMatch = null; + + public String getDefaultField() { + return defaultField; + } + + public void setDefaultField(String defaultField) { + this.defaultField = defaultField; + } + + public String getDefaultOperator() { + return defaultOperator; + } + + public void setDefaultOperator(String defaultOperator) { + this.defaultOperator = defaultOperator; + } + + public boolean isLuceneMode() { + return "lucene".equalsIgnoreCase(mode); + } + + public String getMode() { + return mode; + } + + public void setMode(String mode) { + this.mode = mode; + } + + public Integer getMinimumShouldMatch() { + return minimumShouldMatch; + } + + public void setMinimumShouldMatch(Integer minimumShouldMatch) { + this.minimumShouldMatch = minimumShouldMatch; + } + } + + /** + * Parse options JSON string. 
+ * Supports the following fields: + * - default_field: default field name when DSL doesn't specify field + * - default_operator: "and" or "or" for multi-term queries + * - mode: "standard" or "lucene" + * - minimum_should_match: integer for Lucene mode + */ + private static SearchOptions parseOptions(String optionsJson) { + SearchOptions options = new SearchOptions(); + if (optionsJson == null || optionsJson.trim().isEmpty()) { + return options; + } + + try { + // Parse JSON using Jackson + com.fasterxml.jackson.databind.JsonNode jsonNode = JSON_MAPPER.readTree(optionsJson); + + if (jsonNode.has("default_field")) { + options.setDefaultField(jsonNode.get("default_field").asText()); + } + if (jsonNode.has("default_operator")) { + options.setDefaultOperator(jsonNode.get("default_operator").asText()); + } + if (jsonNode.has("mode")) { + options.setMode(jsonNode.get("mode").asText()); + } + if (jsonNode.has("minimum_should_match")) { + options.setMinimumShouldMatch(jsonNode.get("minimum_should_match").asInt()); + } + } catch (Exception e) { + LOG.warn("Failed to parse search options JSON: {}", optionsJson, e); + } + + return options; + } + + /** + * Lucene mode parsing - implements ES/Lucene-style boolean query semantics. + *

+ * Key differences from standard mode: + * - Operators are processed left-to-right as modifiers (not traditional boolean algebra) + * - AND marks preceding and current terms as MUST (+) + * - OR marks preceding and current terms as SHOULD + * - NOT marks current term as MUST_NOT (-) + * - When minimum_should_match=0 and there are MUST clauses, SHOULD clauses are ignored + *

+ * Examples: + * - "a AND b OR c" → +a (with minimum_should_match=0, SHOULD terms discarded) + * - "a AND b OR NOT c AND d" → +a -c +d + * - "a OR b OR c" → a b c (at least one must match) + */ + private static QsPlan parseDslLuceneMode(String dsl, String defaultField, String defaultOperator, + SearchOptions options) { + if (dsl == null || dsl.trim().isEmpty()) { + return new QsPlan(new QsNode(QsClauseType.TERM, "error", "empty_dsl"), new ArrayList<>()); + } + + // Expand simplified DSL if default field is provided + String expandedDsl = dsl; + if (defaultField != null && !defaultField.trim().isEmpty()) { + expandedDsl = expandSimplifiedDsl(dsl.trim(), defaultField.trim(), + normalizeDefaultOperator(defaultOperator)); + } + + try { + // Create ANTLR lexer and parser + SearchLexer lexer = new SearchLexer(new ANTLRInputStream(expandedDsl)); + CommonTokenStream tokens = new CommonTokenStream(lexer); + SearchParser parser = new SearchParser(tokens); + + // Add error listener + parser.removeErrorListeners(); + parser.addErrorListener(new org.antlr.v4.runtime.BaseErrorListener() { + @Override + public void syntaxError(org.antlr.v4.runtime.Recognizer recognizer, + Object offendingSymbol, + int line, int charPositionInLine, + String msg, org.antlr.v4.runtime.RecognitionException e) { + throw new RuntimeException("Invalid search DSL syntax at line " + line + + ":" + charPositionInLine + " " + msg); + } + }); + + // Parse using standard parser first + ParseTree tree = parser.search(); + if (tree == null) { + throw new RuntimeException("Invalid search DSL syntax"); + } + + // Build AST using Lucene-mode visitor + QsLuceneModeAstBuilder visitor = new QsLuceneModeAstBuilder(options); + QsNode root = visitor.visit(tree); + + // Extract field bindings + Set fieldNames = visitor.getFieldNames(); + List bindings = new ArrayList<>(); + int slotIndex = 0; + for (String fieldName : fieldNames) { + bindings.add(new QsFieldBinding(fieldName, slotIndex++)); + } + + return new 
QsPlan(root, bindings); + + } catch (Exception e) { + LOG.error("Failed to parse search DSL in Lucene mode: '{}' (expanded: '{}')", dsl, expandedDsl, e); + throw new RuntimeException("Invalid search DSL syntax: " + dsl + ". Error: " + e.getMessage(), e); + } + } + + /** + * ANTLR visitor for Lucene-mode AST building. + * Transforms standard boolean expressions into Lucene-style OCCUR_BOOLEAN queries. + */ + private static class QsLuceneModeAstBuilder extends SearchParserBaseVisitor { + private final Set fieldNames = new HashSet<>(); + private final SearchOptions options; + private String currentFieldName = null; + + public QsLuceneModeAstBuilder(SearchOptions options) { + this.options = options; + } + + public Set getFieldNames() { + return fieldNames; + } + + @Override + public QsNode visitSearch(SearchParser.SearchContext ctx) { + QsNode result = visit(ctx.clause()); + if (result == null) { + throw new RuntimeException("Invalid search clause"); + } + return result; + } + + @Override + public QsNode visitOrClause(SearchParser.OrClauseContext ctx) { + // In Lucene mode, we need to process the entire OR chain together + // to correctly assign MUST/SHOULD/MUST_NOT based on operator sequence + return processLuceneBooleanChain(ctx); + } + + /** + * Process the entire boolean expression chain in Lucene mode. + * This is the core of Lucene-style boolean parsing. 
+ */ + private QsNode processLuceneBooleanChain(SearchParser.OrClauseContext ctx) { + // Collect all terms and operators from the expression tree + List terms = new ArrayList<>(); + collectTermsWithOperators(ctx, terms, QsOccur.MUST); // default_operator = AND means MUST + + if (terms.isEmpty()) { + throw new RuntimeException("No terms found in boolean expression"); + } + + if (terms.size() == 1) { + TermWithOccur singleTerm = terms.get(0); + if (singleTerm.isNegated) { + // Single negated term - must wrap in OCCUR_BOOLEAN for BE to handle MUST_NOT + singleTerm.node.occur = QsOccur.MUST_NOT; + List children = new ArrayList<>(); + children.add(singleTerm.node); + return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 0); + } + // Single non-negated term - return directly without wrapper + return singleTerm.node; + } + + // Apply Lucene boolean logic + applyLuceneBooleanLogic(terms); + + // Determine minimum_should_match + Integer minShouldMatch = options.getMinimumShouldMatch(); + if (minShouldMatch == null) { + // Default: 0 if there are MUST clauses, 1 if only SHOULD + boolean hasMust = terms.stream().anyMatch(t -> t.occur == QsOccur.MUST); + boolean hasMustNot = terms.stream().anyMatch(t -> t.occur == QsOccur.MUST_NOT); + minShouldMatch = (hasMust || hasMustNot) ? 
0 : 1; + } + + // Filter out SHOULD clauses if minimum_should_match=0 and there are MUST clauses + final int finalMinShouldMatch = minShouldMatch; + if (minShouldMatch == 0) { + boolean hasMust = terms.stream().anyMatch(t -> t.occur == QsOccur.MUST); + if (hasMust) { + terms = terms.stream() + .filter(t -> t.occur != QsOccur.SHOULD) + .collect(java.util.stream.Collectors.toList()); + } + } + + if (terms.isEmpty()) { + throw new RuntimeException("All terms filtered out in Lucene boolean logic"); + } + + if (terms.size() == 1) { + TermWithOccur remainingTerm = terms.get(0); + if (remainingTerm.occur == QsOccur.MUST_NOT) { + // Single MUST_NOT term - must wrap in OCCUR_BOOLEAN for BE to handle + remainingTerm.node.occur = QsOccur.MUST_NOT; + List children = new ArrayList<>(); + children.add(remainingTerm.node); + return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 0); + } + return remainingTerm.node; + } + + // Build OCCUR_BOOLEAN node + List children = new ArrayList<>(); + for (TermWithOccur term : terms) { + term.node.occur = term.occur; + children.add(term.node); + } + + return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, finalMinShouldMatch); + } + + /** + * Collect terms from the parse tree with their positions + */ + private void collectTermsWithOperators(ParseTree ctx, List terms, QsOccur defaultOccur) { + if (ctx instanceof SearchParser.OrClauseContext) { + SearchParser.OrClauseContext orCtx = (SearchParser.OrClauseContext) ctx; + List andClauses = orCtx.andClause(); + + for (int i = 0; i < andClauses.size(); i++) { + // Mark that this term is introduced by OR if not the first + boolean introducedByOr = (i > 0); + collectTermsFromAndClause(andClauses.get(i), terms, defaultOccur, introducedByOr); + } + } + } + + private void collectTermsFromAndClause(SearchParser.AndClauseContext ctx, List terms, + QsOccur defaultOccur, boolean introducedByOr) { + List notClauses = ctx.notClause(); + + for (int i = 0; i < notClauses.size(); i++) { + boolean 
introducedByAnd = (i > 0); + collectTermsFromNotClause(notClauses.get(i), terms, defaultOccur, introducedByOr, introducedByAnd); + // After first term, all subsequent in same AND chain are introducedByOr=false + introducedByOr = false; + } + } + + private void collectTermsFromNotClause(SearchParser.NotClauseContext ctx, List terms, + QsOccur defaultOccur, boolean introducedByOr, boolean introducedByAnd) { + boolean isNegated = ctx.NOT() != null; + SearchParser.AtomClauseContext atomCtx = ctx.atomClause(); + + if (atomCtx.clause() != null) { + // Parenthesized clause - visit recursively + QsNode subNode = visit(atomCtx.clause()); + TermWithOccur term = new TermWithOccur(subNode, defaultOccur); + term.introducedByOr = introducedByOr; + term.introducedByAnd = introducedByAnd; + term.isNegated = isNegated; + terms.add(term); + } else { + // Field query + QsNode node = visit(atomCtx.fieldQuery()); + TermWithOccur term = new TermWithOccur(node, defaultOccur); + term.introducedByOr = introducedByOr; + term.introducedByAnd = introducedByAnd; + term.isNegated = isNegated; + terms.add(term); + } + } + + /** + * Apply Lucene boolean logic to determine final MUST/SHOULD/MUST_NOT for each term. + *

+ * Rules (processed left-to-right): + * 1. First term: MUST (due to default_operator=AND) + * 2. AND introduces: marks preceding and current as MUST + * 3. OR introduces: marks preceding and current as SHOULD + * 4. NOT modifier: marks current as MUST_NOT + * 5. AND after MUST_NOT: the MUST_NOT term is not affected, current becomes MUST + */ + private void applyLuceneBooleanLogic(List terms) { + for (int i = 0; i < terms.size(); i++) { + TermWithOccur current = terms.get(i); + + if (current.isNegated) { + // NOT modifier - mark as MUST_NOT + current.occur = QsOccur.MUST_NOT; + + // OR + NOT: preceding becomes SHOULD (if not already MUST_NOT) + if (current.introducedByOr && i > 0) { + TermWithOccur prev = terms.get(i - 1); + if (prev.occur != QsOccur.MUST_NOT) { + prev.occur = QsOccur.SHOULD; + } + } + } else if (current.introducedByAnd) { + // AND introduces: both preceding and current are MUST + current.occur = QsOccur.MUST; + if (i > 0) { + TermWithOccur prev = terms.get(i - 1); + // Don't change MUST_NOT to MUST + if (prev.occur != QsOccur.MUST_NOT) { + prev.occur = QsOccur.MUST; + } + } + } else if (current.introducedByOr) { + // OR introduces: both preceding and current are SHOULD + current.occur = QsOccur.SHOULD; + if (i > 0) { + TermWithOccur prev = terms.get(i - 1); + // Don't change MUST_NOT to SHOULD + if (prev.occur != QsOccur.MUST_NOT) { + prev.occur = QsOccur.SHOULD; + } + } + } else { + // First term: MUST (default_operator=AND) + current.occur = QsOccur.MUST; + } + } + } + + @Override + public QsNode visitAndClause(SearchParser.AndClauseContext ctx) { + // This is called for simple cases, delegate to parent's logic + if (ctx.notClause().size() == 1) { + return visit(ctx.notClause(0)); + } + + // Multiple AND terms - use processLuceneBooleanChain via parent + List children = new ArrayList<>(); + for (SearchParser.NotClauseContext notCtx : ctx.notClause()) { + QsNode child = visit(notCtx); + if (child != null) { + children.add(child); + } + } + + if 
(children.size() == 1) { + return children.get(0); + } + + return new QsNode(QsClauseType.AND, children); + } + + @Override + public QsNode visitNotClause(SearchParser.NotClauseContext ctx) { + if (ctx.NOT() != null) { + QsNode child = visit(ctx.atomClause()); + if (child == null) { + throw new RuntimeException("Invalid NOT clause: missing operand"); + } + List children = new ArrayList<>(); + children.add(child); + return new QsNode(QsClauseType.NOT, children); + } + return visit(ctx.atomClause()); + } + + @Override + public QsNode visitAtomClause(SearchParser.AtomClauseContext ctx) { + if (ctx.clause() != null) { + return visit(ctx.clause()); + } + return visit(ctx.fieldQuery()); + } + + @Override + public QsNode visitFieldQuery(SearchParser.FieldQueryContext ctx) { + // Build complete field path + StringBuilder fullPath = new StringBuilder(); + List segments = ctx.fieldPath().fieldSegment(); + + for (int i = 0; i < segments.size(); i++) { + if (i > 0) { + fullPath.append('.'); + } + String segment = segments.get(i).getText(); + if (segment.startsWith("\"") && segment.endsWith("\"")) { + segment = segment.substring(1, segment.length() - 1); + } + fullPath.append(segment); + } + + String fieldPath = fullPath.toString(); + fieldNames.add(fieldPath); + + String previousFieldName = currentFieldName; + currentFieldName = fieldPath; + + try { + return visit(ctx.searchValue()); + } finally { + currentFieldName = previousFieldName; + } + } + + @Override + public QsNode visitSearchValue(SearchParser.SearchValueContext ctx) { + String fieldName = currentFieldName != null ? 
currentFieldName : "_all"; + + if (ctx.TERM() != null) { + return new QsNode(QsClauseType.TERM, fieldName, unescapeTermValue(ctx.TERM().getText())); + } + if (ctx.PREFIX() != null) { + return new QsNode(QsClauseType.PREFIX, fieldName, unescapeTermValue(ctx.PREFIX().getText())); + } + if (ctx.WILDCARD() != null) { + return new QsNode(QsClauseType.WILDCARD, fieldName, unescapeTermValue(ctx.WILDCARD().getText())); + } + if (ctx.REGEXP() != null) { + String regexp = ctx.REGEXP().getText(); + if (regexp.startsWith("/") && regexp.endsWith("/")) { + regexp = regexp.substring(1, regexp.length() - 1); + } + return new QsNode(QsClauseType.REGEXP, fieldName, regexp); + } + if (ctx.QUOTED() != null) { + String quoted = ctx.QUOTED().getText(); + if (quoted.startsWith("\"") && quoted.endsWith("\"")) { + quoted = quoted.substring(1, quoted.length() - 1); + } + return new QsNode(QsClauseType.PHRASE, fieldName, quoted); + } + if (ctx.rangeValue() != null) { + SearchParser.RangeValueContext rangeCtx = ctx.rangeValue(); + String rangeText; + if (rangeCtx.LBRACKET() != null) { + rangeText = "[" + rangeCtx.rangeEndpoint(0).getText() + " TO " + + rangeCtx.rangeEndpoint(1).getText() + "]"; + } else { + rangeText = "{" + rangeCtx.rangeEndpoint(0).getText() + " TO " + + rangeCtx.rangeEndpoint(1).getText() + "}"; + } + return new QsNode(QsClauseType.RANGE, fieldName, rangeText); + } + if (ctx.listValue() != null) { + StringBuilder listText = new StringBuilder("IN("); + for (int i = 0; i < ctx.listValue().LIST_TERM().size(); i++) { + if (i > 0) { + listText.append(" "); + } + listText.append(ctx.listValue().LIST_TERM(i).getText()); + } + listText.append(")"); + return new QsNode(QsClauseType.LIST, fieldName, listText.toString()); + } + if (ctx.anyAllValue() != null) { + String text = ctx.anyAllValue().getText(); + String innerContent = extractParenthesesContent(text); + String sanitizedContent = stripOuterQuotes(innerContent); + if (text.toUpperCase().startsWith("ANY(")) { + return new 
QsNode(QsClauseType.ANY, fieldName, sanitizedContent); + } + return new QsNode(QsClauseType.ALL, fieldName, sanitizedContent); + } + if (ctx.exactValue() != null) { + String innerContent = extractParenthesesContent(ctx.exactValue().getText()); + return new QsNode(QsClauseType.EXACT, fieldName, innerContent); + } + + return new QsNode(QsClauseType.TERM, fieldName, unescapeTermValue(ctx.getText())); + } + + private String extractParenthesesContent(String text) { + int openParen = text.indexOf('('); + int closeParen = text.lastIndexOf(')'); + if (openParen >= 0 && closeParen > openParen) { + return text.substring(openParen + 1, closeParen).trim(); + } + return ""; + } + + private String stripOuterQuotes(String text) { + if (text == null || text.length() < 2) { + return text; + } + char first = text.charAt(0); + char last = text.charAt(text.length() - 1); + if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) { + return text.substring(1, text.length() - 1); + } + return text; + } + } + + /** + * Helper class to track term with its occur status during parsing + */ + private static class TermWithOccur { + QsNode node; + QsOccur occur; + boolean introducedByOr = false; + boolean introducedByAnd = false; + boolean isNegated = false; + + TermWithOccur(QsNode node, QsOccur occur) { + this.node = node; + this.occur = occur; + } + } + + /** + * Process escape sequences in a term value. + * Converts escape sequences to their literal characters: + * - \ (backslash space) -> space + * - \( -> ( + * - \) -> ) + * - \: -> : + * - \\ -> \ + * - \* -> * + * - \? -> ? + * - etc. 
+ * + * @param value The raw term value with escape sequences + * @return The unescaped value + */ + private static String unescapeTermValue(String value) { + if (value == null || value.isEmpty()) { + return value; + } + + // Quick check: if no backslash, return as-is + if (value.indexOf('\\') < 0) { + return value; + } + + StringBuilder result = new StringBuilder(value.length()); + int i = 0; + while (i < value.length()) { + char c = value.charAt(i); + if (c == '\\' && i + 1 < value.length()) { + // Escape sequence - take the next character literally + result.append(value.charAt(i + 1)); + i += 2; + } else { + result.append(c); + i++; + } + } + return result.toString(); + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java index eb1bf3f5d3a52c..6279aead20a708 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java @@ -20,6 +20,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsClauseType; import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsFieldBinding; import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsNode; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsOccur; import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsPlan; import org.junit.jupiter.api.Assertions; @@ -577,4 +578,351 @@ public void testDefaultFieldPreservesFieldBindings() { Assertions.assertEquals("tags", plan.fieldBindings.get(0).fieldName); Assertions.assertEquals(0, plan.fieldBindings.get(0).slotIndex); } + + // ============ Tests for Lucene Mode Parsing 
============ + + @Test + public void testLuceneModeSimpleAndQuery() { + // Test: "a AND b" in Lucene mode → both MUST + String dsl = "field:a AND field:b"; + String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type); + Assertions.assertEquals(2, plan.root.children.size()); + Assertions.assertEquals(Integer.valueOf(0), plan.root.minimumShouldMatch); + + // Both children should have MUST occur + for (QsNode child : plan.root.children) { + Assertions.assertEquals(SearchDslParser.QsOccur.MUST, child.occur); + } + } + + @Test + public void testLuceneModeSimpleOrQuery() { + // Test: "a OR b OR c" in Lucene mode → all SHOULD, at least one must match + String dsl = "field:a OR field:b OR field:c"; + String options = "{\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type); + Assertions.assertEquals(3, plan.root.children.size()); + + // All children should have SHOULD occur + for (QsNode child : plan.root.children) { + Assertions.assertEquals(SearchDslParser.QsOccur.SHOULD, child.occur); + } + + // minimum_should_match should be 1 (at least one must match) + Assertions.assertEquals(Integer.valueOf(1), plan.root.minimumShouldMatch); + } + + @Test + public void testLuceneModeAndOrMixed() { + // Test: "a AND b OR c" in Lucene mode with minimum_should_match=0 + // Expected: +a (SHOULD terms discarded because MUST exists) + String dsl = "field:a AND field:b OR field:c"; + String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + // With minimum_should_match=0 and MUST clauses present, SHOULD is discarded + // Only "a" remains with MUST + 
Assertions.assertEquals(QsClauseType.TERM, plan.root.type); + Assertions.assertEquals("field", plan.root.field); + Assertions.assertEquals("a", plan.root.value); + } + + @Test + public void testLuceneModeAndOrNotMixed() { + // Test: "a AND b OR NOT c AND d" in Lucene mode + // Expected processing: + // - a: MUST (first term, default_operator=AND) + // - b: MUST (AND introduces) + // - c: MUST_NOT (OR + NOT, but OR makes preceding SHOULD, NOT makes current MUST_NOT) + // - d: MUST (AND introduces) + // With minimum_should_match=0: b becomes SHOULD and is discarded + // Result: +a -c +d + String dsl = "field:a AND field:b OR NOT field:c AND field:d"; + String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type); + + // Should have 3 children: a(MUST), c(MUST_NOT), d(MUST) + // b is filtered out because it becomes SHOULD + Assertions.assertEquals(3, plan.root.children.size()); + + QsNode nodeA = plan.root.children.get(0); + Assertions.assertEquals("a", nodeA.value); + Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeA.occur); + + QsNode nodeC = plan.root.children.get(1); + Assertions.assertEquals("c", nodeC.value); + Assertions.assertEquals(SearchDslParser.QsOccur.MUST_NOT, nodeC.occur); + + QsNode nodeD = plan.root.children.get(2); + Assertions.assertEquals("d", nodeD.value); + Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeD.occur); + } + + @Test + public void testLuceneModeWithDefaultField() { + // Test: Lucene mode with default field expansion + String dsl = "aterm AND bterm OR cterm"; + // Now default_field and default_operator are inside the options JSON + String options = "{\"default_field\":\"firstname\",\"default_operator\":\"and\"," + + "\"mode\":\"lucene\",\"minimum_should_match\":0}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + 
Assertions.assertNotNull(plan); + // With minimum_should_match=0, only aterm (MUST) remains + Assertions.assertEquals(QsClauseType.TERM, plan.root.type); + Assertions.assertEquals("firstname", plan.root.field); + Assertions.assertEquals("aterm", plan.root.value); + } + + @Test + public void testLuceneModeNotOperator() { + // Test: "NOT a" in Lucene mode + // In Lucene mode, single NOT produces OCCUR_BOOLEAN with a MUST_NOT child + // (wrapped for BE to handle the negation properly) + String dsl = "NOT field:a"; + String options = "{\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type); + Assertions.assertEquals(1, plan.root.children.size()); + Assertions.assertEquals(QsClauseType.TERM, plan.root.children.get(0).type); + Assertions.assertEquals(QsOccur.MUST_NOT, plan.root.children.get(0).occur); + } + + @Test + public void testLuceneModeMinimumShouldMatchExplicit() { + // Test: explicit minimum_should_match=1 keeps SHOULD clauses + String dsl = "field:a AND field:b OR field:c"; + String options = "{\"mode\":\"lucene\",\"minimum_should_match\":1}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.root.type); + // All 3 terms should be present + Assertions.assertEquals(3, plan.root.children.size()); + Assertions.assertEquals(Integer.valueOf(1), plan.root.minimumShouldMatch); + } + + @Test + public void testLuceneModeSingleTerm() { + // Test: single term should not create OCCUR_BOOLEAN wrapper + String dsl = "field:hello"; + String options = "{\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.TERM, plan.root.type); + Assertions.assertEquals("field", plan.root.field); + Assertions.assertEquals("hello", plan.root.value); + } + + @Test + 
public void testStandardModeUnchanged() { + // Test: standard mode (default) should work as before + String dsl = "field:a AND field:b OR field:c"; + QsPlan plan = SearchDslParser.parseDsl(dsl, (String) null); + + Assertions.assertNotNull(plan); + // Standard mode uses traditional boolean algebra: OR at top level + Assertions.assertEquals(QsClauseType.OR, plan.root.type); + } + + @Test + public void testLuceneModeInvalidJson() { + // Test: invalid JSON options should fall back to standard mode + String dsl = "field:a AND field:b"; + String options = "not valid json"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + // Should fall back to standard mode (AND type) + Assertions.assertEquals(QsClauseType.AND, plan.root.type); + } + + @Test + public void testLuceneModeEmptyOptions() { + // Test: empty options string should use standard mode + String dsl = "field:a AND field:b"; + QsPlan plan = SearchDslParser.parseDsl(dsl, ""); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.AND, plan.root.type); + } + + // ============ Tests for Escape Handling ============ + + @Test + public void testEscapedSpaceInTerm() { + // Test: "First\ Value" should be treated as a single term "First Value" + // The escape sequence is processed: \ + space -> space + String dsl = "field:First\\ Value"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.TERM, plan.root.type); + Assertions.assertEquals("field", plan.root.field); + // After unescape: "First\ Value" -> "First Value" + Assertions.assertEquals("First Value", plan.root.value); + } + + @Test + public void testEscapedParentheses() { + // Test: \( and \) should be treated as literal characters, not grouping + // The escape sequence is processed: \( -> ( and \) -> ) + String dsl = "field:hello\\(world\\)"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + 
Assertions.assertEquals(QsClauseType.TERM, plan.root.type); + Assertions.assertEquals("field", plan.root.field); + // After unescape: "hello\(world\)" -> "hello(world)" + Assertions.assertEquals("hello(world)", plan.root.value); + } + + @Test + public void testEscapedColon() { + // Test: \: should be treated as literal colon, not field separator + // The escape sequence is processed: \: -> : + String dsl = "field:value\\:with\\:colons"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.TERM, plan.root.type); + Assertions.assertEquals("field", plan.root.field); + // After unescape: "value\:with\:colons" -> "value:with:colons" + Assertions.assertEquals("value:with:colons", plan.root.value); + } + + @Test + public void testEscapedBackslash() { + // Test: \\ should be treated as a literal backslash + // The escape sequence is processed: \\ -> \ + String dsl = "field:path\\\\to\\\\file"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.TERM, plan.root.type); + Assertions.assertEquals("field", plan.root.field); + // After unescape: "path\\to\\file" -> "path\to\file" + Assertions.assertEquals("path\\to\\file", plan.root.value); + } + + @Test + public void testUppercaseAndOperator() { + // Test: uppercase AND should be treated as operator + String dsl = "field:a AND field:b"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.AND, plan.root.type); + Assertions.assertEquals(2, plan.root.children.size()); + } + + @Test + public void testLowercaseAndOperator() { + // Test: Currently lowercase 'and' is also treated as operator + // According to PDF requirement, only uppercase should be operators + // This test documents current behavior - may need to change + String dsl = "field:a and field:b"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + 
Assertions.assertNotNull(plan); + // Current behavior: lowercase 'and' IS an operator + Assertions.assertEquals(QsClauseType.AND, plan.root.type); + // TODO: If PDF requires only uppercase, this should fail and return OR or different structure + } + + @Test + public void testUppercaseOrOperator() { + // Test: uppercase OR should be treated as operator + String dsl = "field:a OR field:b"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OR, plan.root.type); + Assertions.assertEquals(2, plan.root.children.size()); + } + + @Test + public void testLowercaseOrOperator() { + // Test: Currently lowercase 'or' is also treated as operator + // According to PDF requirement, only uppercase should be operators + String dsl = "field:a or field:b"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + // Current behavior: lowercase 'or' IS an operator + Assertions.assertEquals(QsClauseType.OR, plan.root.type); + // TODO: If PDF requires only uppercase, this should fail + } + + @Test + public void testUppercaseNotOperator() { + // Test: uppercase NOT should be treated as operator + String dsl = "NOT field:spam"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.NOT, plan.root.type); + } + + @Test + public void testLowercaseNotOperator() { + // Test: Currently lowercase 'not' is also treated as operator + // According to PDF requirement, only uppercase should be operators + String dsl = "not field:spam"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + // Current behavior: lowercase 'not' IS an operator + Assertions.assertEquals(QsClauseType.NOT, plan.root.type); + // TODO: If PDF requires only uppercase, this should fail + } + + @Test + public void testExclamationNotOperator() { + // Test: ! 
should be treated as NOT operator + String dsl = "!field:spam"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + // Current behavior: ! IS a NOT operator + Assertions.assertEquals(QsClauseType.NOT, plan.root.type); + } + + @Test + public void testEscapedSpecialCharactersInQuoted() { + // Test: escaped characters inside quoted strings + // Note: For PHRASE queries, escape handling is preserved as-is for now + // The backend will handle escape processing for phrase queries + String dsl = "field:\"hello\\\"world\""; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.PHRASE, plan.root.type); + Assertions.assertEquals("hello\\\"world", plan.root.value); + } + + @Test + public void testNoEscapeWithoutBackslash() { + // Test: normal term without escape characters + String dsl = "field:normalterm"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.TERM, plan.root.type); + Assertions.assertEquals("normalterm", plan.root.value); + } } diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift index 137818634cbc47..2bca8c2b877fb0 100644 --- a/gensrc/thrift/Exprs.thrift +++ b/gensrc/thrift/Exprs.thrift @@ -238,11 +238,21 @@ struct TSchemaChangeExpr { } // Search DSL parameter structure + +// Occur type for Lucene-style boolean queries +enum TSearchOccur { + MUST = 0, // Term must appear (equivalent to +term) + SHOULD = 1, // Term should appear (optional, but contributes to matching) + MUST_NOT = 2 // Term must not appear (equivalent to -term) +} + struct TSearchClause { - 1: required string clause_type // TERM, QUOTED, PREFIX, WILDCARD, REGEXP, RANGE, LIST, ANY_ALL, AND, OR, NOT + 1: required string clause_type // TERM, QUOTED, PREFIX, WILDCARD, REGEXP, RANGE, LIST, ANY_ALL, AND, OR, NOT, OCCUR_BOOLEAN 2: optional string field_name // Field name for leaf clauses 3: optional string value // 
Search value for leaf clauses - 4: optional list children // Child clauses for compound clauses (AND, OR, NOT) + 4: optional list children // Child clauses for compound clauses (AND, OR, NOT, OCCUR_BOOLEAN) + 5: optional TSearchOccur occur // Occur type for this clause (used with OCCUR_BOOLEAN parent) + 6: optional i32 minimum_should_match // Minimum number of SHOULD clauses that must match (for OCCUR_BOOLEAN) } struct TSearchFieldBinding { diff --git a/regression-test/data/search/test_search_escape.out b/regression-test/data/search/test_search_escape.out new file mode 100644 index 00000000000000..09bd9f80b2b40d --- /dev/null +++ b/regression-test/data/search/test_search_escape.out @@ -0,0 +1,46 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !escape_space -- +1 First Value + +-- !phrase_query -- +1 First Value + +-- !escape_parentheses -- +3 hello(world) + +-- !escape_colon -- +5 key:value + +-- !escape_backslash -- +6 path\\to\\file + +-- !uppercase_and -- +7 first fruit + +-- !uppercase_or -- +1 first content +2 second content +7 first fruit +8 second fruit + +-- !uppercase_not -- +8 second fruit + +-- !lowercase_and -- +7 first fruit + +-- !lowercase_or -- +1 first content +2 second content +7 first fruit +8 second fruit + +-- !exclamation_not -- +8 second fruit + +-- !default_field_escape -- +1 First Value + +-- !lucene_mode_escape -- +1 First Value + diff --git a/regression-test/data/search/test_search_lucene_mode.out b/regression-test/data/search/test_search_lucene_mode.out new file mode 100644 index 00000000000000..68d8e6c1279012 --- /dev/null +++ b/regression-test/data/search/test_search_lucene_mode.out @@ -0,0 +1,86 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !standard_and -- +1 apple banana cherry +2 apple banana + +-- !lucene_and -- +1 apple banana cherry +2 apple banana + +-- !standard_or -- +1 apple banana cherry +2 apple banana +3 apple +5 cherry date +6 date elderberry +8 apple fig + +-- !lucene_or -- +1 apple banana cherry +2 apple banana +3 apple +5 cherry date +6 date elderberry +8 apple fig + +-- !lucene_complex_and_or -- +1 apple banana cherry +2 apple banana +3 apple +8 apple fig + +-- !lucene_min_should_match_1 -- +1 apple banana cherry +2 apple banana + +-- !lucene_not -- + +-- !lucene_and_not -- +3 apple +8 apple fig + +-- !lucene_or_not -- +3 apple +8 apple fig + +-- !lucene_or_only -- +1 apple banana cherry +2 apple banana +3 apple +5 cherry date +6 date elderberry +7 fig grape +8 apple fig + +-- !lucene_cross_field -- +1 apple banana cherry fruit +2 apple banana fruit +3 apple fruit + +-- !standard_cross_field -- +1 apple banana cherry fruit +2 apple banana fruit +3 apple fruit + +-- !lucene_phrase -- +1 apple banana cherry +2 apple banana + +-- !lucene_wildcard -- +1 apple banana cherry +2 apple banana + +-- !standard_unchanged -- +1 apple banana cherry +2 apple banana + +-- !empty_options -- +1 apple banana cherry +2 apple banana + +-- !lucene_min_should_match_0 -- +1 apple banana cherry +2 apple banana +3 apple +8 apple fig + diff --git a/regression-test/suites/search/test_search_default_field_operator.groovy b/regression-test/suites/search/test_search_default_field_operator.groovy index fd5c7ce6198fc7..230825862357ef 100644 --- a/regression-test/suites/search/test_search_default_field_operator.groovy +++ b/regression-test/suites/search/test_search_default_field_operator.groovy @@ -52,41 +52,41 @@ suite("test_search_default_field_operator") { // ============ Test 1: Wildcard Prefix with Default Field ============ // Requirement: firstname EQ Chris* - // SQL: search('Chris*', 'firstname') + // SQL: search('Chris*', 
'{"default_field":"firstname"}') // Expected: Chris (1), Christopher (2) // Note: Without parser, inverted index is case-sensitive qt_wildcard_prefix """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname FROM ${tableName} - WHERE search('Chris*', 'firstname') + WHERE search('Chris*', '{"default_field":"firstname"}') ORDER BY id """ // ============ Test 2: Multi-term AND with Default Operator ============ // Requirement: tags EQ foo bar (with AND semantics) - // SQL: search('foo bar', 'tags', 'and') + // SQL: search('foo bar', '{"default_field":"tags","default_operator":"and"}') // Expected: 'foo bar' (1), 'bar foo' (3) qt_multi_term_and """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('foo bar', 'tags', 'and') + WHERE search('foo bar', '{"default_field":"tags","default_operator":"and"}') ORDER BY id """ // ============ Test 3: Multi-term OR with Default Operator ============ // Requirement: tags EQ foo OR bark (with OR semantics) - // SQL: search('foo bark', 'tags', 'or') + // SQL: search('foo bark', '{"default_field":"tags","default_operator":"or"}') // Expected: 'foo bar' (1), 'bar foo' (3), 'foolish bark' (4) qt_multi_term_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('foo bark', 'tags', 'or') + WHERE search('foo bark', '{"default_field":"tags","default_operator":"or"}') ORDER BY id """ // ============ Test 4: Multi-wildcard AND ============ // Requirement: tags EQ foo* bar* (with AND semantics) - // SQL: search('foo* bar*', 'tags', 'and') + // SQL: search('foo* bar*', '{"default_field":"tags","default_operator":"and"}') // Expands to: tags:foo* AND tags:bar* // Expected: rows with tokens matching foo* AND tokens matching bar* // - 'foo bar' (1): tokens=['foo','bar'] - matches foo* ✓ and bar* ✓ @@ -96,29 +96,29 @@ suite("test_search_default_field_operator") { qt_wildcard_multi_and """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) 
*/ id, tags FROM ${tableName} - WHERE search('foo* bar*', 'tags', 'and') + WHERE search('foo* bar*', '{"default_field":"tags","default_operator":"and"}') ORDER BY id """ // ============ Test 5: Explicit OR operator overrides default ============ - // SQL: search('foo OR bark', 'tags', 'and') + // SQL: search('foo OR bark', '{"default_field":"tags","default_operator":"and"}') // The explicit OR in DSL should override the default 'and' operator // Expected: 'foo bar' (1), 'bar foo' (3), 'foolish bark' (4) qt_explicit_or_override """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('foo OR bark', 'tags', 'and') + WHERE search('foo OR bark', '{"default_field":"tags","default_operator":"and"}') ORDER BY id """ // ============ Test 6: EXACT function with default field ============ // Requirement: EXACT(foo bar) on tags_exact field (no tokenization) - // SQL: search('EXACT(foo bar)', 'tags_exact') + // SQL: search('EXACT(foo bar)', '{"default_field":"tags_exact"}') // Expected: 'foo bar' (1) only - exact string match qt_exact_function """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags_exact FROM ${tableName} - WHERE search('EXACT(foo bar)', 'tags_exact') + WHERE search('EXACT(foo bar)', '{"default_field":"tags_exact"}') ORDER BY id """ @@ -135,7 +135,7 @@ suite("test_search_default_field_operator") { qt_single_term """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('bar', 'tags') + WHERE search('bar', '{"default_field":"tags"}') ORDER BY id """ @@ -143,7 +143,7 @@ suite("test_search_default_field_operator") { qt_wildcard_middle """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname FROM ${tableName} - WHERE search('*ris*', 'firstname') + WHERE search('*ris*', '{"default_field":"firstname"}') ORDER BY id """ @@ -153,7 +153,7 @@ suite("test_search_default_field_operator") { qt_case_sensitive """ SELECT 
/*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname FROM ${tableName} - WHERE search('CHRIS*', 'firstname') + WHERE search('CHRIS*', '{"default_field":"firstname"}') ORDER BY id """ @@ -161,7 +161,7 @@ suite("test_search_default_field_operator") { qt_default_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('foo bark', 'tags') + WHERE search('foo bark', '{"default_field":"tags"}') ORDER BY id """ @@ -169,7 +169,7 @@ suite("test_search_default_field_operator") { qt_any_function """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('ANY(foo bark)', 'tags') + WHERE search('ANY(foo bark)', '{"default_field":"tags"}') ORDER BY id """ @@ -177,7 +177,7 @@ suite("test_search_default_field_operator") { qt_all_function """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('ALL(foo bar)', 'tags') + WHERE search('ALL(foo bar)', '{"default_field":"tags"}') ORDER BY id """ @@ -185,7 +185,7 @@ suite("test_search_default_field_operator") { qt_complex_wildcard """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname FROM ${tableName} - WHERE search('?evin', 'firstname') + WHERE search('?evin', '{"default_field":"firstname"}') ORDER BY id """ @@ -193,7 +193,7 @@ suite("test_search_default_field_operator") { qt_explicit_and """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('foo AND bar', 'tags') + WHERE search('foo AND bar', '{"default_field":"tags"}') ORDER BY id """ @@ -209,19 +209,19 @@ suite("test_search_default_field_operator") { qt_not_operator """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, tags FROM ${tableName} - WHERE search('NOT foobar', 'tags') + WHERE search('NOT foobar', '{"default_field":"tags"}') ORDER BY id """ // ============ Test 18: Combining different parameter counts ============ - // Tests mixing 1-param, 2-param, and 3-param 
search() calls in same query + // Tests mixing 1-param and 2-param search() calls in same query // - search('firstname:Chris*'): 1-param, traditional syntax → matches id 1,2 - // - search('foo*', 'tags', 'or'): 3-param with wildcard → matches id 1,3,4 + // - search('foo*', '{"default_field":"tags","default_operator":"or"}'): 2-param with JSON options → matches id 1,3,4 // - OR combination → matches id 1,2,3,4 (all rows) qt_param_count_mix """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} - WHERE search('firstname:Chris*') OR search('foo*', 'tags', 'or') + WHERE search('firstname:Chris*') OR search('foo*', '{"default_field":"tags","default_operator":"or"}') ORDER BY id """ diff --git a/regression-test/suites/search/test_search_escape.groovy b/regression-test/suites/search/test_search_escape.groovy new file mode 100644 index 00000000000000..629d3fcc1f5129 --- /dev/null +++ b/regression-test/suites/search/test_search_escape.groovy @@ -0,0 +1,189 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Tests for escape character handling in search() function. 
+ * + * Escape semantics in DSL: + * - Backslash (\) escapes the next character + * - Escaped space (\ ) joins terms: "First\ Value" -> single term "First Value" + * - Escaped parentheses (\( \)) are literal characters, not grouping + * - Escaped colon (\:) is literal, not field separator + * - Escaped backslash (\\) is a literal backslash + * + * Escape chain in Groovy regression tests: + * - Groovy string: \\\\ -> SQL string: \\ -> DSL: \ (escape char) + * - Groovy string: \\\\\\\\ -> SQL string: \\\\ -> DSL: \\ -> literal: \ + */ +suite("test_search_escape") { + def tableName = "search_escape_test" + + sql "DROP TABLE IF EXISTS ${tableName}" + + // Create table with inverted indexes + // parser=none: store the entire value as a single term (no tokenization) + sql """ + CREATE TABLE ${tableName} ( + id INT, + title VARCHAR(200), + content VARCHAR(500), + INDEX idx_title(title) USING INVERTED PROPERTIES("parser" = "none"), + INDEX idx_content(content) USING INVERTED PROPERTIES("parser" = "english") + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + + // Insert test data + // With parser=none, these values are stored as-is (single terms) + // Groovy \\\\ -> SQL \\ -> stored as single backslash \ + sql """INSERT INTO ${tableName} VALUES + (1, 'First Value', 'first content'), + (2, 'FirstValue', 'second content'), + (3, 'hello(world)', 'third content'), + (4, 'hello world', 'fourth content'), + (5, 'key:value', 'fifth content'), + (6, 'path\\\\to\\\\file', 'sixth content'), + (7, 'apple', 'first fruit'), + (8, 'banana', 'second fruit') + """ + + // Wait for index building + Thread.sleep(3000) + + // ============ Test 1: Escaped space - search for "First Value" as single term ============ + // DSL: title:First\ Value -> searches for term "First Value" (with space) + // Groovy: \\\\ -> SQL: \\ -> DSL: \ (escape) + // This should match row 1 which has "First Value" stored as 
single term (parser=none) + qt_escape_space """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title + FROM ${tableName} + WHERE search('title:First\\\\ Value') + ORDER BY id + """ + + // ============ Test 2: Without escape - space separates terms ============ + // DSL: title:First Value -> "First" and "Value" as separate terms (syntax error without field) + // This query won't work as expected, showing the difference + // Using phrase query instead to show the contrast + qt_phrase_query """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title + FROM ${tableName} + WHERE search('title:"First Value"') + ORDER BY id + """ + + // ============ Test 3: Escaped parentheses ============ + // DSL: title:hello\(world\) -> searches for literal "hello(world)" + // Groovy: \\\\( -> SQL: \\( -> DSL: \( -> literal: ( + qt_escape_parentheses """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title + FROM ${tableName} + WHERE search('title:hello\\\\(world\\\\)') + ORDER BY id + """ + + // ============ Test 4: Escaped colon ============ + // DSL: title:key\:value -> searches for literal "key:value" + // Groovy: \\\\: -> SQL: \\: -> DSL: \: -> literal: : + qt_escape_colon """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title + FROM ${tableName} + WHERE search('title:key\\\\:value') + ORDER BY id + """ + + // ============ Test 5: Escaped backslash ============ + // DSL: title:path\\to\\file -> searches for "path\to\file" + // Groovy: \\\\\\\\ -> SQL: \\\\ -> DSL: \\ -> literal: \ + // Data stored: path\to\file (Groovy \\\\ -> SQL \\ -> stored \) + qt_escape_backslash """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title + FROM ${tableName} + WHERE search('title:path\\\\\\\\to\\\\\\\\file') + ORDER BY id + """ + + // ============ Test 6: Uppercase AND operator ============ + qt_uppercase_and """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content + FROM ${tableName} + WHERE 
search('content:first AND content:fruit') + ORDER BY id + """ + + // ============ Test 7: Uppercase OR operator ============ + qt_uppercase_or """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content + FROM ${tableName} + WHERE search('content:first OR content:second') + ORDER BY id + """ + + // ============ Test 8: Uppercase NOT operator ============ + qt_uppercase_not """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content + FROM ${tableName} + WHERE search('content:fruit AND NOT content:first') + ORDER BY id + """ + + // ============ Test 9: Lowercase and operator ============ + qt_lowercase_and """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content + FROM ${tableName} + WHERE search('content:first and content:fruit') + ORDER BY id + """ + + // ============ Test 10: Lowercase or operator ============ + qt_lowercase_or """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content + FROM ${tableName} + WHERE search('content:first or content:second') + ORDER BY id + """ + + // ============ Test 11: Exclamation NOT operator ============ + qt_exclamation_not """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content + FROM ${tableName} + WHERE search('content:fruit AND !content:first') + ORDER BY id + """ + + // ============ Test 12: Default field with escaped space ============ + // DSL: First\ Value with default_field=title (JSON options format) + qt_default_field_escape """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title + FROM ${tableName} + WHERE search('First\\\\ Value', '{"default_field":"title","default_operator":"and"}') + ORDER BY id + """ + + // ============ Test 13: Lucene mode with escaped space ============ + qt_lucene_mode_escape """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title + FROM ${tableName} + WHERE search('First\\\\ Value', '{"default_field":"title","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Cleanup + 
sql "DROP TABLE IF EXISTS ${tableName}" +} diff --git a/regression-test/suites/search/test_search_inverted_is_null_pushdown.groovy b/regression-test/suites/search/test_search_inverted_is_null_pushdown.groovy new file mode 100644 index 00000000000000..8a4eec5f8eadf1 --- /dev/null +++ b/regression-test/suites/search/test_search_inverted_is_null_pushdown.groovy @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_search_inverted_is_null_pushdown", "p0") { + def tableName = "tbl_search_inverted_is_null_pushdown" + sql """DROP TABLE IF EXISTS ${tableName}""" + sql """ + CREATE TABLE ${tableName} ( + id INT, + dt DATE NULL, + str_col STRING NULL, + val INT NULL, + INDEX idx_dt (dt) USING INVERTED, + INDEX idx_str (str_col) USING INVERTED, + INDEX idx_val (val) USING INVERTED + ) + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql """INSERT INTO ${tableName} VALUES + (1, NULL, 'foo', 1), + (2, NULL, 'bar', -1), + (3, '2024-01-01', 'baz', 5), + (4, NULL, 'qux', 10) + """ + + sql "SET enable_common_expr_pushdown=true" + sql "SET inverted_index_skip_threshold=0" + + def nullBranchQuery = """ + SELECT COUNT(*) + FROM ${tableName} + WHERE (str_col LIKE CONCAT('%', 'no-hit', '%')) + OR (dt IS NULL) AND NOT val BETWEEN -9223372036854775808 AND 0 + """ + + def negatedNotNullQuery = """ + SELECT COUNT(*) + FROM ${tableName} + WHERE NOT (dt IS NOT NULL) + """ + + sql "SET enable_inverted_index_query=true" + def resultWithIndex = sql(nullBranchQuery) + def resultWithIndexNegatedNotNull = sql(negatedNotNullQuery) + assertEquals(2, resultWithIndex[0][0]) // previously returned 0 when dt IS NULL relied on inverted index + assertEquals(3, resultWithIndexNegatedNotNull[0][0]) // previously returned 0 when NOT (dt IS NOT NULL) was evaluated via inverted index + + sql "SET enable_inverted_index_query=false" + def resultWithoutIndex = sql(nullBranchQuery) + def resultWithoutIndexNegatedNotNull = sql(negatedNotNullQuery) + assertEquals(2, resultWithoutIndex[0][0]) + assertEquals(3, resultWithoutIndexNegatedNotNull[0][0]) + + sql """DROP TABLE IF EXISTS ${tableName}""" +} diff --git a/regression-test/suites/search/test_search_lucene_mode.groovy b/regression-test/suites/search/test_search_lucene_mode.groovy new file mode 100644 index 00000000000000..8e9d4edb7e37c6 --- /dev/null +++ 
b/regression-test/suites/search/test_search_lucene_mode.groovy @@ -0,0 +1,250 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Tests for Lucene mode parsing in search() function. + * + * Lucene mode mimics Elasticsearch/Lucene query_string behavior where boolean + * operators work as left-to-right modifiers, not traditional boolean algebra. 
+ *
+ * Key differences from standard mode:
+ * - AND/OR/NOT are modifiers that affect adjacent terms
+ * - Operator precedence is left-to-right
+ * - Uses MUST/SHOULD/MUST_NOT internally (like Lucene's Occur enum)
+ * - minimum_should_match controls SHOULD clause behavior
+ *
+ * Enable Lucene mode with options parameter (JSON format):
+ *   search(dsl, '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+ */
+suite("test_search_lucene_mode") {
+    // Every query below runs against this single table; each qt_* tag records
+    // its result set under that tag name.
+    def tableName = "search_lucene_mode_test"
+
+    // Drop any leftover table from a previous run so the suite starts clean.
+    sql "DROP TABLE IF EXISTS ${tableName}"
+
+    // Create table with inverted indexes.
+    // title and content use the "english" parser; category is indexed without a parser.
+    sql """
+        CREATE TABLE ${tableName} (
+            id INT,
+            title VARCHAR(100),
+            content VARCHAR(200),
+            category VARCHAR(50),
+            INDEX idx_title(title) USING INVERTED PROPERTIES("parser" = "english"),
+            INDEX idx_content(content) USING INVERTED PROPERTIES("parser" = "english"),
+            INDEX idx_category(category) USING INVERTED
+        ) ENGINE=OLAP
+        DUPLICATE KEY(id)
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_allocation" = "tag.location.default: 1")
+    """
+
+    // Insert test data
+    // Test data designed to verify Lucene-style boolean logic.
+    // Title terms by row id (used by the expectations in the comments below):
+    //   apple  -> 1, 2, 3, 8
+    //   banana -> 1, 2, 4
+    //   cherry -> 1, 4, 5
+    //   date   -> 5, 6
+    //   fig    -> 7, 8
+    sql """INSERT INTO ${tableName} VALUES
+        (1, 'apple banana cherry', 'red green blue', 'fruit'),
+        (2, 'apple banana', 'red green', 'fruit'),
+        (3, 'apple', 'red', 'fruit'),
+        (4, 'banana cherry', 'green blue', 'fruit'),
+        (5, 'cherry date', 'blue yellow', 'fruit'),
+        (6, 'date elderberry', 'yellow purple', 'berry'),
+        (7, 'fig grape', 'orange pink', 'mixed'),
+        (8, 'apple fig', 'red orange', 'mixed')
+    """
+
+    // Wait for index building
+    // NOTE(review): fixed 3-second pause; presumably only needed so the inserted
+    // rows are queryable before the first search - confirm whether it is required.
+    Thread.sleep(3000)
+
+    // ============ Test 1: Standard mode AND behavior ============
+    // In standard mode, 'apple AND banana' behaves like boolean AND
+    qt_standard_and """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('title:apple AND title:banana')
+        ORDER BY id
+    """
+
+    // ============ Test 2: Lucene mode AND behavior ============
+    // In Lucene mode, 'a AND b' marks both as MUST (+a +b)
+    // Expected same result as standard mode for simple AND
+    qt_lucene_and """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple AND banana', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 3: Standard mode OR behavior ============
+    // In standard mode, 'apple OR date' returns any row matching either
+    qt_standard_or """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('title:apple OR title:date')
+        ORDER BY id
+    """
+
+    // ============ Test 4: Lucene mode OR behavior ============
+    // In Lucene mode, 'a OR b' marks both as SHOULD with minimum_should_match=1
+    qt_lucene_or """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple OR date', '{"default_field":"title","default_operator":"or","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 5: Lucene mode complex expression ============
+    // 'a AND b OR c' in Lucene mode (left-to-right parsing):
+    // - 'apple' starts as MUST (default_operator=AND)
+    // - 'AND banana' makes 'banana' MUST
+    // - 'OR cherry' makes 'cherry' SHOULD, AND changes 'banana' from MUST to SHOULD!
+    // Final state: +apple banana cherry (only 'apple' is MUST, 'banana' and 'cherry' are SHOULD)
+    // With minimum_should_match=0 (default when MUST exists), SHOULD clauses are discarded.
+    // So effectively: +apple only
+    // Expected: rows containing 'apple' -> 1, 2, 3, 8
+    qt_lucene_complex_and_or """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple AND banana OR cherry', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 6: Lucene mode with explicit minimum_should_match=1 ============
+    // 'a AND b OR c' with minimum_should_match=1 (same Lucene left-to-right parsing):
+    // - 'apple': MUST
+    // - 'AND banana': banana becomes MUST
+    // - 'OR cherry': cherry becomes SHOULD, banana changes from MUST to SHOULD
+    // Final state: +apple banana cherry (apple is MUST, banana and cherry are SHOULD)
+    // With minimum_should_match=1, at least 1 SHOULD must match.
+    // So effectively: apple AND (banana OR cherry)
+    // Expected: rows with 'apple' AND ('banana' OR 'cherry') -> 1, 2
+    qt_lucene_min_should_match_1 """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple AND banana OR cherry', '{"default_field":"title","default_operator":"and","mode":"lucene","minimum_should_match":1}')
+        ORDER BY id
+    """
+
+    // ============ Test 7: Lucene mode NOT operator (pure negative query) ============
+    // 'NOT a' in Lucene mode produces a pure MUST_NOT query.
+    // IMPORTANT: In Lucene/ES semantics, a pure negative query (only MUST_NOT, no MUST/SHOULD)
+    // returns EMPTY results because there's no positive clause to match against.
+    // This is correct Lucene behavior - to get "all except X", you need:
+    //   match_all AND NOT X (i.e., a positive clause combined with negation)
+    // Expected: empty result (correct Lucene semantics)
+    qt_lucene_not """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('NOT apple', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 8: Lucene mode AND NOT ============
+    // 'a AND NOT b' in Lucene mode:
+    // - 'a' is MUST
+    // - 'NOT b' makes 'b' MUST_NOT
+    // Expected: rows with 'apple' but NOT 'banana'
+    qt_lucene_and_not """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple AND NOT banana', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 9: Lucene mode OR NOT ============
+    // 'a OR NOT b' in Lucene mode (here with explicit minimum_should_match=1):
+    // - 'a' is SHOULD
+    // - 'NOT b' makes 'b' MUST_NOT
+    // Expected: rows with 'apple' OR (NOT 'banana')
+    // NOTE(review): with 'apple' as the only SHOULD clause, minimum_should_match=1 and
+    // 'banana' as MUST_NOT, Lucene semantics would presumably yield rows that contain
+    // 'apple' and do not contain 'banana' (not a logical "apple OR NOT banana") -
+    // confirm the wording above against the recorded expected output.
+    qt_lucene_or_not """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple OR NOT banana', '{"default_field":"title","default_operator":"or","mode":"lucene","minimum_should_match":1}')
+        ORDER BY id
+    """
+
+    // ============ Test 10: Lucene mode only OR (SHOULD only) ============
+    // 'a OR b OR c' with only SHOULD clauses
+    // minimum_should_match defaults to 1 (at least one must match)
+    qt_lucene_or_only """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple OR date OR fig', '{"default_field":"title","default_operator":"or","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 11: Lucene mode cross-field query ============
+    // Multi-field query with Lucene mode.
+    // No default_field is passed here; each term carries an explicit field prefix.
+    qt_lucene_cross_field """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, category
+        FROM ${tableName}
+        WHERE search('title:apple AND category:fruit', '{"default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 12: Standard mode for comparison ============
+    // Same query in standard mode for comparison
+    qt_standard_cross_field """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, category
+        FROM ${tableName}
+        WHERE search('title:apple AND category:fruit')
+        ORDER BY id
+    """
+
+    // ============ Test 13: Lucene mode with phrase query ============
+    // A quoted phrase ("apple banana") searched against the default field 'title'.
+    qt_lucene_phrase """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('"apple banana"', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 14: Lucene mode with wildcard ============
+    // Two wildcard terms (app*, ban*) joined with AND against the default field 'title'.
+    qt_lucene_wildcard """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('app* AND ban*', '{"default_field":"title","default_operator":"and","mode":"lucene"}')
+        ORDER BY id
+    """
+
+    // ============ Test 15: Verify standard mode unchanged ============
+    // Ensure standard mode is not affected by the Lucene mode addition.
+    // This is the same query as Test 1, recorded under a separate tag.
+    qt_standard_unchanged """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('title:apple AND title:banana')
+        ORDER BY id
+    """
+
+    // ============ Test 16: Options without "mode" (should use standard mode) ============
+    // The options JSON sets default_field and default_operator but omits "mode",
+    // so Lucene mode is not requested.
+    qt_empty_options """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple AND banana', '{"default_field":"title","default_operator":"and"}')
+        ORDER BY id
+    """
+
+    // ============ Test 17: Lucene mode minimum_should_match=0 default behavior ============
+    // With minimum_should_match=0 (default in filter context), SHOULD clauses are discarded
+    // when MUST clauses exist.
+    // Same query shape as Test 5, with 'date' as the trailing OR term and
+    // minimum_should_match passed explicitly as 0.
+    qt_lucene_min_should_match_0 """
+        SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title
+        FROM ${tableName}
+        WHERE search('apple AND banana OR date', '{"default_field":"title","default_operator":"and","mode":"lucene","minimum_should_match":0}')
+        ORDER BY id
+    """
+
+    // Cleanup
+    sql "DROP TABLE IF EXISTS ${tableName}"
+}