diff --git a/.github/actions/action-pr-title b/.github/actions/action-pr-title
index 10f7ff082a0f12..077bddd7bdabd0 160000
--- a/.github/actions/action-pr-title
+++ b/.github/actions/action-pr-title
@@ -1 +1 @@
-Subproject commit 10f7ff082a0f1239f8cc39ccba39d11f32ca2407
+Subproject commit 077bddd7bdabd0d2b1b25ed0754c7e62e184d7ee
diff --git a/.github/actions/ccache-action b/.github/actions/ccache-action
index ca3acd2731eef1..5ebbd400eff9e7 160000
--- a/.github/actions/ccache-action
+++ b/.github/actions/ccache-action
@@ -1 +1 @@
-Subproject commit ca3acd2731eef11f1572ccb126356c2f9298d35e
+Subproject commit 5ebbd400eff9e74630f759d94ddd7b6c26299639
diff --git a/.github/actions/get-workflow-origin b/.github/actions/get-workflow-origin
index e2dae063368361..3778755869bc9c 160000
--- a/.github/actions/get-workflow-origin
+++ b/.github/actions/get-workflow-origin
@@ -1 +1 @@
-Subproject commit e2dae063368361e4cd1f510e8785cd73bca9352e
+Subproject commit 3778755869bc9ca829e7b45b5d179fa000f97b44
diff --git a/.github/actions/paths-filter b/.github/actions/paths-filter
index 4512585405083f..668c092af3649c 160000
--- a/.github/actions/paths-filter
+++ b/.github/actions/paths-filter
@@ -1 +1 @@
-Subproject commit 4512585405083f25c027a35db413c2b3b9006d50
+Subproject commit 668c092af3649c4b664c54e4b704aa46782f6f7c
diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp
index 01524ccf0ed0a9..1acbbcb24d21b2 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -1290,6 +1290,82 @@ struct StringAppendTrailingCharIfAbsent {
     }
 };
 
+struct HammingDistanceImpl {
+    static constexpr auto name = "hamming_distance";
+    using Chars = ColumnString::Chars;
+    using Offsets = ColumnString::Offsets;
+    using ReturnType = DataTypeInt64;
+    using ColumnType = ColumnInt64;
+
+    // Counts the byte positions at which two equal-length strings differ. The comparison is
+    // byte-wise: a multi-byte UTF-8 character contributes one per differing byte.
+    static Int64 calculate_hamming_distance(const StringRef& str1, const StringRef& str2) {
+        DCHECK_EQ(str1.size, str2.size);
+        Int64 distance = 0;
+        for (size_t i = 0; i < str1.size; ++i) {
+            distance += (str1.data[i] != str2.data[i]) ? 1 : 0;
+        }
+        return distance;
+    }
+
+    // vector_vector: both arguments are columns.
+    // NOTE(review): null_map_data is accepted by all three entry points but never read or
+    // written, so every row is length-checked. Confirm against the caller that a row with a
+    // NULL input cannot reach the check with a mismatched placeholder value.
+    static void vector_vector(FunctionContext* context, const Chars& ldata, const Offsets& loffsets,
+                              const Chars& rdata, const Offsets& roffsets,
+                              PaddedPODArray<Int64>& res, NullMap& null_map_data) {
+        DCHECK_EQ(loffsets.size(), roffsets.size());
+        const size_t input_rows_count = loffsets.size();
+        res.resize(input_rows_count);
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            res[i] = checked_distance(row_at(ldata, loffsets, i), row_at(rdata, roffsets, i));
+        }
+    }
+
+    // vector_scalar: first argument is a column, second is a constant.
+    static void vector_scalar(FunctionContext* context, const Chars& ldata, const Offsets& loffsets,
+                              const StringRef& rstr, PaddedPODArray<Int64>& res,
+                              NullMap& null_map_data) {
+        const size_t input_rows_count = loffsets.size();
+        res.resize(input_rows_count);
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            res[i] = checked_distance(row_at(ldata, loffsets, i), rstr);
+        }
+    }
+
+    // scalar_vector: first argument is a constant, second is a column.
+    static void scalar_vector(FunctionContext* context, const StringRef& lstr, const Chars& rdata,
+                              const Offsets& roffsets, PaddedPODArray<Int64>& res,
+                              NullMap& null_map_data) {
+        const size_t input_rows_count = roffsets.size();
+        res.resize(input_rows_count);
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            res[i] = checked_distance(lstr, row_at(rdata, roffsets, i));
+        }
+    }
+
+private:
+    // Returns a view of row i: bytes [offsets[i - 1], offsets[i]) of data. For i == 0 this reads
+    // offsets[i - 1]; NOTE(review): assumes Offsets permits index -1 and yields 0 -- confirm.
+    static StringRef row_at(const Chars& data, const Offsets& offsets, size_t i) {
+        return StringRef(reinterpret_cast<const char*>(&data[offsets[i - 1]]),
+                         offsets[i] - offsets[i - 1]);
+    }
+
+    // Enforces the equal-length contract, then delegates to calculate_hamming_distance.
+    // Throws doris::Exception(INVALID_ARGUMENT) when the two lengths differ.
+    static Int64 checked_distance(const StringRef& lstr, const StringRef& rstr) {
+        if (lstr.size != rstr.size) {
+            throw doris::Exception(
+                    ErrorCode::INVALID_ARGUMENT,
+                    "hamming_distance: input strings must have equal length, got {} and {}",
+                    lstr.size, rstr.size);
+        }
+        return calculate_hamming_distance(lstr, rstr);
+    }
+};
+
 struct StringLPad {
     static constexpr auto name = "lpad";
     static constexpr auto is_lpad = true;
@@ -1342,7 +1418,7 @@ using FunctionFromBase64 = FunctionStringOperateToNullType;
 
 using FunctionStringAppendTrailingCharIfAbsent =
         FunctionBinaryStringOperateToNullType;
-
+using FunctionHammingDistance = FunctionBinaryStringOperateToNullType<HammingDistanceImpl>;
 using FunctionStringLPad = FunctionStringPad;
 using FunctionStringRPad = FunctionStringPad;
 
@@ -1440,6 +1516,7 @@ void register_function_string(SimpleFunctionFactory& factory) {
     factory.register_function>();
     factory.register_function();
     factory.register_function();
+    factory.register_function<FunctionHammingDistance>();
     factory.register_function();
     factory.register_function();
     factory.register_function();
diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp
index 97e6f47ef957bd..9f5e2aa4bb27d1 100644
--- a/be/test/vec/function/function_string_test.cpp
+++ 
b/be/test/vec/function/function_string_test.cpp
@@ -3869,4 +3869,33 @@ TEST(function_string_test, function_unicode_normalize_invalid_mode) {
     EXPECT_NE(Status::OK(), st);
 }
+
+TEST(function_string_test, function_hamming_distance_test) {
+    std::string func_name = "hamming_distance";
+
+    {
+        InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR, PrimitiveType::TYPE_VARCHAR};
+
+        DataSet data_set = {
+                // Same strings - distance 0
+                {{std::string("abc"), std::string("abc")}, std::int64_t(0)},
+                {{std::string(""), std::string("")}, std::int64_t(0)},
+                {{std::string("hello"), std::string("hello")}, std::int64_t(0)},
+
+                // Different strings - distance > 0
+                {{std::string("abc"), std::string("axc")}, std::int64_t(1)},
+                {{std::string("abc"), std::string("xyz")}, std::int64_t(3)},
+                {{std::string("hello"), std::string("hallo")}, std::int64_t(1)},
+                {{std::string("test"), std::string("text")}, std::int64_t(1)},
+                {{std::string("abcd"), std::string("abed")}, std::int64_t(1)},
+
+                // NULL inputs
+                {{Null(), std::string("abc")}, Null()},
+                {{std::string("abc"), Null()}, Null()},
+                {{Null(), Null()}, Null()},
+        };
+
+        check_function_all_arg_comb<DataTypeInt64, true>(func_name, input_types, data_set);
+    }
+}
 
 } // namespace doris::vectorized
diff --git a/contrib/clucene b/contrib/clucene
index 8b57674e9d7876..a8d1f58f393ef3 160000
--- a/contrib/clucene
+++ b/contrib/clucene
@@ -1 +1 @@
-Subproject commit 8b57674e9d78769b10aa0c1441cd12671a394745
+Subproject commit a8d1f58f393ef3ed13cedf82c77a3581ab5d57ef
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java
new file mode 100644
index 00000000000000..45125f630b1256
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import org.apache.doris.nereids.trees.expressions.functions.AlwaysNullable;
+import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.BigIntType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'hamming_distance': takes two string arguments and returns a BIGINT.
+ * The result is always nullable.
+ */
+public class HammingDistance extends ScalarFunction
+        implements BinaryExpression, ExplicitlyCastableSignature, AlwaysNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            FunctionSignature.ret(BigIntType.INSTANCE)
+                    .args(VarcharType.SYSTEM_DEFAULT, VarcharType.SYSTEM_DEFAULT),
+            FunctionSignature.ret(BigIntType.INSTANCE)
+                    .args(StringType.INSTANCE, StringType.INSTANCE));
+
+    /**
+     * constructor with 2 arguments.
+     */
+    public HammingDistance(Expression arg0, Expression arg1) {
+        super("hamming_distance", arg0, arg1);
+    }
+
+    /** constructor for withChildren and reuse signature */
+    private HammingDistance(ScalarFunctionParams functionParams) {
+        super(functionParams);
+    }
+
+    /**
+     * withChildren.
+     */
+    @Override
+    public HammingDistance withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 2);
+        return new HammingDistance(getFunctionParams(children));
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitHammingDistance(this, context);
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
index 0b0c85d34c5a63..dd74fd0b78ae8f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
@@ -78,6 +78,7 @@
 import org.apache.doris.nereids.trees.expressions.SlotReference;
 import org.apache.doris.nereids.trees.expressions.SubqueryExpr;
 import org.apache.doris.nereids.trees.expressions.Subtract;
+import org.apache.doris.nereids.trees.expressions.TimestampArithmetic;
 import org.apache.doris.nereids.trees.expressions.TryCast;
 import 
org.apache.doris.nereids.trees.expressions.UnaryArithmetic;
 import org.apache.doris.nereids.trees.expressions.UnaryOperator;
@@ -147,6 +148,14 @@ public R visitScalarFunction(ScalarFunction scalarFunction, C context) {
         return visitBoundFunction(scalarFunction, context);
     }
 
+    public R visitToSeconds(org.apache.doris.nereids.trees.expressions.functions.scalar.ToSeconds toSeconds, C context) {
+        return visitScalarFunction(toSeconds, context);
+    }
+
+    public R visitUnicodeNormalize(org.apache.doris.nereids.trees.expressions.functions.scalar.UnicodeNormalize unicodeNormalize, C context) {
+        return visitScalarFunction(unicodeNormalize, context);
+    }
+
     public R visitSearchExpression(SearchExpression searchExpression, C context) {
         return visit(searchExpression, context);
     }
@@ -455,6 +464,10 @@ public R visitSubqueryExpr(SubqueryExpr subqueryExpr, C context) {
         return visit(subqueryExpr, context);
     }
 
+    public R visitTimestampArithmetic(TimestampArithmetic arithmetic, C context) {
+        return visit(arithmetic, context);
+    }
+
     public R visitScalarSubquery(ScalarSubquery scalar, C context) {
         return visitSubqueryExpr(scalar, context);
     }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index c1db9cbd2888a2..7aebda7711698c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -237,6 +237,7 @@
 import org.apache.doris.nereids.trees.expressions.functions.scalar.GetFormat;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.GetVariantType;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Greatest;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.HammingDistance;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Hex;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.HllCardinality;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.HllEmpty;
@@ -2505,6 +2506,10 @@ default R visitStrcmp(Strcmp strcmp, C context) {
         return visitScalarFunction(strcmp, context);
     }
 
+    default R visitHammingDistance(HammingDistance hammingDistance, C context) {
+        return visitScalarFunction(hammingDistance, context);
+    }
+
     default R visitStripNullValue(StripNullValue stripNullValue, C context) {
         return visitScalarFunction(stripNullValue, context);
     }
diff --git a/fe/pom.xml b/fe/pom.xml
index 3691d10340ff06..1203e87355e765 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -122,6 +122,17 @@ under the License.
 prepare-agent
+
+ report
+ test
+
+ report
+
+
+ ${project.build.directory}/jacoco.exec
+ ${project.build.directory}/site/jacoco
+
+
@@ -202,6 +213,17 @@ under the License.
prepare-agent
+
+ report
+ test
+
+ report
+
+
+ ${project.build.directory}/jacoco.exec
+ ${project.build.directory}/site/jacoco
+
+
diff --git a/regression-test/data/query_p0/sql_functions/test_hamming_distance.out b/regression-test/data/query_p0/sql_functions/test_hamming_distance.out
new file mode 100644
index 00000000000000..50ddfab0022dbd
--- /dev/null
+++ b/regression-test/data/query_p0/sql_functions/test_hamming_distance.out
@@ -0,0 +1,87 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !empty_not_nullable --
+
+-- !nullable --
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+1
+1
+2
+2
+3
+3
+3
+3
+4
+4
+
+-- !const_not_nullable --
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+
+-- !const_other_not_nullable --
+0
+0
+0
+
+-- !const_same_length --
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+
+-- !const_same_length_diff --
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+
+-- !const_same_length_all_diff --
+3
+3
+3
+3
+3
+3
+3
+3
+3
+3
+
+-- !null_left --
+\N
+
+-- !null_right --
+\N
+
+-- !null_both --
+\N
+
diff --git a/regression-test/suites/query_p0/sql_functions/test_hamming_distance.groovy b/regression-test/suites/query_p0/sql_functions/test_hamming_distance.groovy
new file mode 100644
index 00000000000000..271c0a4702d76c
--- /dev/null
+++ b/regression-test/suites/query_p0/sql_functions/test_hamming_distance.groovy
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hamming_distance") {
+    // This table does nothing by itself; it just makes the cross-join query below easier to generate.
+    sql " drop table if exists hits_hamming_distance "
+    sql """ create table hits_hamming_distance(
+            nothing boolean
+            )
+            properties("replication_num" = "1");
+    """
+    sql "insert into hits_hamming_distance values(true);"
+
+    sql " drop table if exists arg1_hamming_distance"
+    sql """
+        create table arg1_hamming_distance (
+            k0 int,
+            a varchar(100) not null,
+            b varchar(100) not null,
+        )
+        DISTRIBUTED BY HASH(k0)
+        PROPERTIES
+        (
+            "replication_num" = "1"
+        );
+    """
+
+    order_qt_empty_not_nullable "select hamming_distance(a, a) from arg1_hamming_distance"
+
+    sql """ insert into arg1_hamming_distance values
+        (1, 'abc', 'abc'),
+        (2, 'abc', 'axc'),
+        (3, 'abc', 'xyz'),
+        (4, 'hello', 'hallo'),
+        (5, 'test', 'text'),
+        (6, '', ''),
+        (7, 'a', 'a'),
+        (8, 'abcd', 'abce'),
+        (9, 'ab', 'ac'),
+        (10, 'hi', 'ho');
+    """
+
+    /// all pairs of equal length (hamming_distance raises an error on unequal lengths)
+    order_qt_nullable """
+        SELECT hamming_distance(t.arg1_hamming_distance, t.ARG2) as result
+        FROM (
+            SELECT hits_hamming_distance.nothing, TABLE1.arg1_hamming_distance, TABLE1.order1, TABLE2.ARG2, TABLE2.order2
+            FROM hits_hamming_distance
+            CROSS JOIN (
+                SELECT b as arg1_hamming_distance, k0 as order1
+                FROM arg1_hamming_distance
+            ) as TABLE1
+            CROSS JOIN (
+                SELECT b as ARG2, k0 as order2
+                FROM arg1_hamming_distance
+            ) as TABLE2
+            WHERE length(TABLE1.arg1_hamming_distance) = length(TABLE2.ARG2)
+        )t;
+    """
+
+    /// consts. most by BE-UT
+    order_qt_const_not_nullable "select hamming_distance('abc', 'abc') from arg1_hamming_distance"
+    order_qt_const_other_not_nullable "select hamming_distance(a, 'abc') from arg1_hamming_distance where length(a) = 3"
+
+    /// Test same length strings
+    order_qt_const_same_length "select hamming_distance('abc', 'abc') from arg1_hamming_distance"
+    order_qt_const_same_length_diff "select hamming_distance('abc', 'axc') from arg1_hamming_distance"
+    order_qt_const_same_length_all_diff "select hamming_distance('abc', 'xyz') from arg1_hamming_distance"
+
+    /// NULL inputs yield NULL
+    qt_null_left "select hamming_distance(NULL, 'abc')"
+    qt_null_right "select hamming_distance('abc', NULL)"
+    qt_null_both "select hamming_distance(NULL, NULL)"
+
+    /// Test exception cases - unequal lengths
+    test {
+        sql "select hamming_distance('abc', 'abcd')"
+        exception "hamming_distance: input strings must have equal length"
+    }
+
+    test {
+        sql "select hamming_distance('hello', 'hi')"
+        exception "hamming_distance: input strings must have equal length"
+    }
+
+    test {
+        sql "select hamming_distance('', 'abc')"
+        exception "hamming_distance: input strings must have equal length"
+    }
+
+    /// folding
+    check_fold_consistency "hamming_distance('abc', 'abc')"
+    check_fold_consistency "hamming_distance('abc', 'axc')"
+    check_fold_consistency "hamming_distance('', '')"
+    check_fold_consistency "hamming_distance('hello', 'hallo')"
+}