1+ #include < Core/Settings.h>
2+ #include < Functions/IFunction.h>
13#include < Functions/FunctionFactory.h>
2- #include < Functions/countMatches.h>
4+ #include < Functions/FunctionHelpers.h>
5+ #include < Columns/ColumnFixedString.h>
6+ #include < Columns/ColumnString.h>
7+ #include < Columns/ColumnsNumber.h>
8+ #include < DataTypes/DataTypesNumber.h>
9+ #include < DataTypes/DataTypeString.h>
10+ #include < Functions/Regexps.h>
11+ #include < Interpreters/Context.h>
12+
13+
14+ namespace DB
15+ {
16+
/// Error codes used by the exceptions thrown in this file.
/// Only declared here (`extern`); the definitions are not in this file.
17+ namespace ErrorCodes
18+ {
/// Thrown when the pattern column is not constant or the haystack column has an unsupported type.
19+ extern const int ILLEGAL_COLUMN;
20+ }
21+
/// Settings read by this file.
/// Only declared here (`extern`); the definitions are not in this file.
22+ namespace Setting
23+ {
/// Boolean setting read once in the FunctionCountMatches constructor; when true, counting stops at the first empty match.
24+ extern const SettingsBool count_matches_stop_at_empty_match;
25+ }
326
427namespace
528{
629
/// Read-only pointer into haystack bytes; used as the begin/end cursor type while scanning for matches.
30+ using Pos = const char *;
31+
32+ template <typename CountMatchesBase>
33+ class FunctionCountMatches : public IFunction
34+ {
35+ const bool count_matches_stop_at_empty_match;
36+
37+ public:
38+ static constexpr auto name = CountMatchesBase::name;
39+ static FunctionPtr create (ContextPtr context) { return std::make_shared<FunctionCountMatches<CountMatchesBase>>(context); }
40+
41+ explicit FunctionCountMatches (ContextPtr context)
42+ : count_matches_stop_at_empty_match(context->getSettingsRef ()[Setting::count_matches_stop_at_empty_match])
43+ {
44+ }
45+
46+ String getName () const override { return name; }
47+ size_t getNumberOfArguments () const override { return 2 ; }
48+ bool isSuitableForShortCircuitArgumentsExecution (const DataTypesWithConstInfo & /* arguments*/ ) const override { return true ; }
49+
50+ DataTypePtr getReturnTypeImpl (const ColumnsWithTypeAndName & arguments) const override
51+ {
52+ FunctionArgumentDescriptors args
53+ {
54+ {" haystack" , static_cast <FunctionArgumentDescriptor::TypeValidator>(&isStringOrFixedString), nullptr , " String or FixedString" },
55+ {" pattern" , static_cast <FunctionArgumentDescriptor::TypeValidator>(&isString), isColumnConst, " constant String" }
56+ };
57+ validateFunctionArguments (*this , arguments, args);
58+
59+ return std::make_shared<DataTypeUInt64>();
60+ }
61+
62+ DataTypePtr getReturnTypeForDefaultImplementationForDynamic () const override
63+ {
64+ return std::make_shared<DataTypeUInt64>();
65+ }
66+
67+ ColumnPtr executeImpl (const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
68+ {
69+ const IColumn * col_pattern = arguments[1 ].column .get ();
70+ const ColumnConst * col_pattern_const = checkAndGetColumnConst<ColumnString>(col_pattern);
71+ if (col_pattern_const == nullptr )
72+ throw Exception (ErrorCodes::ILLEGAL_COLUMN, " Pattern argument is not const" );
73+
74+ const OptimizedRegularExpression re = Regexps::createRegexp</* is_like*/ false , /* no_capture*/ true , CountMatchesBase::case_insensitive>(col_pattern_const->getValue <String>());
75+
76+ const IColumn * col_haystack = arguments[0 ].column .get ();
77+ OptimizedRegularExpression::MatchVec matches;
78+
79+ if (const ColumnConst * col_haystack_const = checkAndGetColumnConstStringOrFixedString (col_haystack))
80+ {
81+ std::string_view str = col_haystack_const->getDataColumn ().getDataAt (0 ).toView ();
82+ uint64_t matches_count = countMatches (str, re, matches);
83+ return result_type->createColumnConst (input_rows_count, matches_count);
84+ }
85+ if (const ColumnString * col_haystack_string = checkAndGetColumn<ColumnString>(col_haystack))
86+ {
87+ auto col_res = ColumnUInt64::create ();
88+
89+ const ColumnString::Chars & src_chars = col_haystack_string->getChars ();
90+ const ColumnString::Offsets & src_offsets = col_haystack_string->getOffsets ();
91+
92+ ColumnUInt64::Container & vec_res = col_res->getData ();
93+ vec_res.resize (input_rows_count);
94+
95+ ColumnString::Offset current_src_offset = 0 ;
96+
97+ for (size_t i = 0 ; i < input_rows_count; ++i)
98+ {
99+ Pos pos = reinterpret_cast <Pos>(&src_chars[current_src_offset]);
100+ current_src_offset = src_offsets[i];
101+ Pos end = reinterpret_cast <Pos>(&src_chars[current_src_offset]);
102+
103+ std::string_view str (pos, end - pos);
104+ vec_res[i] = countMatches (str, re, matches);
105+ }
106+
107+ return col_res;
108+ }
109+ if (const ColumnFixedString * col_haystack_fixedstring = checkAndGetColumn<ColumnFixedString>(col_haystack))
110+ {
111+ auto col_res = ColumnUInt64::create ();
112+
113+ ColumnUInt64::Container & vec_res = col_res->getData ();
114+ vec_res.resize (input_rows_count);
115+
116+ for (size_t i = 0 ; i < input_rows_count; ++i)
117+ {
118+ std::string_view str = col_haystack_fixedstring->getDataAt (i).toView ();
119+ vec_res[i] = countMatches (str, re, matches);
120+ }
121+
122+ return col_res;
123+ }
124+ throw Exception (ErrorCodes::ILLEGAL_COLUMN, " Could not cast haystack argument to String or FixedString" );
125+ }
126+
127+ uint64_t countMatches (std::string_view src, const OptimizedRegularExpression & re, OptimizedRegularExpression::MatchVec & matches) const
128+ {
129+ // / Only one match is required, no need to copy more.
130+ static const unsigned matches_limit = 1 ;
131+
132+ Pos pos = reinterpret_cast <Pos>(src.data ());
133+ Pos end = reinterpret_cast <Pos>(src.data () + src.size ());
134+
135+ uint64_t match_count = 0 ;
136+ while (pos < end)
137+ {
138+ if (re.match (pos, end - pos, matches, matches_limit))
139+ {
140+ if (matches[0 ].length > 0 )
141+ {
142+ pos += matches[0 ].offset + matches[0 ].length ;
143+ ++match_count;
144+ }
145+ else
146+ {
147+ if (count_matches_stop_at_empty_match)
148+ // / Progress should be made, but with empty match the progress will not be done.
149+ break ;
150+
151+ // / Progress is made by a single character in case the pattern does not match or have zero-byte match.
152+ // / The reason is simply because the pattern could match another part of input when forwarded.
153+ ++pos;
154+ }
155+ }
156+ else
157+ break ;
158+ }
159+
160+ return match_count;
161+ }
162+ };
163+
7164struct FunctionCountMatchesCaseSensitive
8165{
9166 static constexpr auto name = " countMatches" ;
@@ -17,9 +174,6 @@ struct FunctionCountMatchesCaseInsensitive
17174
18175}
19176
20- namespace DB
21- {
22-
23177REGISTER_FUNCTION (CountMatches)
24178{
25179 factory.registerFunction <FunctionCountMatches<FunctionCountMatchesCaseSensitive>>();
0 commit comments