Skip to content

Commit 74f8b18

Browse files
Backport ClickHouse#88401 to 25.8: Fix quadratic complexity in countMatches
1 parent 5d92144 commit 74f8b18

File tree

4 files changed

+162
-160
lines changed

4 files changed

+162
-160
lines changed

src/Functions/countMatches.cpp

Lines changed: 158 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,166 @@
1+
#include <Core/Settings.h>
2+
#include <Functions/IFunction.h>
13
#include <Functions/FunctionFactory.h>
2-
#include <Functions/countMatches.h>
4+
#include <Functions/FunctionHelpers.h>
5+
#include <Columns/ColumnFixedString.h>
6+
#include <Columns/ColumnString.h>
7+
#include <Columns/ColumnsNumber.h>
8+
#include <DataTypes/DataTypesNumber.h>
9+
#include <DataTypes/DataTypeString.h>
10+
#include <Functions/Regexps.h>
11+
#include <Interpreters/Context.h>
12+
13+
14+
namespace DB
15+
{
16+
17+
/// Error codes thrown by this file. They are only declared here (`extern`);
/// the definitions live outside this translation unit.
namespace ErrorCodes
18+
{
19+
extern const int ILLEGAL_COLUMN;
20+
}
21+
22+
/// Settings read by this file. Only declared here (`extern`); the value is looked up
/// through `context->getSettingsRef()` in the FunctionCountMatches constructor.
namespace Setting
23+
{
24+
extern const SettingsBool count_matches_stop_at_empty_match;
25+
}
326

427
namespace
528
{
629

30+
using Pos = const char *;
31+
32+
template <typename CountMatchesBase>
33+
class FunctionCountMatches : public IFunction
34+
{
35+
const bool count_matches_stop_at_empty_match;
36+
37+
public:
38+
static constexpr auto name = CountMatchesBase::name;
39+
static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionCountMatches<CountMatchesBase>>(context); }
40+
41+
explicit FunctionCountMatches(ContextPtr context)
42+
: count_matches_stop_at_empty_match(context->getSettingsRef()[Setting::count_matches_stop_at_empty_match])
43+
{
44+
}
45+
46+
String getName() const override { return name; }
47+
size_t getNumberOfArguments() const override { return 2; }
48+
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
49+
50+
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
51+
{
52+
FunctionArgumentDescriptors args
53+
{
54+
{"haystack", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isStringOrFixedString), nullptr, "String or FixedString"},
55+
{"pattern", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isString), isColumnConst, "constant String"}
56+
};
57+
validateFunctionArguments(*this, arguments, args);
58+
59+
return std::make_shared<DataTypeUInt64>();
60+
}
61+
62+
DataTypePtr getReturnTypeForDefaultImplementationForDynamic() const override
63+
{
64+
return std::make_shared<DataTypeUInt64>();
65+
}
66+
67+
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
68+
{
69+
const IColumn * col_pattern = arguments[1].column.get();
70+
const ColumnConst * col_pattern_const = checkAndGetColumnConst<ColumnString>(col_pattern);
71+
if (col_pattern_const == nullptr)
72+
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Pattern argument is not const");
73+
74+
const OptimizedRegularExpression re = Regexps::createRegexp</*is_like*/ false, /*no_capture*/ true, CountMatchesBase::case_insensitive>(col_pattern_const->getValue<String>());
75+
76+
const IColumn * col_haystack = arguments[0].column.get();
77+
OptimizedRegularExpression::MatchVec matches;
78+
79+
if (const ColumnConst * col_haystack_const = checkAndGetColumnConstStringOrFixedString(col_haystack))
80+
{
81+
std::string_view str = col_haystack_const->getDataColumn().getDataAt(0).toView();
82+
uint64_t matches_count = countMatches(str, re, matches);
83+
return result_type->createColumnConst(input_rows_count, matches_count);
84+
}
85+
if (const ColumnString * col_haystack_string = checkAndGetColumn<ColumnString>(col_haystack))
86+
{
87+
auto col_res = ColumnUInt64::create();
88+
89+
const ColumnString::Chars & src_chars = col_haystack_string->getChars();
90+
const ColumnString::Offsets & src_offsets = col_haystack_string->getOffsets();
91+
92+
ColumnUInt64::Container & vec_res = col_res->getData();
93+
vec_res.resize(input_rows_count);
94+
95+
ColumnString::Offset current_src_offset = 0;
96+
97+
for (size_t i = 0; i < input_rows_count; ++i)
98+
{
99+
Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]);
100+
current_src_offset = src_offsets[i];
101+
Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]);
102+
103+
std::string_view str(pos, end - pos);
104+
vec_res[i] = countMatches(str, re, matches);
105+
}
106+
107+
return col_res;
108+
}
109+
if (const ColumnFixedString * col_haystack_fixedstring = checkAndGetColumn<ColumnFixedString>(col_haystack))
110+
{
111+
auto col_res = ColumnUInt64::create();
112+
113+
ColumnUInt64::Container & vec_res = col_res->getData();
114+
vec_res.resize(input_rows_count);
115+
116+
for (size_t i = 0; i < input_rows_count; ++i)
117+
{
118+
std::string_view str = col_haystack_fixedstring->getDataAt(i).toView();
119+
vec_res[i] = countMatches(str, re, matches);
120+
}
121+
122+
return col_res;
123+
}
124+
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Could not cast haystack argument to String or FixedString");
125+
}
126+
127+
uint64_t countMatches(std::string_view src, const OptimizedRegularExpression & re, OptimizedRegularExpression::MatchVec & matches) const
128+
{
129+
/// Only one match is required, no need to copy more.
130+
static const unsigned matches_limit = 1;
131+
132+
Pos pos = reinterpret_cast<Pos>(src.data());
133+
Pos end = reinterpret_cast<Pos>(src.data() + src.size());
134+
135+
uint64_t match_count = 0;
136+
while (pos < end)
137+
{
138+
if (re.match(pos, end - pos, matches, matches_limit))
139+
{
140+
if (matches[0].length > 0)
141+
{
142+
pos += matches[0].offset + matches[0].length;
143+
++match_count;
144+
}
145+
else
146+
{
147+
if (count_matches_stop_at_empty_match)
148+
/// Progress should be made, but with empty match the progress will not be done.
149+
break;
150+
151+
/// Progress is made by a single character in case the pattern does not match or have zero-byte match.
152+
/// The reason is simply because the pattern could match another part of input when forwarded.
153+
++pos;
154+
}
155+
}
156+
else
157+
break;
158+
}
159+
160+
return match_count;
161+
}
162+
};
163+
7164
struct FunctionCountMatchesCaseSensitive
8165
{
9166
static constexpr auto name = "countMatches";
@@ -17,9 +174,6 @@ struct FunctionCountMatchesCaseInsensitive
17174

18175
}
19176

20-
namespace DB
21-
{
22-
23177
REGISTER_FUNCTION(CountMatches)
24178
{
25179
factory.registerFunction<FunctionCountMatches<FunctionCountMatchesCaseSensitive>>();

src/Functions/countMatches.h

Lines changed: 0 additions & 156 deletions
This file was deleted.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
0
2+
1000000
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
-- Test for the quadratic-complexity fix in countMatches: a haystack of ten '\0' escapes
-- repeated 1000000 times that contains no 'a' at all.
SELECT countMatches(repeat('\0\0\0\0\0\0\0\0\0\0', 1000000), 'a');
2+
-- Same shape, but every repetition ends with a single 'a'.
SELECT countMatches(repeat('\0\0\0\0\0\0\0\0\0\0a', 1000000), 'a');

0 commit comments

Comments
 (0)