Commit ff0abcb

Merge pull request ClickHouse#79517 from scanhex12/improve_sparse_grams
Improve sparse grams function
2 parents: 1a9e19e + 497fe8b

File tree: 3 files changed (+178, -116 lines)


src/Functions/sparseGrams.cpp

Lines changed: 147 additions & 76 deletions
@@ -3,10 +3,10 @@
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionHelpers.h>
 #include <Functions/FunctionTokens.h>
+#include <Common/UTF8Helpers.h>
 #include <Common/Exception.h>
-
-#include <zlib.h>
-#include <Poco/UTF8Encoding.h>
+#include <base/types.h>
+#include <Common/HashTable/Hash.h>

 namespace DB
 {
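The include changes drop the zlib and Poco dependencies: hashing now goes through ClickHouse's updateWeakHash32 from Common/HashTable/Hash.h, and UTF-8 handling through Common/UTF8Helpers.h. Below is a minimal standalone sketch of the pointer-plus-length functor shape that the new CRC32CHasher (introduced in the next hunk) follows; std::hash is a stand-in here, since updateWeakHash32 exists only inside the ClickHouse tree.

#include <cstddef>
#include <functional>
#include <iostream>
#include <string_view>

/// Same call shape as CRC32CHasher: (pointer, length) -> size_t.
/// std::hash stands in for ClickHouse's updateWeakHash32.
struct StandInHasher
{
    size_t operator()(const char * data, size_t length) const
    {
        return std::hash<std::string_view>{}(std::string_view(data, length));
    }
};

int main()
{
    StandInHasher hasher;
    const char * s = "sparse";
    std::cout << hasher(s, 3) << '\n'; /// hash of the 3-byte prefix "spa"
}

Keeping the hasher as a functor lets it be stored as a member and passed to call sites, instead of repeating a free-function call with an explicit seed everywhere.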
@@ -27,83 +27,164 @@ extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
 namespace
 {

+struct CRC32CHasher
+{
+    size_t operator()(const char * data, size_t length) const
+    {
+        return updateWeakHash32(reinterpret_cast<const UInt8 *>(data), length, 0);
+    }
+};
+
 using Pos = const char *;

 template <bool is_utf8>
 class SparseGramsImpl
 {
 private:
+    CRC32CHasher hasher;
+
     Pos pos;
     Pos end;
-    std::vector<UInt32> ngram_hashes;
-    std::vector<size_t> utf8_offsets;
-    size_t left;
-    size_t right;
     UInt64 min_ngram_length = 3;
     UInt64 max_ngram_length = 100;

-    void buildNgramHashes()
+    /// Current batch of answers. The size of `result` cannot be greater than the size of `convex_hull`.
+    /// The size of `convex_hull` should not be large; see the comment on `convex_hull` for details.
+    std::vector<std::pair<size_t, size_t>> result;
+    size_t iter_result = 0;
+
+    struct PositionAndHash
+    {
+        size_t position;
+        size_t left_ngram_position;
+        size_t symbol_index;
+        size_t hash;
+    };
+
+    class NGramSymbolIterator
     {
-        if constexpr (is_utf8)
+    public:
+        NGramSymbolIterator() = default;
+
+        NGramSymbolIterator(Pos data_, Pos end_, size_t n_)
+            : data(data_), end(end_), n(n_)
+        {
+        }
+
+        void increment()
         {
-            Poco::UTF8Encoding encoder{};
-            size_t byte_offset = 0;
-            while (pos + byte_offset < end)
+            right_iterator = getNextPosition(right_iterator);
+
+            if (++num_increments >= n)
+                left_iterator = getNextPosition(left_iterator);
+        }
+
+        bool isEnd() const
+        {
+            return data + right_iterator >= end;
+        }
+
+        std::pair<size_t, size_t> getNGramPositions() const
+        {
+            return {left_iterator, right_iterator};
+        }
+
+        size_t getRightSymbol() const
+        {
+            return num_increments;
+        }
+
+        size_t getNextPosition(size_t iterator) const
+        {
+            if constexpr (is_utf8)
+                return iterator + UTF8::seqLength(data[iterator]);
+            else
+                return iterator + 1;
+        }
+
+    private:
+        Pos data;
+        Pos end;
+        size_t n;
+        size_t right_iterator = 0;
+        size_t left_iterator = 0;
+        size_t num_increments = 0;
+    };
+
+    /// The convex hull contains the maximum values of the suffixes that start from the current right iterator.
+    /// For example, if we have n-gram hashes like [1,5,2,4,1,3] and the current right position is 4 (the last one),
+    /// then our convex hull consists of the elements:
+    /// [{position:1, hash:5}, {position:3, hash:4}, {position:4, hash:1}]
+    /// Assuming that hashes are uniformly distributed, the expected size of `convex_hull` is N^{1/3},
+    /// where N is the length of the string.
+    /// Proof: https://math.stackexchange.com/questions/3469295/expected-number-of-vertices-in-a-convex-hull
+    std::vector<PositionAndHash> convex_hull;
+    NGramSymbolIterator symbol_iterator;
+
+    /// Get the next batch of answers. Returns false if there can be no more answers.
+    bool consume()
+    {
+        if (symbol_iterator.isEnd())
+            return false;
+
+        auto [ngram_left_position, right_position] = symbol_iterator.getNGramPositions();
+        size_t right_symbol_index = symbol_iterator.getRightSymbol();
+        size_t next_right_position = symbol_iterator.getNextPosition(right_position);
+        size_t right_border_ngram_hash = hasher(pos + ngram_left_position, next_right_position - ngram_left_position);
+
+        while (!convex_hull.empty() && convex_hull.back().hash < right_border_ngram_hash)
+        {
+            size_t possible_left_position = convex_hull.back().left_ngram_position;
+            size_t possible_left_symbol_index = convex_hull.back().symbol_index;
+            size_t length = right_symbol_index - possible_left_symbol_index + 2;
+            if (length > max_ngram_length)
             {
-                utf8_offsets.push_back(byte_offset);
-                auto len = encoder.sequenceLength(reinterpret_cast<const unsigned char *>(pos + byte_offset), end - pos - byte_offset);
-                if (len < 1)
-                    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect utf8 symbol");
-                byte_offset += len;
+                /// If the length exceeds the limit at the current right position, it will also exceed it at future right positions, so we can just delete all remaining entries.
+                convex_hull.clear();
+                break;
             }
-            if (pos + byte_offset != end)
-                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect utf8 symbol");
-
-            utf8_offsets.push_back(byte_offset);
-
-            if (utf8_offsets.size() >= min_ngram_length)
-                ngram_hashes.reserve(utf8_offsets.size() - min_ngram_length + 1);
-            for (size_t i = 0; i + min_ngram_length - 1 < utf8_offsets.size(); ++i)
-                ngram_hashes.push_back(crc32_z(
-                    0UL,
-                    reinterpret_cast<const unsigned char *>(pos + utf8_offsets[i]),
-                    utf8_offsets[i + min_ngram_length - 1] - utf8_offsets[i]));
+            result.push_back({possible_left_position, next_right_position});
+            convex_hull.pop_back();
         }
-        else
+
+        if (!convex_hull.empty())
         {
-            if (pos + min_ngram_length <= end)
-                ngram_hashes.reserve(end - pos - min_ngram_length + 1);
-            for (size_t i = 0; pos + i + min_ngram_length - 2 < end; ++i)
-                ngram_hashes.push_back(crc32_z(0L, reinterpret_cast<const unsigned char *>(pos + i), min_ngram_length - 1));
+            size_t possible_left_position = convex_hull.back().left_ngram_position;
+            size_t possible_left_symbol_index = convex_hull.back().symbol_index;
+            size_t length = right_symbol_index - possible_left_symbol_index + 2;
+            if (length <= max_ngram_length)
+                result.push_back({possible_left_position, next_right_position});
         }
+
+        /// There should not be identical hashes in the convex hull. If there are, we keep only the last one.
+        while (!convex_hull.empty() && convex_hull.back().hash == right_border_ngram_hash)
+            convex_hull.pop_back();
+
+        convex_hull.push_back(PositionAndHash{
+            .position = right_position,
+            .left_ngram_position = ngram_left_position,
+            .symbol_index = right_symbol_index,
+            .hash = right_border_ngram_hash
+        });
+        symbol_iterator.increment();
+        return true;
     }

     std::optional<std::pair<size_t, size_t>> getNextIndices()
     {
-        chassert(right > left);
-        while (left < ngram_hashes.size())
+        if (result.size() <= iter_result)
         {
-            while (right < ngram_hashes.size() && right <= left + max_ngram_length - min_ngram_length + 1)
-            {
-                if (right > left + 1)
-                {
-                    if (ngram_hashes[left] < ngram_hashes[right - 1])
-                        break;
-
-                    if (ngram_hashes[right] < ngram_hashes[right - 1])
-                    {
-                        ++right;
-                        continue;
-                    }
-                }
-
-                return {{left, right++}};
-            }
-            ++left;
-            right = left + 1;
+            result.clear();
+            iter_result = 0;
+
+            if (!consume())
+                return std::nullopt;
+
+            return getNextIndices();
         }

-        return std::nullopt;
+        return result[iter_result++];
     }

 public:
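The deleted getNextIndices() rescanned the n-gram hashes from every left border, which is quadratic in the worst case. The new consume() instead maintains a stack of suffix maxima (the `convex_hull`): for each new right border hash, every smaller entry is popped, and each pop, plus at most the one surviving entry below, yields a valid (left, right) pair; the pairs selected are the substrings whose two border hashes strictly exceed every hash between them. Below is a standalone sketch of that selection rule over the same toy hash array [1,5,2,4,1,3] used in the comment above; the values are hypothetical, and the max_ngram_length capping from the real code is omitted.

#include <cstdio>
#include <vector>

/// Emit every pair (l, r), r > l, where min(h[l], h[r]) is strictly greater
/// than every hash strictly between them -- the sparse-grams border condition.
int main()
{
    std::vector<int> h = {1, 5, 2, 4, 1, 3}; /// toy (n-1)-gram hashes
    std::vector<size_t> stack = {0};         /// positions; hashes decrease bottom-up

    for (size_t r = 1; r < h.size(); ++r)
    {
        /// Each popped position has a hash below h[r] but above everything
        /// between it and r, so (position, r) is a valid pair.
        while (!stack.empty() && h[stack.back()] < h[r])
        {
            std::printf("(%zu, %zu)\n", stack.back(), r);
            stack.pop_back();
        }
        /// The first survivor dominates h[r], so (survivor, r) is also valid;
        /// deeper entries are blocked by the survivor sitting between them and r.
        if (!stack.empty())
            std::printf("(%zu, %zu)\n", stack.back(), r);
        /// Keep hashes strictly decreasing: drop duplicates of h[r] before pushing.
        while (!stack.empty() && h[stack.back()] == h[r])
            stack.pop_back();
        stack.push_back(r);
    }
}

After processing right position 4, the stack holds positions 1, 3, 4 with hashes 5, 4, 1: exactly the hull in the comment above. Each position is pushed and popped at most once, and each right border adds only one survivor check, so the scan is linear in the number of n-grams plus the number of emitted pairs.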
@@ -154,35 +235,23 @@ class SparseGramsImpl
     {
         pos = pos_;
         end = end_;
-        left = 0;
-        right = 1;

-        ngram_hashes.clear();
-        if constexpr (is_utf8)
-            utf8_offsets.clear();
-
-        buildNgramHashes();
+        symbol_iterator = NGramSymbolIterator(pos, end, min_ngram_length - 1);
+        for (size_t i = 0; i < min_ngram_length - 2; ++i)
+            symbol_iterator.increment();
     }

     /// Get the next token, if any, or return false.
     bool get(Pos & token_begin, Pos & token_end)
     {
-        auto result = getNextIndices();
-        if (!result)
+        auto cur_result = getNextIndices();
+        if (!cur_result)
             return false;

-        auto [iter_left, iter_right] = *result;
+        auto [iter_left, iter_right] = *cur_result;

-        if constexpr (is_utf8)
-        {
-            token_begin = pos + utf8_offsets[iter_left];
-            token_end = pos + utf8_offsets[iter_right + min_ngram_length - 1];
-        }
-        else
-        {
-            token_begin = pos + iter_left;
-            token_end = pos + iter_right + min_ngram_length - 1;
-        }
+        token_begin = pos + iter_left;
+        token_end = pos + iter_right;
         return true;
     }
 };
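get() no longer needs the utf8_offsets table because NGramSymbolIterator already walks the string in whole UTF-8 sequences via UTF8::seqLength, so iter_left and iter_right are byte positions that can be added to pos directly. Below is a minimal sketch of that lead-byte stepping; this seqLength is a simplified stand-in for ClickHouse's UTF8::seqLength, and its handling of stray continuation bytes is an assumption of the sketch.

#include <cstddef>
#include <cstdio>

/// Byte count of a UTF-8 sequence, derived from its lead byte alone.
static size_t seqLength(unsigned char lead)
{
    if (lead < 0x80) return 1; /// 0xxxxxxx: ASCII
    if (lead < 0xC0) return 1; /// stray continuation byte: step over it
    if (lead < 0xE0) return 2; /// 110xxxxx
    if (lead < 0xF0) return 3; /// 1110xxxx
    return 4;                  /// 11110xxx
}

int main()
{
    const char * s = "a\xC3\xA9z"; /// "aéz": 1-byte, 2-byte, 1-byte sequences
    for (size_t i = 0; s[i] != '\0'; i += seqLength(static_cast<unsigned char>(s[i])))
        std::printf("symbol starts at byte %zu\n", i); /// prints 0, 1, 3
}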
@@ -211,6 +280,8 @@ class SparseGramsHashes : public IFunction
         SparseGramsImpl<is_utf8> impl;
         impl.init(arguments, false);

+        CRC32CHasher hasher;
+
         auto col_res = ColumnUInt32::create();
         auto & res_data = col_res->getData();

@@ -238,7 +309,7 @@ class SparseGramsHashes : public IFunction
             end = reinterpret_cast<Pos>(&src_data[current_src_offset]) - 1;
             impl.set(start, end);
             while (impl.get(start, end))
-                res_data.push_back(crc32_z(0UL, reinterpret_cast<const unsigned char *>(start), end - start));
+                res_data.push_back(hasher(start, end - start));

             res_offsets_data.push_back(res_data.size());
         }
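For context, the call site drives the tokenizer through the same set()/get() protocol as before; only the hash call changed. Below is a sketch of that driving pattern with a mock tokenizer yielding plain 3-grams and std::hash standing in, since SparseGramsImpl and CRC32CHasher are internal to this file.

#include <cstdio>
#include <cstring>
#include <functional>
#include <string_view>

/// Mock with the same set()/get() shape as SparseGramsImpl,
/// but yielding every 3-byte window instead of sparse grams.
struct MockTokenizer
{
    const char * pos = nullptr;
    const char * end = nullptr;
    size_t offset = 0;

    void set(const char * pos_, const char * end_) { pos = pos_; end = end_; offset = 0; }

    bool get(const char *& token_begin, const char *& token_end)
    {
        if (pos + offset + 3 > end)
            return false;
        token_begin = pos + offset;
        token_end = pos + offset + 3;
        ++offset;
        return true;
    }
};

int main()
{
    const char * s = "sparse";
    MockTokenizer impl;
    impl.set(s, s + std::strlen(s));

    /// Same shape as the executeImpl() loop: hash each token the tokenizer yields.
    const char * b;
    const char * e;
    while (impl.get(b, e))
        std::printf("%.*s -> %zu\n", static_cast<int>(e - b), b, std::hash<std::string_view>{}(std::string_view(b, e - b)));
}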
