33#include < Functions/FunctionFactory.h>
44#include < Functions/FunctionHelpers.h>
55#include < Functions/FunctionTokens.h>
6+ #include < Common/UTF8Helpers.h>
67#include < Common/Exception.h>
7-
8- #include < zlib.h>
9- #include < Poco/UTF8Encoding.h>
8+ #include < base/types.h>
9+ #include < Common/HashTable/Hash.h>
1010
1111namespace DB
1212{
@@ -27,83 +27,164 @@ extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
2727namespace
2828{
2929
30+ struct CRC32CHasher
31+ {
32+ size_t operator ()(const char * data, size_t length) const
33+ {
34+ return updateWeakHash32 (reinterpret_cast <const UInt8*>(data), length, 0 );
35+ }
36+ };
37+
3038using Pos = const char *;
3139
3240template <bool is_utf8>
3341class SparseGramsImpl
3442{
3543private:
44+ CRC32CHasher hasher;
45+
3646 Pos pos;
3747 Pos end;
38- std::vector<UInt32> ngram_hashes;
39- std::vector<size_t > utf8_offsets;
40- size_t left;
41- size_t right;
4248 UInt64 min_ngram_length = 3 ;
4349 UInt64 max_ngram_length = 100 ;
4450
45- void buildNgramHashes ()
51+ /// Current batch of answers. The size of `result` cannot be greater than the size of `convex_hull`.
52+ /// The size of `convex_hull` is expected to be small; see the comment on `convex_hull` for more details.
53+ std::vector<std::pair<size_t , size_t >> result;
54+ size_t iter_result = 0 ;
55+
/// One entry of `convex_hull`: a border n-gram together with its hash.
/// The field order is significant because entries are created with designated initializers.
struct PositionAndHash
{
    /// Byte offset (relative to `pos`) of the right-most symbol of the n-gram.
    size_t position;
    /// Byte offset (relative to `pos`) of the left-most symbol of the n-gram.
    size_t left_ngram_position;
    /// Index, counted in symbols, of the right-most symbol of the n-gram.
    size_t symbol_index;
    /// Hash of the bytes that make up the n-gram.
    size_t hash;
};
63+
64+ class NGramSymbolIterator
4665 {
47- if constexpr (is_utf8)
66+ public:
67+ NGramSymbolIterator () = default ;
68+
69+ NGramSymbolIterator (Pos data_, Pos end_, size_t n_)
70+ : data(data_), end(end_), n(n_)
71+ {
72+ }
73+
74+ void increment ()
4875 {
49- Poco::UTF8Encoding encoder{};
50- size_t byte_offset = 0 ;
51- while (pos + byte_offset < end)
76+ right_iterator = getNextPosition (right_iterator);
77+
78+ if (++num_increments >= n)
79+ left_iterator = getNextPosition (left_iterator);
80+ }
81+
82+ bool isEnd () const
83+ {
84+ return data + right_iterator >= end;
85+ }
86+
87+ std::pair<size_t , size_t > getNGramPositions () const
88+ {
89+ return {left_iterator, right_iterator};
90+ }
91+
92+ size_t getRightSymbol () const
93+ {
94+ return num_increments;
95+ }
96+
97+ size_t getNextPosition (size_t iterator) const
98+ {
99+ if constexpr (is_utf8)
100+ return iterator + UTF8::seqLength (data[iterator]);
101+ else
102+ return iterator + 1 ;
103+ }
104+
105+ private:
106+
107+ Pos data;
108+ Pos end;
109+ size_t n;
110+ size_t right_iterator = 0 ;
111+ size_t left_iterator = 0 ;
112+ size_t num_increments = 0 ;
113+ };
114+
115+ /// The convex hull contains the suffix maxima of the n-gram hashes seen so far, i.e. the elements
115+ /// whose hash is greater than the hash of every element between them and the current right iterator.
116+ /// For example, if we have n-gram hashes like [1,5,2,4,1,3] and the current right position is 4 (the last one processed),
117+ /// then our convex hull will consist of the elements:
118+ /// [{position:1, hash:5}, {position:3, hash:4}, {position:4, hash:1}]
119+ /// Assuming that hashes are uniformly distributed, the expected size of convex_hull is N^{1/3},
120+ /// where N is the length of the string.
121+ /// Proof: https://math.stackexchange.com/questions/3469295/expected-number-of-vertices-in-a-convex-hull
122+ std::vector<PositionAndHash> convex_hull;
123+ NGramSymbolIterator symbol_iterator;
124+
125+ // / Get the next batch of answers. Returns false if there can be no more answers.
126+ bool consume ()
127+ {
128+ if (symbol_iterator.isEnd ())
129+ return false ;
130+
131+ auto [ngram_left_position, right_position] = symbol_iterator.getNGramPositions ();
132+ size_t right_symbol_index = symbol_iterator.getRightSymbol ();
133+ size_t next_right_position = symbol_iterator.getNextPosition (right_position);
134+ size_t right_border_ngram_hash = hasher (pos + ngram_left_position, next_right_position - ngram_left_position);
135+
136+ while (!convex_hull.empty () && convex_hull.back ().hash < right_border_ngram_hash)
137+ {
138+ size_t possible_left_position = convex_hull.back ().left_ngram_position ;
139+ size_t possible_left_symbol_index = convex_hull.back ().symbol_index ;
140+ size_t length = right_symbol_index - possible_left_symbol_index + 2 ;
141+ if (length > max_ngram_length)
52142 {
53- utf8_offsets.push_back (byte_offset);
54- auto len = encoder.sequenceLength (reinterpret_cast <const unsigned char *>(pos + byte_offset), end - pos - byte_offset);
55- if (len < 1 )
56- throw Exception (ErrorCodes::BAD_ARGUMENTS, " Incorrect utf8 symbol" );
57- byte_offset += len;
143+ // / If the current length is greater than the current right position, it will be greater at future right positions, so we can just delete them all.
144+ convex_hull.clear ();
145+ break ;
58146 }
59- if (pos + byte_offset != end)
60- throw Exception (ErrorCodes::BAD_ARGUMENTS, " Incorrect utf8 symbol" );
61-
62- utf8_offsets.push_back (byte_offset);
63-
64- if (utf8_offsets.size () >= min_ngram_length)
65- ngram_hashes.reserve (utf8_offsets.size () - min_ngram_length + 1 );
66- for (size_t i = 0 ; i + min_ngram_length - 1 < utf8_offsets.size (); ++i)
67- ngram_hashes.push_back (crc32_z (
68- 0UL ,
69- reinterpret_cast <const unsigned char *>(pos + utf8_offsets[i]),
70- utf8_offsets[i + min_ngram_length - 1 ] - utf8_offsets[i]));
147+ result.push_back ({possible_left_position, next_right_position});
148+ convex_hull.pop_back ();
71149 }
72- else
150+
151+ if (!convex_hull.empty ())
73152 {
74- if (pos + min_ngram_length <= end)
75- ngram_hashes.reserve (end - pos - min_ngram_length + 1 );
76- for (size_t i = 0 ; pos + i + min_ngram_length - 2 < end; ++i)
77- ngram_hashes.push_back (crc32_z (0L , reinterpret_cast <const unsigned char *>(pos + i), min_ngram_length - 1 ));
153+ size_t possible_left_position = convex_hull.back ().left_ngram_position ;
154+ size_t possible_left_symbol_index = convex_hull.back ().symbol_index ;
155+ size_t length = right_symbol_index - possible_left_symbol_index + 2 ;
156+ if (length <= max_ngram_length)
157+ result.push_back ({possible_left_position, next_right_position});
78158 }
159+
160+ // / there should not be identical hashes in the convex hull. If there are, then we leave only the last one
161+ while (!convex_hull.empty () && convex_hull.back ().hash == right_border_ngram_hash)
162+ convex_hull.pop_back ();
163+
164+ convex_hull.push_back (PositionAndHash{
165+ .position = right_position,
166+ .left_ngram_position = ngram_left_position,
167+ .symbol_index = right_symbol_index,
168+ .hash = right_border_ngram_hash
169+ });
170+ symbol_iterator.increment ();
171+ return true ;
79172 }
80173
81174 std::optional<std::pair<size_t , size_t >> getNextIndices ()
82175 {
83- chassert (right > left);
84- while (left < ngram_hashes.size ())
176+ if (result.size () <= iter_result)
85177 {
86- while (right < ngram_hashes.size () && right <= left + max_ngram_length - min_ngram_length + 1 )
87- {
88- if (right > left + 1 )
89- {
90- if (ngram_hashes[left] < ngram_hashes[right - 1 ])
91- break ;
92-
93- if (ngram_hashes[right] < ngram_hashes[right - 1 ])
94- {
95- ++right;
96- continue ;
97- }
98- }
99-
100- return {{left, right++}};
101- }
102- ++left;
103- right = left + 1 ;
178+ result.clear ();
179+ iter_result = 0 ;
180+
181+ if (!consume ())
182+ return std::nullopt ;
183+
184+ return getNextIndices ();
104185 }
105186
106- return std:: nullopt ;
187+ return result[iter_result++] ;
107188 }
108189
109190public:
@@ -154,35 +235,23 @@ class SparseGramsImpl
154235 {
155236 pos = pos_;
156237 end = end_;
157- left = 0 ;
158- right = 1 ;
159238
160- ngram_hashes.clear ();
161- if constexpr (is_utf8)
162- utf8_offsets.clear ();
163-
164- buildNgramHashes ();
239+ symbol_iterator = NGramSymbolIterator (pos, end, min_ngram_length - 1 );
240+ for (size_t i = 0 ; i < min_ngram_length - 2 ; ++i)
241+ symbol_iterator.increment ();
165242 }
166243
167244 // / Get the next token, if any, or return false.
168245 bool get (Pos & token_begin, Pos & token_end)
169246 {
170- auto result = getNextIndices ();
171- if (!result )
247+ auto cur_result = getNextIndices ();
248+ if (!cur_result )
172249 return false ;
173250
174- auto [iter_left, iter_right] = *result ;
251+ auto [iter_left, iter_right] = *cur_result ;
175252
176- if constexpr (is_utf8)
177- {
178- token_begin = pos + utf8_offsets[iter_left];
179- token_end = pos + utf8_offsets[iter_right + min_ngram_length - 1 ];
180- }
181- else
182- {
183- token_begin = pos + iter_left;
184- token_end = pos + iter_right + min_ngram_length - 1 ;
185- }
253+ token_begin = pos + iter_left;
254+ token_end = pos + iter_right;
186255 return true ;
187256 }
188257};
@@ -211,6 +280,8 @@ class SparseGramsHashes : public IFunction
211280 SparseGramsImpl<is_utf8> impl;
212281 impl.init (arguments, false );
213282
283+ CRC32CHasher hasher;
284+
214285 auto col_res = ColumnUInt32::create ();
215286 auto & res_data = col_res->getData ();
216287
@@ -238,7 +309,7 @@ class SparseGramsHashes : public IFunction
238309 end = reinterpret_cast <Pos>(&src_data[current_src_offset]) - 1 ;
239310 impl.set (start, end);
240311 while (impl.get (start, end))
241- res_data.push_back (crc32_z ( 0UL , reinterpret_cast < const unsigned char *>( start) , end - start));
312+ res_data.push_back (hasher ( start, end - start));
242313
243314 res_offsets_data.push_back (res_data.size ());
244315 }
0 commit comments