@@ -57,6 +57,7 @@ class SparseGramsImpl
5757 {
5858 size_t position;
5959 size_t left_ngram_position;
60+ size_t symbol_index;
6061 size_t hash;
6162 };
6263
@@ -88,6 +89,11 @@ class SparseGramsImpl
8889 return {left_iterator, right_iterator};
8990 }
9091
92+ size_t getRightSymbol () const // Accessor for the iterator's num_increments counter; the caller stores the result as right_symbol_index.
93+ {
94+ return num_increments; // NOTE(review): presumably the number of increment() calls so far, i.e. the symbol index of the right edge — confirm against increment().
95+ }
96+
9197 size_t getNextPosition (size_t iterator) const
9298 {
9399 if constexpr (is_utf8)
@@ -123,13 +129,15 @@ class SparseGramsImpl
123129 return false ;
124130
125131 auto [ngram_left_position, right_position] = symbol_iterator.getNGramPositions ();
132+ size_t right_symbol_index = symbol_iterator.getRightSymbol ();
126133 size_t next_right_position = symbol_iterator.getNextPosition (right_position);
127134 size_t right_border_ngram_hash = hasher (pos + ngram_left_position, next_right_position - ngram_left_position);
128135
129136 while (!convex_hull.empty () && convex_hull.back ().hash < right_border_ngram_hash)
130137 {
131138 size_t possible_left_position = convex_hull.back ().left_ngram_position ;
132- size_t length = next_right_position - possible_left_position;
139+ size_t possible_left_symbol_index = convex_hull.back ().symbol_index ;
140+ size_t length = right_symbol_index - possible_left_symbol_index + 2 ;
133141 if (length > max_ngram_length)
134142 {
135143 /// If the length already exceeds max_ngram_length at the current right position, it will also exceed it at every future right position, so we can just delete them all.
@@ -143,7 +151,8 @@ class SparseGramsImpl
143151 if (!convex_hull.empty ())
144152 {
145153 size_t possible_left_position = convex_hull.back ().left_ngram_position ;
146- size_t length = next_right_position - possible_left_position;
154+ size_t possible_left_symbol_index = convex_hull.back ().symbol_index ;
155+ size_t length = right_symbol_index - possible_left_symbol_index + 2 ;
147156 if (length <= max_ngram_length)
148157 result.push_back ({possible_left_position, next_right_position});
149158 }
@@ -152,7 +161,12 @@ class SparseGramsImpl
152161 while (!convex_hull.empty () && convex_hull.back ().hash == right_border_ngram_hash)
153162 convex_hull.pop_back ();
154163
155- convex_hull.push_back (PositionAndHash{.position = right_position, .left_ngram_position = ngram_left_position, .hash = right_border_ngram_hash});
164+ convex_hull.push_back (PositionAndHash{
165+ .position = right_position,
166+ .left_ngram_position = ngram_left_position,
167+ .symbol_index = right_symbol_index,
168+ .hash = right_border_ngram_hash
169+ });
156170 symbol_iterator.increment ();
157171 return true ;
158172 }
0 commit comments