@@ -79,10 +79,12 @@ namespace tier {
7979 };
8080
// Tunable constants used by the tier consolidation candidate search.
struct ConsolidationConfig {
  // Candidate selection window size: number of segments in the
  // initial (and smallest) window examined by the candidate search.
  static constexpr size_t candidate_size { 2 };

  // Max score allowed for candidates consolidation.
  // Skip consolidation if candidate score is greater
  // than this value.
  static constexpr double maxMergeScore { 0.4 };

  // Max live docs ratio (live docs / total docs, expressed as a
  // fraction) up to which segments are considered for cleanup
  // during consolidation.
  static constexpr double maxLivePercentage { 0.5 };
};
8789
8890 // interface to fetch the required attributes from
@@ -125,62 +127,119 @@ namespace tier {
125127 do
126128 {
127129 accessor_ (itr, byte_size, docs_count, live_docs_count);
128-
129130 mergeBytes += byte_size;
130- delCount += (docs_count - live_docs_count);
131131
132132 } while (itr++ != end);
133133
134134 skew = static_cast <double >(byte_size) / mergeBytes;
135- mergeScore = skew + (1.0 / (1 + delCount));
136- cost = mergeBytes * mergeScore;
135+ mergeScore = skew;
137136 }
138137
139- void advance () noexcept {
138+ bool pop_front () {
140139 if (!initialized)
141- return ;
140+ return false ;
142141
143- const auto & removeSegment = segments. first ;
144- const auto & addSegment = segments. second + 1 ;
142+ const auto removeSegment = first () ;
143+ const auto lastSegment = last () ;
145144
146145 std::advance (segments.first , 1 );
147- std::advance (segments.second , 1 );
148146
149147 // Segment to be removed
150148 uint64_t rem_byte_size;
151149 uint64_t rem_docs_count;
152150 uint64_t rem_live_docs_count;
153151 accessor_ (removeSegment, rem_byte_size, rem_docs_count, rem_live_docs_count);
154- auto rem_del_count = rem_docs_count - rem_live_docs_count;
152+
153+ uint64_t last_seg_byte_size;
154+ uint64_t ignore;
155+ accessor_ (lastSegment, last_seg_byte_size, ignore, ignore);
156+
157+ mergeBytes -= rem_byte_size;
158+ skew = static_cast <double >(last_seg_byte_size) / mergeBytes;
159+ mergeScore = skew;
160+
161+ return true ;
162+ }
163+
164+ bool push_back () noexcept {
165+ if (!initialized)
166+ return false ;
167+
168+ const auto addSegment = segments.second + 1 ;
169+
170+ std::advance (segments.second , 1 );
155171
156172 // Segment to be added
157173 uint64_t add_byte_size;
158174 uint64_t add_docs_count;
159175 uint64_t add_live_docs_count;
160176 accessor_ (addSegment, add_byte_size, add_docs_count, add_live_docs_count);
161- auto add_del_count = add_docs_count - add_live_docs_count;
162177
163- mergeBytes = mergeBytes - rem_byte_size + add_byte_size;
178+ mergeBytes += add_byte_size;
164179 skew = static_cast <double >(add_byte_size) / mergeBytes;
165- delCount = delCount - rem_del_count + add_del_count ;
166- mergeScore = skew + ( 1.0 / ( 1 + delCount));
167- cost = mergeBytes * mergeScore ;
180+ mergeScore = skew ;
181+
182+ return true ;
168183 }
169184
// Iterator to the first segment of the candidate window.
iterator_t first () const noexcept { return segments.first ; }
// Iterator to the last segment of the candidate window. The position is
// inclusive: pop_front() passes it to accessor_ to read that segment.
iterator_t last () const noexcept { return segments.second ; }
172187
// Running sum of the byte sizes of the segments in the window;
// push_back() adds to it and pop_front() subtracts from it.
size_t mergeBytes { 0 };
// Byte size of a single segment of the window (the last one after
// pop_front(), the appended one after push_back()) divided by mergeBytes.
double skew { 0.0 };
// Score of this candidate; the visible code always sets it equal to skew.
double mergeScore { 0.0 };
// While false, pop_front() and push_back() do nothing and return false.
bool initialized { false };

// Pair of iterators (first, second) delimiting the window, both inclusive.
range_t segments;
// Callback that, for the segment at the given iterator, fills in
// byte size, docs count and live docs count (in that argument order).
std::function<void (iterator_t , uint64_t &, uint64_t &, uint64_t &)> accessor_;
182195 };
183196
197+ template <typename Segment>
198+ bool findBestCleanupCandidate (
199+ std::vector<Segment>& segments,
200+ const std::function<
201+ void (const typename std::vector<Segment>::const_iterator,
202+ uint64_t & /* byte_size */ ,
203+ uint64_t & /* docs_count */ ,
204+ uint64_t & /* live_docs_count */ )>& getSegmentAttributes,
205+ tier::ConsolidationCandidate<Segment>& best) {
206+
207+ auto segmentSortFunc = [](const Segment& left, const Segment& right) {
208+
209+ auto lMeta = left.meta ;
210+ auto rMeta = right.meta ;
211+ auto lLivePerc = static_cast <double >(lMeta->live_docs_count ) / lMeta->docs_count ;
212+ auto rLivePerc = static_cast <double >(rMeta->live_docs_count ) / rMeta->docs_count ;
213+
214+ return (lLivePerc < rLivePerc);
215+ };
216+
217+ std::sort (segments.begin (), segments.end (), segmentSortFunc);
218+
219+ auto count = 0 ;
220+ auto total_docs_count = 0 ;
221+ auto total_live_docs_count = 0 ;
222+ double livePerc;
223+
224+ for (auto itr = segments.begin (); itr != segments.end (); itr++) {
225+ auto meta = itr->meta ;
226+ total_docs_count += meta->docs_count ;
227+ total_live_docs_count += meta->live_docs_count ;
228+
229+ livePerc = static_cast <double >(total_live_docs_count) / total_docs_count;
230+ if (livePerc > tier::ConsolidationConfig::maxLivePercentage)
231+ break ;
232+
233+ ++count;
234+ }
235+
236+ if (count < 1 )
237+ return false ;
238+
239+ best = ConsolidationCandidate<Segment>(segments.begin (), segments.begin () + count - 1 , getSegmentAttributes);
240+ return true ;
241+ }
242+
184243 //
185244 // This function receives a sorted vector of segments
186245 // and finds the best contiguous subset of segments to
@@ -197,13 +256,65 @@ namespace tier {
197256 //
198257 template <typename Segment>
199258 bool findBestConsolidationCandidate (
259+ std::vector<Segment>& sorted_segments,
260+ size_t max_segments_bytes,
261+ const std::function<
262+ void (const typename std::vector<Segment>::const_iterator,
263+ uint64_t & /* byte_size */ ,
264+ uint64_t & /* docs_count */ ,
265+ uint64_t & /* live_docs_count */ )>& getSegmentAttributes,
266+ tier::ConsolidationCandidate<Segment>& best) {
267+
268+ // sort segments in increasing order of the segment byte size
269+ std::sort (sorted_segments.begin (), sorted_segments.end ());
270+
271+ // We start with a min. window size of 2
272+ // since a window of size 1 will always
273+ // give us a skew of 1.0.
274+ uint64_t minWindowSize { tier::ConsolidationConfig::candidate_size };
275+ auto front = sorted_segments.begin ();
276+ auto rear = front + minWindowSize - 1 ;
277+ tier::ConsolidationCandidate<Segment> candidate (front, rear, getSegmentAttributes);
278+
279+ double prev_score { 1.0 };
280+
281+ while ((candidate.first () + 1 ) < sorted_segments.end ()) {
282+
283+ if (!best.initialized || (best.mergeScore > candidate.mergeScore && candidate.mergeBytes <= max_segments_bytes))
284+ best = candidate;
285+
286+ if (std::distance (candidate.first (), candidate.last ()) < (minWindowSize - 1 )) {
287+ candidate.push_back ();
288+ continue ;
289+ }
290+
291+ if (candidate.mergeScore > prev_score ||
292+ candidate.mergeBytes > max_segments_bytes ||
293+ candidate.last () == (sorted_segments.end () - 1 )) {
294+ prev_score = candidate.mergeScore ;
295+ candidate.pop_front ();
296+ }
297+ else if (candidate.mergeScore <= prev_score && candidate.last () < (sorted_segments.end () - 1 ) &&
298+ candidate.mergeBytes <= max_segments_bytes) {
299+ prev_score = candidate.mergeScore ;
300+ candidate.push_back ();
301+ }
302+ }
303+
304+ return (best.initialized &&
305+ best.mergeScore <= tier::ConsolidationConfig::maxMergeScore);
306+ }
307+
308+ template <typename Segment>
309+ bool findBestConsolidationCandidate1 (
200310 const std::vector<Segment>& sorted_segments,
201311 const std::function<
202312 void (const typename std::vector<Segment>::const_iterator,
203313 uint64_t & /* byte_size */ ,
204314 uint64_t & /* docs_count */ ,
205315 uint64_t & /* live_docs_count */ )>& getSegmentAttributes,
206316 tier::ConsolidationCandidate<Segment>& best) {
317+
207318 if (sorted_segments.size () < tier::ConsolidationConfig::candidate_size)
208319 return false ;
209320
0 commit comments