@@ -71,13 +71,13 @@ namespace tier {
7171 };
7272
7373 struct SegmentAttributes {
74- uint64_t byte_size { 0 };
75- uint64_t docs_count { 0 };
76- uint64_t live_docs_count { 0 };
74+ uint64_t byteSize { 0 };
75+ uint64_t docsCount { 0 };
76+ uint64_t liveDocsCount { 0 };
7777
7878 SegmentAttributes () = default ;
7979 SegmentAttributes (uint64_t b, uint64_t d, uint64_t l) :
80- byte_size (b), docs_count (d), live_docs_count (l) {}
80+ byteSize (b), docsCount (d), liveDocsCount (l) {}
8181 };
8282
8383 // interface to fetch the required attributes from
@@ -115,11 +115,11 @@ namespace tier {
115115 do
116116 {
117117 accessor_ (*itr, attrs);
118- mergeBytes += attrs.byte_size ;
118+ mergeBytes += attrs.byteSize ;
119119
120120 } while (itr++ != end);
121121
122- skew = static_cast <double >(attrs.byte_size ) / mergeBytes;
122+ skew = static_cast <double >(attrs.byteSize ) / mergeBytes;
123123 mergeScore = skew;
124124 }
125125
@@ -136,14 +136,14 @@ namespace tier {
136136 std::advance (segments.first , 1 );
137137
138138 // Segment to be removed
139- SegmentAttributes rem_seg_attrs ;
140- accessor_ (*removeSegment, rem_seg_attrs );
139+ SegmentAttributes remSegAttrs ;
140+ accessor_ (*removeSegment, remSegAttrs );
141141
142- SegmentAttributes last_seg_attrs ;
143- accessor_ (*lastSegment, last_seg_attrs );
142+ SegmentAttributes lastSegAttrs ;
143+ accessor_ (*lastSegment, lastSegAttrs );
144144
145- mergeBytes -= rem_seg_attrs. byte_size ;
146- skew = static_cast <double >(last_seg_attrs. byte_size ) / mergeBytes;
145+ mergeBytes -= remSegAttrs. byteSize ;
146+ skew = static_cast <double >(lastSegAttrs. byteSize ) / mergeBytes;
147147 mergeScore = skew;
148148
149149 return true ;
@@ -164,8 +164,8 @@ namespace tier {
164164 SegmentAttributes attrs;
165165 accessor_ (*addSegment, attrs);
166166
167- mergeBytes += attrs.byte_size ;
168- skew = static_cast <double >(attrs.byte_size ) / mergeBytes;
167+ mergeBytes += attrs.byteSize ;
168+ skew = static_cast <double >(attrs.byteSize ) / mergeBytes;
169169 mergeScore = skew;
170170
171171 return true ;
@@ -188,38 +188,38 @@ namespace tier {
188188 std::vector<Segment>& segments,
189189 const std::function<
190190 void (const Segment&,
191- tier::SegmentAttributes& /* byte_size */
191+ tier::SegmentAttributes&
192192 )>& getSegmentAttributes,
193193 tier::ConsolidationCandidate<Segment>& best) {
194194
195195 auto segmentSortFunc = [&](const Segment& left, const Segment& right) {
196196
197197 tier::SegmentAttributes attrs;
198198 getSegmentAttributes (left, attrs);
199- auto lLivePerc = static_cast <double >(attrs.live_docs_count ) / attrs.docs_count ;
199+ auto lLivePerc = static_cast <double >(attrs.liveDocsCount ) / attrs.docsCount ;
200200
201201 getSegmentAttributes (right, attrs);
202- auto rLivePerc = static_cast <double >(attrs.live_docs_count ) / attrs.docs_count ;
202+ auto rLivePerc = static_cast <double >(attrs.liveDocsCount ) / attrs.docsCount ;
203203
204204 return lLivePerc < rLivePerc;
205205 };
206206
207207 std::sort (segments.begin (), segments.end (), segmentSortFunc);
208208
209209 auto count = 0 ;
210- auto total_docs_count = 0 ;
211- auto total_live_docs_count = 0 ;
210+ auto totalDocsCount = 0 ;
211+ auto totalLiveDocsCount = 0 ;
212212 double livePerc;
213213
214214 for (auto itr = segments.begin (); itr != segments.end (); itr++) {
215215
216216 tier::SegmentAttributes attrs;
217217 getSegmentAttributes (*itr, attrs);
218218
219- total_docs_count += attrs.docs_count ;
220- total_live_docs_count += attrs.live_docs_count ;
219+ totalDocsCount += attrs.docsCount ;
220+ totalLiveDocsCount += attrs.liveDocsCount ;
221221
222- livePerc = static_cast <double >(total_live_docs_count ) / total_docs_count ;
222+ livePerc = static_cast <double >(totalLiveDocsCount ) / totalDocsCount ;
223223 if (livePerc > tier::ConsolidationConfig::maxLivePercentage)
224224 break ;
225225
@@ -234,72 +234,85 @@ namespace tier {
234234 }
235235
236236 //
237- // This function receives a sorted vector of segments
238- // and finds the best contiguous subset of segments to
239- // merge together.
237+ // This function receives a set of segments and finds
238+ // the best subset to merge together.
240239 // The best subset is defined as the one with the lowest
241- // merge cost and the merge cost is computed inside the
242- // ConslidateCandidate struct.
240+ // merge cost (i.e. skew). The merge cost is computed inside
241+ // the ConsolidationCandidate struct upon candidate init,
242+ // push_back() and pop_front() operations.
243243 //
244- // findBestConsolidationCandidate merely sets a rolling
245- // window of size 4 on a subset of segments, it lets
246- // ConsolidationCandidate compute the cost of merge for that
247- // subset, repeats this process for all contiguous subsets
248- // and identifies the best candidate.
244+ // findBestConsolidationCandidate sorts the set of segments
245+ // in increasing order of segment size and then finds
246+ // the largest possible subset of segments whose consolidated
247+ // size is within the maxSegmentsBytes range and has the
248+ // lowest skew.
249+ //
250+ // Currently it is only executed with struct tier::SegmentStats
251+ // as the template argument in ArangoSearch. However we leverage
252+ // this templatized design for writing unit tests.
253+ //
254+ // findBestConsolidationCandidate does not use the live %
255+ // to find the best candidate. It only needs the segment
256+ // byte size.
249257 //
250258 template <typename Segment>
251259 bool findBestConsolidationCandidate (
252- std::vector<Segment>& sorted_segments ,
253- size_t max_segments_bytes ,
260+ std::vector<Segment>& segments ,
261+ size_t maxSegmentsBytes ,
254262 const std::function<
255263 void (const Segment&,
256264 SegmentAttributes&
257265 )>& getSegmentAttributes,
258266 tier::ConsolidationCandidate<Segment>& best) {
259267
268+ // sort segments by segment size
260269 auto comp = [&](const Segment& lhs, const Segment& rhs) {
261270
262- SegmentAttributes l_attrs;
263- SegmentAttributes r_attrs;
271+ SegmentAttributes lAttrs;
272+ SegmentAttributes rAttrs;
273+
274+ getSegmentAttributes (lhs, lAttrs);
275+ getSegmentAttributes (rhs, rAttrs);
264276
265- getSegmentAttributes (lhs, l_attrs);
266- getSegmentAttributes (rhs, r_attrs);
277+ if (lAttrs.byteSize == rAttrs.byteSize ) {
267278
268- if (l_attrs.byte_size == r_attrs.byte_size ) {
269-
270- double lfill_factor = static_cast <double >(l_attrs.live_docs_count ) / l_attrs.docs_count ;
271- double rfill_factor = static_cast <double >(r_attrs.live_docs_count ) / r_attrs.docs_count ;
279+ double lfill_factor = static_cast <double >(lAttrs.liveDocsCount ) / lAttrs.docsCount ;
280+ double rfill_factor = static_cast <double >(rAttrs.liveDocsCount ) / rAttrs.docsCount ;
272281 return lfill_factor > rfill_factor;
273282 }
274283
275- return l_attrs. byte_size < r_attrs. byte_size ;
284+ return lAttrs. byteSize < rAttrs. byteSize ;
276285 };
277286
278287 // sort segments in increasing order of the segment byte size
279- std::sort (sorted_segments .begin (), sorted_segments .end (), comp);
288+ std::sort (segments .begin (), segments .end (), comp);
280289
281290 // We start with a min. window size of 2
282291 // since a window of size 1 will always
283292 // give us a skew of 1.0.
284293 uint64_t minWindowSize { tier::ConsolidationConfig::candidate_size };
285- auto front = sorted_segments .begin ();
294+ auto front = segments .begin ();
286295 auto rear = front + minWindowSize - 1 ;
287296 tier::ConsolidationCandidate<Segment> candidate (front, rear, getSegmentAttributes);
288297
289298 // Algorithm:
290- // We
299+ // We start by setting the smallest possible window on the list of
300+ // sorted segments. We move the right end ahead to add more segments to
301+ // the window and we incrementally compute the merge cost for each subset.
302+ // We move the left end ahead to remove segments from the window and we
303+ // only do this when we're over the maxSegmentsBytes limit.
291304 while ((candidate.first () + minWindowSize - 1 ) <= candidate.last () &&
292- candidate.last () < sorted_segments .end ()) {
305+ candidate.last () < segments .end ()) {
293306
294- if (candidate.mergeBytes > max_segments_bytes ) {
307+ if (candidate.mergeBytes > maxSegmentsBytes ) {
295308 candidate.pop_front ();
296309 continue ;
297310 }
298311
299312 if (!best.initialized || best.mergeScore > candidate.mergeScore )
300313 best = candidate;
301314
302- if (candidate.last () == (sorted_segments .end () - 1 ))
315+ if (candidate.last () == (segments .end () - 1 ))
303316 break ;
304317
305318 candidate.push_back ();
@@ -353,6 +366,8 @@ struct ConsolidateDocsFill {
353366
354367ConsolidationPolicy MakePolicy (const ConsolidateDocsFill& options);
355368
369+ // [TODO] Currently unused as the new algorithm uses a different
370+ // approach. Only max_segments_bytes is in use.
356371struct ConsolidateTier {
357372 // minimum allowed number of segments to consolidate at once
358373 size_t min_segments = 1 ;
0 commit comments