2828
2929#include " formats/format_utils.hpp"
3030
31- namespace {
32-
33- // Returns percentage of live documents
34- inline double FillFactor (const irs::SegmentInfo& segment) noexcept {
35- return static_cast <double >(segment.live_docs_count ) /
36- static_cast <double >(segment.docs_count );
37- }
38-
39- // Returns approximated size of a segment in the absence of removals
40- inline size_t SizeWithoutRemovals (const irs::SegmentInfo& segment) noexcept {
41- return size_t (static_cast <double >(segment.byte_size ) * FillFactor (segment));
42- }
43-
4431namespace tier {
4532
46- struct SegmentStats {
47- // cppcheck-suppress noExplicitConstructor
48- SegmentStats (const irs::SubReader& reader) noexcept
49- : reader{&reader},
50- meta{&reader.Meta ()},
51- size{SizeWithoutRemovals (*meta)},
52- fill_factor{FillFactor (*meta)} {}
53-
54- bool operator <(const SegmentStats& rhs) const noexcept {
55- // cppcheck-suppress constVariable
56- auto & lhs = *this ;
57-
58- if (lhs.size == rhs.size ) {
59- if (lhs.fill_factor > rhs.fill_factor ) {
60- return true ;
61- } else if (lhs.fill_factor < rhs.fill_factor ) {
62- return false ;
63- }
33+ // ConsolidationConfig static constants.
34+ const size_t ConsolidationConfig::candidate_size { 4 }; // consolidation window size
35+ const size_t ConsolidationConfig::tier1 { 1 << 22 }; // 4 MB
36+ const double ConsolidationConfig::maxMergeScore { 1.5 }; // Skip consolidation if candidate score is greater
6437
65- return lhs.meta ->name < rhs.meta ->name ;
66- }
38+ ConsolidationCandidate::ConsolidationCandidate (
39+ iterator_t start,
40+ iterator_t end) noexcept
41+ : segments(start, end) {
6742
68- return lhs.size < rhs.size ;
69- }
43+ initialized = true ;
7044
71- operator const irs::SubReader*() const noexcept { return reader; }
72-
73- const irs::SubReader* reader;
74- const irs::SegmentInfo* meta;
75- size_t size; // approximate size of segment without removals
76- double_t fill_factor;
77- };
78-
79- struct ConsolidationCandidate {
80- using iterator_t = std::vector<SegmentStats>::const_iterator;
81- using range_t = std::pair<iterator_t , iterator_t >;
82-
83- explicit ConsolidationCandidate (iterator_t i) noexcept : segments(i, i) {}
84-
85- iterator_t begin () const noexcept { return segments.first ; }
86- iterator_t end () const noexcept { return segments.second ; }
87-
88- range_t segments;
89- size_t count{0 };
90- size_t size{0 }; // estimated size of the level
91- double_t score{DBL_MIN}; // how good this permutation is
92- };
93-
94- // / @returns score of the consolidation bucket
95- double_t consolidation_score (const ConsolidationCandidate& consolidation,
96- const size_t segments_per_tier,
97- const size_t floor_segment_bytes) noexcept {
98- // to detect how skewed the consolidation we do the following:
99- // 1. evaluate coefficient of variation, less is better
100- // 2. good candidates are in range [0;1]
101- // 3. favor condidates where number of segments is equal to
102- // 'segments_per_tier' approx
103- // 4. prefer smaller consolidations
104- // 5. prefer consolidations which clean removals
105-
106- switch (consolidation.count ) {
107- case 0 :
108- // empty consolidation makes not sense
109- return DBL_MIN;
110- case 1 : {
111- auto & meta = *consolidation.segments .first ->meta ;
112-
113- if (meta.docs_count == meta.live_docs_count ) {
114- // singletone without removals makes no sense
115- return DBL_MIN;
116- }
45+ // Calculate initial cost
46+ auto itr = start;
47+ do
48+ {
49+ auto itrMeta = itr->meta ;
11750
118- // FIXME honor number of deletes???
119- // signletone with removals makes sense if nothing better is found
120- return DBL_MIN + DBL_EPSILON ;
121- }
122- }
51+ mergeBytes += itrMeta-> byte_size ;
52+ skew = static_cast < double >(itrMeta-> byte_size ) / mergeBytes;
53+ delCount += (itrMeta-> docs_count - itrMeta-> live_docs_count ) ;
54+ mergeScore = skew + ( 1.0 / ( 1 + delCount));
55+ cost = mergeBytes * mergeScore;
12356
124- size_t size_before_consolidation = 0 ;
125- size_t size_after_consolidation = 0 ;
126- size_t size_after_consolidation_floored = 0 ;
127- for (auto & segment_stat : consolidation) {
128- size_before_consolidation += segment_stat.meta ->byte_size ;
129- size_after_consolidation += segment_stat.size ;
130- size_after_consolidation_floored +=
131- std::max (segment_stat.size , floor_segment_bytes);
57+ } while (itr++ != end);
13258 }
13359
134- // evaluate coefficient of variation
135- double sum_square_differences = 0 ;
136- const auto segment_size_after_consolidaton_mean =
137- static_cast <double >(size_after_consolidation_floored) /
138- static_cast <double >(consolidation.count );
139- for (auto & segment_stat : consolidation) {
140- const auto diff =
141- static_cast <double >(std::max (segment_stat.size , floor_segment_bytes)) -
142- segment_size_after_consolidaton_mean;
143- sum_square_differences += diff * diff;
144- }
60+ // Caller is responsible for ensuring that
61+ // the segment iterators aren't past the
62+ // last element before calling advance().
63+ void ConsolidationCandidate::advance () noexcept {
64+ if (!initialized)
65+ return ;
14566
146- const auto stdev = std::sqrt (sum_square_differences /
147- static_cast <double >(consolidation.count ));
148- const auto cv = (stdev / segment_size_after_consolidaton_mean);
67+ const auto & removeMeta = segments.first ->meta ;
68+ const auto & addMeta = (segments.second + 1 )->meta ;
14969
150- // evaluate initial score
151- auto score = 1 . - cv ;
70+ std::advance (segments. first , 1 );
71+ std::advance (segments. second , 1 ) ;
15272
153- // favor consolidations that contain approximately the requested number of
154- // segments
155- score *= std::pow (static_cast <double >(consolidation.count ) /
156- static_cast <double >(segments_per_tier),
157- 1.5 );
73+ auto getDelCount = [](const irs::SegmentInfo* itemMeta) {
74+ return (itemMeta->docs_count - itemMeta->live_docs_count );
75+ };
15876
159- // FIXME use relative measure, e.g. cosolidation_size/total_size
160- // carefully prefer smaller consolidations over the bigger ones
161- score /= std::pow (size_after_consolidation, 0.5 );
77+ mergeBytes = mergeBytes - removeMeta->byte_size + addMeta->byte_size ;
78+ skew = static_cast <double >(addMeta->byte_size ) / mergeBytes;
79+ delCount = delCount - getDelCount (removeMeta) + getDelCount (addMeta);
80+ mergeScore = skew + (1 / (1 + delCount));
81+ cost = mergeBytes * mergeScore;
82+ }
83+
84+ // Currently we're using powers of 4 to define tiers,
85+ // with the smallest tier being 0-4MB. We select subsequent
86+ // tiers by multiplying the last tier by 4.
87+ // So we get 0-4MB, 4MB-16MB and so on.
88+ size_t getConsolidationTier (size_t num) {
16289
163- // favor consolidations which clean out removals
164- score /= std::pow (static_cast <double >(size_after_consolidation) /
165- static_cast <double >(size_before_consolidation),
166- 2 );
90+ size_t nextTier = ConsolidationConfig::tier1;
91+ while (nextTier < num)
92+ nextTier = nextTier << 2 ;
16793
168- return score ;
94+ return nextTier ;
16995}
17096
17197} // namespace tier
172- } // namespace
17398
17499namespace irs ::index_utils {
175100
@@ -391,6 +316,9 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
391316 // / if
392317 // / - segment size is greater than 'max_segments_bytes / 2'
393318 // / - segment has many documents but only few deletions
319+ // /
320+ // / TODO - too_big_segments_threshold formula is unreasonable
321+ // / - add unit tests as well
394322 // /////////////////////////////////////////////////////////////////////////
395323
396324 const double_t total_fill_factor =
@@ -413,63 +341,36 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
413341 }
414342 }
415343
344+ // No point in attempting consolidation if we don't have
345+ // enough segments to fill the consolidation window
346+ if (sorted_segments.size () < tier::ConsolidationConfig::candidate_size)
347+ return ;
348+
416349 // /////////////////////////////////////////////////////////////////////////
417350 // / Stage 3
418- // / sort candidates
351+ // / sort candidates and organize them into tiers
419352 // /////////////////////////////////////////////////////////////////////////
420353
421354 std::sort (sorted_segments.begin (), sorted_segments.end ());
422355
423- // /////////////////////////////////////////////////////////////////////////
424- // / Stage 4
425- // / find proper candidates
426- // /////////////////////////////////////////////////////////////////////////
427-
428- tier::ConsolidationCandidate best (sorted_segments.begin ());
429-
430- if (sorted_segments.size () >= min_segments_per_tier) {
431- for (auto i = sorted_segments.begin (), end = sorted_segments.end ();
432- i != end; ++i) {
433- tier::ConsolidationCandidate candidate (i);
434-
435- while (candidate.segments .second != end &&
436- candidate.count < max_segments_per_tier) {
437- candidate.size += candidate.segments .second ->size ;
438-
439- if (candidate.size > max_segments_bytes) {
440- // overcome the limit
441- break ;
442- }
443-
444- ++candidate.count ;
445- ++candidate.segments .second ;
356+ auto getSegmentSize = [](const tier::SegmentStats& segment) {
357+ return segment.meta ->byte_size ;
358+ };
446359
447- if (candidate.count < min_segments_per_tier) {
448- // not enough segments yet
449- continue ;
450- }
451-
452- candidate.score = tier::consolidation_score (
453- candidate, max_segments_per_tier, floor_segment_bytes);
454-
455- if (candidate.score < min_score) {
456- // score is too small
457- continue ;
458- }
459-
460- if (best.score < candidate.score ) {
461- best = candidate;
462- }
463- }
464- }
465- }
360+ auto tiers = tier::mapToTiers (sorted_segments, getSegmentSize);
466361
467362 // /////////////////////////////////////////////////////////////////////////
468363 // / Stage 4
469- // / pick the best candidate
364+ // / Find best candidate for consolidation.
470365 // /////////////////////////////////////////////////////////////////////////
471366
472- std::copy (best.begin (), best.end (), std::back_inserter (candidates));
367+ tier::ConsolidationCandidate best;
368+ auto ret = tier::findBestConsolidationCandidate<tier::SegmentStats>(tiers, max_segments_bytes, best);
369+ if (!ret) {
370+ return ;
371+ }
372+
373+ std::copy (best.first (), best.last () + 1 , std::back_inserter (candidates));
473374 };
474375}
475376
0 commit comments