2828
2929#include " formats/format_utils.hpp"
3030
31- namespace {
32-
33- // Returns percentage of live documents
34- inline double FillFactor (const irs::SegmentInfo& segment) noexcept {
35- return static_cast <double >(segment.live_docs_count ) /
36- static_cast <double >(segment.docs_count );
37- }
38-
39- // Returns approximated size of a segment in the absence of removals
40- inline size_t SizeWithoutRemovals (const irs::SegmentInfo& segment) noexcept {
41- return size_t (static_cast <double >(segment.byte_size ) * FillFactor (segment));
42- }
43-
4431namespace tier {
4532
46- struct SegmentStats {
47- // cppcheck-suppress noExplicitConstructor
48- SegmentStats (const irs::SubReader& reader) noexcept
49- : reader{&reader},
50- meta{&reader.Meta ()},
51- size{SizeWithoutRemovals (*meta)},
52- fill_factor{FillFactor (*meta)} {}
53-
54- bool operator <(const SegmentStats& rhs) const noexcept {
55- // cppcheck-suppress constVariable
56- auto & lhs = *this ;
57-
58- if (lhs.size == rhs.size ) {
59- if (lhs.fill_factor > rhs.fill_factor ) {
60- return true ;
61- } else if (lhs.fill_factor < rhs.fill_factor ) {
62- return false ;
63- }
64-
65- return lhs.meta ->name < rhs.meta ->name ;
33+ // Accessor that reads the segment dimensions of a SegmentStats entry:
34+ // copies byte_size, docs_count and live_docs_count from the object
35+ // pointed to by itr->meta into the three output references.
36+ // `itr` and `itr->meta` are dereferenced without any checks.
37+ // Passed to findBestConsolidationCandidate as the dimension getter
38+ // (original note: also used by struct ConsolidationCandidate).
39+ void getSegmentDimensions (
40+ std::vector<tier::SegmentStats>::const_iterator itr,
41+ uint64_t & byte_size,
42+ uint64_t & docs_count,
43+ uint64_t & live_docs_count) {
44+
45+ auto itrMeta = itr->meta ;
46+ byte_size = itrMeta->byte_size ;
47+ docs_count = itrMeta->docs_count ;
48+ live_docs_count = itrMeta->live_docs_count ;
6649 }
67-
68- return lhs.size < rhs.size ;
69- }
70-
71- operator const irs::SubReader*() const noexcept { return reader; }
72-
73- const irs::SubReader* reader;
74- const irs::SegmentInfo* meta;
75- size_t size; // approximate size of segment without removals
76- double_t fill_factor;
77- };
78-
79- struct ConsolidationCandidate {
80- using iterator_t = std::vector<SegmentStats>::const_iterator;
81- using range_t = std::pair<iterator_t , iterator_t >;
82-
83- explicit ConsolidationCandidate (iterator_t i) noexcept : segments(i, i) {}
84-
85- iterator_t begin () const noexcept { return segments.first ; }
86- iterator_t end () const noexcept { return segments.second ; }
87-
88- range_t segments;
89- size_t count{0 };
90- size_t size{0 }; // estimated size of the level
91- double_t score{DBL_MIN}; // how good this permutation is
92- };
93-
94- // / @returns score of the consolidation bucket
95- double_t consolidation_score (const ConsolidationCandidate& consolidation,
96- const size_t segments_per_tier,
97- const size_t floor_segment_bytes) noexcept {
98- // to detect how skewed the consolidation is, we do the following:
99- // 1. evaluate coefficient of variation, less is better
100- // 2. good candidates are in range [0;1]
101- // 3. favor candidates where number of segments is equal to
102- // 'segments_per_tier' approx
103- // 4. prefer smaller consolidations
104- // 5. prefer consolidations which clean removals
105-
106- switch (consolidation.count ) {
107- case 0 :
108- // empty consolidation makes no sense
109- return DBL_MIN;
110- case 1 : {
111- auto & meta = *consolidation.segments .first ->meta ;
112-
113- if (meta.docs_count == meta.live_docs_count ) {
114- // singleton without removals makes no sense
115- return DBL_MIN;
116- }
117-
118- // FIXME honor number of deletes???
119- // singleton with removals makes sense if nothing better is found
120- return DBL_MIN + DBL_EPSILON;
121- }
122- }
123-
124- size_t size_before_consolidation = 0 ;
125- size_t size_after_consolidation = 0 ;
126- size_t size_after_consolidation_floored = 0 ;
127- for (auto & segment_stat : consolidation) {
128- size_before_consolidation += segment_stat.meta ->byte_size ;
129- size_after_consolidation += segment_stat.size ;
130- size_after_consolidation_floored +=
131- std::max (segment_stat.size , floor_segment_bytes);
132- }
133-
134- // evaluate coefficient of variation
135- double sum_square_differences = 0 ;
136- const auto segment_size_after_consolidaton_mean =
137- static_cast <double >(size_after_consolidation_floored) /
138- static_cast <double >(consolidation.count );
139- for (auto & segment_stat : consolidation) {
140- const auto diff =
141- static_cast <double >(std::max (segment_stat.size , floor_segment_bytes)) -
142- segment_size_after_consolidaton_mean;
143- sum_square_differences += diff * diff;
144- }
145-
146- const auto stdev = std::sqrt (sum_square_differences /
147- static_cast <double >(consolidation.count ));
148- const auto cv = (stdev / segment_size_after_consolidaton_mean);
149-
150- // evaluate initial score
151- auto score = 1 . - cv;
152-
153- // favor consolidations that contain approximately the requested number of
154- // segments
155- score *= std::pow (static_cast <double >(consolidation.count ) /
156- static_cast <double >(segments_per_tier),
157- 1.5 );
158-
159- // FIXME use relative measure, e.g. consolidation_size/total_size
160- // carefully prefer smaller consolidations over the bigger ones
161- score /= std::pow (size_after_consolidation, 0.5 );
162-
163- // favor consolidations which clean out removals
164- score /= std::pow (static_cast <double >(size_after_consolidation) /
165- static_cast <double >(size_before_consolidation),
166- 2 );
167-
168- return score;
16950}
17051
171- } // namespace tier
172- } // namespace
173-
17452namespace irs ::index_utils {
17553
17654ConsolidationPolicy MakePolicy (const ConsolidateBytes& options) {
@@ -391,6 +269,9 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
391269 // / if
392270 // / - segment size is greater than 'max_segments_bytes / 2'
393271 // / - segment has many documents but only few deletions
272+ // /
273+ // / TODO - too_big_segments_threshold formula is unreasonable
274+ // / - add unit tests as well
394275 // /////////////////////////////////////////////////////////////////////////
395276
396277 const double_t total_fill_factor =
@@ -413,63 +294,29 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
413294 }
414295 }
415296
297+ // No point in attempting consolidation if we don't have
298+ // enough segments to fill the consolidation window
299+ if (sorted_segments.size () < tier::ConsolidationConfig::candidate_size)
300+ return ;
301+
416302 // /////////////////////////////////////////////////////////////////////////
417303 // / Stage 3
418304 // / sort candidates
419305 // /////////////////////////////////////////////////////////////////////////
420306
421307 std::sort (sorted_segments.begin (), sorted_segments.end ());
308+ tier::ConsolidationCandidate<tier::SegmentStats> best;
422309
423310 // /////////////////////////////////////////////////////////////////////////
424311 // / Stage 4
425312 // / find proper candidates
426313 // /////////////////////////////////////////////////////////////////////////
427314
428- tier::ConsolidationCandidate best (sorted_segments.begin ());
429-
430- if (sorted_segments.size () >= min_segments_per_tier) {
431- for (auto i = sorted_segments.begin (), end = sorted_segments.end ();
432- i != end; ++i) {
433- tier::ConsolidationCandidate candidate (i);
434-
435- while (candidate.segments .second != end &&
436- candidate.count < max_segments_per_tier) {
437- candidate.size += candidate.segments .second ->size ;
438-
439- if (candidate.size > max_segments_bytes) {
440- // overcome the limit
441- break ;
442- }
443-
444- ++candidate.count ;
445- ++candidate.segments .second ;
446-
447- if (candidate.count < min_segments_per_tier) {
448- // not enough segments yet
449- continue ;
450- }
451-
452- candidate.score = tier::consolidation_score (
453- candidate, max_segments_per_tier, floor_segment_bytes);
454-
455- if (candidate.score < min_score) {
456- // score is too small
457- continue ;
458- }
459-
460- if (best.score < candidate.score ) {
461- best = candidate;
462- }
463- }
464- }
465- }
466-
467- // /////////////////////////////////////////////////////////////////////////
468- // / Stage 4
469- // / pick the best candidate
470- // /////////////////////////////////////////////////////////////////////////
315+ if (!tier::findBestConsolidationCandidate<tier::SegmentStats>(sorted_segments, tier::getSegmentDimensions, best))
316+ return ;
471317
472- std::copy (best.begin (), best.end (), std::back_inserter (candidates));
318+ candidates.reserve (tier::ConsolidationConfig::candidate_size);
319+ std::copy (best.first (), best.last () + 1 , std::back_inserter (candidates));
473320 };
474321}
475322
0 commit comments