Skip to content

Commit fb73fcd

Browse files
committed
Consolidation issue fix.
- Fixed the tier-based candidate selection - Default tiers are powers of 4, with the first tier being 0-4M, followed by 4-16M, 16-64M, and so on. - Fixed consolidation window of size 4
1 parent 872d553 commit fb73fcd

File tree

3 files changed

+230
-182
lines changed

3 files changed

+230
-182
lines changed

core/index/index_meta.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@ class format;
3939
class IndexWriter;
4040

4141
struct SegmentInfo {
42+
SegmentInfo() = default;
43+
44+
// Added for testing purposes.
45+
SegmentInfo(
46+
const std::string& _name,
47+
uint64_t _byte_size
48+
) : name(_name), byte_size(_byte_size)
49+
{}
50+
4251
bool operator==(const SegmentInfo&) const = default;
4352

4453
std::string name; // FIXME(gnusi): move to SegmentMeta

core/utils/index_utils.cpp

Lines changed: 29 additions & 182 deletions
Original file line numberDiff line numberDiff line change
@@ -28,149 +28,27 @@
2828

2929
#include "formats/format_utils.hpp"
3030

31-
namespace {
32-
33-
// Returns percentage of live documents
34-
inline double FillFactor(const irs::SegmentInfo& segment) noexcept {
35-
return static_cast<double>(segment.live_docs_count) /
36-
static_cast<double>(segment.docs_count);
37-
}
38-
39-
// Returns approximated size of a segment in the absence of removals
40-
inline size_t SizeWithoutRemovals(const irs::SegmentInfo& segment) noexcept {
41-
return size_t(static_cast<double>(segment.byte_size) * FillFactor(segment));
42-
}
43-
4431
namespace tier {
4532

46-
struct SegmentStats {
47-
// cppcheck-suppress noExplicitConstructor
48-
SegmentStats(const irs::SubReader& reader) noexcept
49-
: reader{&reader},
50-
meta{&reader.Meta()},
51-
size{SizeWithoutRemovals(*meta)},
52-
fill_factor{FillFactor(*meta)} {}
53-
54-
bool operator<(const SegmentStats& rhs) const noexcept {
55-
// cppcheck-suppress constVariable
56-
auto& lhs = *this;
57-
58-
if (lhs.size == rhs.size) {
59-
if (lhs.fill_factor > rhs.fill_factor) {
60-
return true;
61-
} else if (lhs.fill_factor < rhs.fill_factor) {
62-
return false;
63-
}
64-
65-
return lhs.meta->name < rhs.meta->name;
33+
// interface to fetch the required attributes from
34+
// SegmentStats struct.
35+
// We use this function in struct ConsolidationCandidate
36+
// to fetch the segment dimensions from the SegmentStats
37+
// struct.
38+
//
39+
void getSegmentDimensions(
40+
std::vector<tier::SegmentStats>::const_iterator itr,
41+
uint64_t& byte_size,
42+
uint64_t& docs_count,
43+
uint64_t& live_docs_count) {
44+
45+
auto itrMeta = itr->meta;
46+
byte_size = itrMeta->byte_size;
47+
docs_count = itrMeta->docs_count;
48+
live_docs_count = itrMeta->live_docs_count;
6649
}
67-
68-
return lhs.size < rhs.size;
69-
}
70-
71-
operator const irs::SubReader*() const noexcept { return reader; }
72-
73-
const irs::SubReader* reader;
74-
const irs::SegmentInfo* meta;
75-
size_t size; // approximate size of segment without removals
76-
double_t fill_factor;
77-
};
78-
79-
struct ConsolidationCandidate {
80-
using iterator_t = std::vector<SegmentStats>::const_iterator;
81-
using range_t = std::pair<iterator_t, iterator_t>;
82-
83-
explicit ConsolidationCandidate(iterator_t i) noexcept : segments(i, i) {}
84-
85-
iterator_t begin() const noexcept { return segments.first; }
86-
iterator_t end() const noexcept { return segments.second; }
87-
88-
range_t segments;
89-
size_t count{0};
90-
size_t size{0}; // estimated size of the level
91-
double_t score{DBL_MIN}; // how good this permutation is
92-
};
93-
94-
/// @returns score of the consolidation bucket
95-
double_t consolidation_score(const ConsolidationCandidate& consolidation,
96-
const size_t segments_per_tier,
97-
const size_t floor_segment_bytes) noexcept {
98-
// to detect how skewed the consolidation we do the following:
99-
// 1. evaluate coefficient of variation, less is better
100-
// 2. good candidates are in range [0;1]
101-
// 3. favor candidates where number of segments is equal to
102-
// 'segments_per_tier' approx
103-
// 4. prefer smaller consolidations
104-
// 5. prefer consolidations which clean removals
105-
106-
switch (consolidation.count) {
107-
case 0:
108-
// empty consolidation makes no sense
109-
return DBL_MIN;
110-
case 1: {
111-
auto& meta = *consolidation.segments.first->meta;
112-
113-
if (meta.docs_count == meta.live_docs_count) {
114-
// singleton without removals makes no sense
115-
return DBL_MIN;
116-
}
117-
118-
// FIXME honor number of deletes???
119-
// singleton with removals makes sense if nothing better is found
120-
return DBL_MIN + DBL_EPSILON;
121-
}
122-
}
123-
124-
size_t size_before_consolidation = 0;
125-
size_t size_after_consolidation = 0;
126-
size_t size_after_consolidation_floored = 0;
127-
for (auto& segment_stat : consolidation) {
128-
size_before_consolidation += segment_stat.meta->byte_size;
129-
size_after_consolidation += segment_stat.size;
130-
size_after_consolidation_floored +=
131-
std::max(segment_stat.size, floor_segment_bytes);
132-
}
133-
134-
// evaluate coefficient of variation
135-
double sum_square_differences = 0;
136-
const auto segment_size_after_consolidaton_mean =
137-
static_cast<double>(size_after_consolidation_floored) /
138-
static_cast<double>(consolidation.count);
139-
for (auto& segment_stat : consolidation) {
140-
const auto diff =
141-
static_cast<double>(std::max(segment_stat.size, floor_segment_bytes)) -
142-
segment_size_after_consolidaton_mean;
143-
sum_square_differences += diff * diff;
144-
}
145-
146-
const auto stdev = std::sqrt(sum_square_differences /
147-
static_cast<double>(consolidation.count));
148-
const auto cv = (stdev / segment_size_after_consolidaton_mean);
149-
150-
// evaluate initial score
151-
auto score = 1. - cv;
152-
153-
// favor consolidations that contain approximately the requested number of
154-
// segments
155-
score *= std::pow(static_cast<double>(consolidation.count) /
156-
static_cast<double>(segments_per_tier),
157-
1.5);
158-
159-
// FIXME use relative measure, e.g. consolidation_size/total_size
160-
// carefully prefer smaller consolidations over the bigger ones
161-
score /= std::pow(size_after_consolidation, 0.5);
162-
163-
// favor consolidations which clean out removals
164-
score /= std::pow(static_cast<double>(size_after_consolidation) /
165-
static_cast<double>(size_before_consolidation),
166-
2);
167-
168-
return score;
16950
}
17051

171-
} // namespace tier
172-
} // namespace
173-
17452
namespace irs::index_utils {
17553

17654
ConsolidationPolicy MakePolicy(const ConsolidateBytes& options) {
@@ -391,6 +269,9 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
391269
/// if
392270
/// - segment size is greater than 'max_segments_bytes / 2'
393271
/// - segment has many documents but only few deletions
272+
///
273+
/// TODO - too_big_segments_threshold formula is unreasonable
274+
/// - add unit tests as well
394275
///////////////////////////////////////////////////////////////////////////
395276

396277
const double_t total_fill_factor =
@@ -413,63 +294,29 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
413294
}
414295
}
415296

297+
// No point in attempting consolidation if we don't have
298+
// enough segments to fill the consolidation window
299+
if (sorted_segments.size() < tier::ConsolidationConfig::candidate_size)
300+
return;
301+
416302
///////////////////////////////////////////////////////////////////////////
417303
/// Stage 3
418304
/// sort candidates
419305
///////////////////////////////////////////////////////////////////////////
420306

421307
std::sort(sorted_segments.begin(), sorted_segments.end());
308+
tier::ConsolidationCandidate<tier::SegmentStats> best;
422309

423310
///////////////////////////////////////////////////////////////////////////
424311
/// Stage 4
425312
/// find proper candidates
426313
///////////////////////////////////////////////////////////////////////////
427314

428-
tier::ConsolidationCandidate best(sorted_segments.begin());
429-
430-
if (sorted_segments.size() >= min_segments_per_tier) {
431-
for (auto i = sorted_segments.begin(), end = sorted_segments.end();
432-
i != end; ++i) {
433-
tier::ConsolidationCandidate candidate(i);
434-
435-
while (candidate.segments.second != end &&
436-
candidate.count < max_segments_per_tier) {
437-
candidate.size += candidate.segments.second->size;
438-
439-
if (candidate.size > max_segments_bytes) {
440-
// overcome the limit
441-
break;
442-
}
443-
444-
++candidate.count;
445-
++candidate.segments.second;
446-
447-
if (candidate.count < min_segments_per_tier) {
448-
// not enough segments yet
449-
continue;
450-
}
451-
452-
candidate.score = tier::consolidation_score(
453-
candidate, max_segments_per_tier, floor_segment_bytes);
454-
455-
if (candidate.score < min_score) {
456-
// score is too small
457-
continue;
458-
}
459-
460-
if (best.score < candidate.score) {
461-
best = candidate;
462-
}
463-
}
464-
}
465-
}
466-
467-
///////////////////////////////////////////////////////////////////////////
468-
/// Stage 5
469-
/// pick the best candidate
470-
///////////////////////////////////////////////////////////////////////////
315+
if (!tier::findBestConsolidationCandidate<tier::SegmentStats>(sorted_segments, tier::getSegmentDimensions, best))
316+
return;
471317

472-
std::copy(best.begin(), best.end(), std::back_inserter(candidates));
318+
candidates.reserve(tier::ConsolidationConfig::candidate_size);
319+
std::copy(best.first(), best.last() + 1, std::back_inserter(candidates));
473320
};
474321
}
475322

0 commit comments

Comments
 (0)