Skip to content

Commit c4ce867

Browse files
committed
Fixed consolidation candidate selection approach
Changed consolidation config defaults
1 parent d91b909 commit c4ce867

File tree

2 files changed

+142
-24
lines changed

2 files changed

+142
-24
lines changed

core/utils/index_utils.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -301,21 +301,28 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
301301

302302
///////////////////////////////////////////////////////////////////////////
303303
/// Stage 3
304-
/// sort candidates
304+
/// Find cleanup candidates
305305
///////////////////////////////////////////////////////////////////////////
306306

307-
std::sort(sorted_segments.begin(), sorted_segments.end());
308307
tier::ConsolidationCandidate<tier::SegmentStats> best;
308+
auto ret = tier::findBestCleanupCandidate<tier::SegmentStats>(sorted_segments, tier::getSegmentDimensions, best);
309+
if (ret && best.initialized && std::distance(best.first(), best.last()) >= 0) {
310+
std::copy(best.first(), best.last() + 1, std::back_inserter(candidates));
311+
return;
312+
}
309313

310314
///////////////////////////////////////////////////////////////////////////
311315
/// Stage 4
312-
/// find proper candidates
316+
/// find consolidation candidates
313317
///////////////////////////////////////////////////////////////////////////
314318

315-
if (!tier::findBestConsolidationCandidate<tier::SegmentStats>(sorted_segments, tier::getSegmentDimensions, best))
319+
if (!tier::findBestConsolidationCandidate<tier::SegmentStats>(
320+
sorted_segments,
321+
max_segments_bytes,
322+
tier::getSegmentDimensions, best))
316323
return;
317324

318-
candidates.reserve(tier::ConsolidationConfig::candidate_size);
325+
candidates.reserve(std::distance(best.first(), best.last()) + 1);
319326
std::copy(best.first(), best.last() + 1, std::back_inserter(candidates));
320327
};
321328
}

core/utils/index_utils.hpp

Lines changed: 130 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,12 @@ namespace tier {
7979
};
8080

8181
// Compile-time constants used by the tier consolidation candidate search.
struct ConsolidationConfig {
82-
static constexpr size_t candidate_size { 4 }; // candidate selection window size: 4
83-
static constexpr double maxMergeScore { 1.5 }; // max score allowed for candidates consolidation.
82+
static constexpr size_t candidate_size { 2 }; // candidate selection window size: 2
83+
static constexpr double maxMergeScore { 0.4 }; // max score allowed for candidates consolidation.
8484
// Skip consolidation if candidate score is greater
8585
// than this value.
86+
static constexpr double maxLivePercentage { 0.5 }; // Max live-docs ratio (live docs / total docs, a fraction
87+
// rather than a percentage) for segments to be picked for cleanup during consolidation.
8688
};
8789

8890
// interface to fetch the required attributes from
@@ -125,62 +127,119 @@ namespace tier {
125127
do
126128
{
127129
accessor_(itr, byte_size, docs_count, live_docs_count);
128-
129130
mergeBytes += byte_size;
130-
delCount += (docs_count - live_docs_count);
131131

132132
} while (itr++ != end);
133133

134134
skew = static_cast<double>(byte_size) / mergeBytes;
135-
mergeScore = skew + (1.0 / (1 + delCount));
136-
cost = mergeBytes * mergeScore;
135+
mergeScore = skew;
137136
}
138137

139-
void advance() noexcept {
138+
bool pop_front() {
140139
if (!initialized)
141-
return;
140+
return false;
142141

143-
const auto& removeSegment = segments.first;
144-
const auto& addSegment = segments.second + 1;
142+
const auto removeSegment = first();
143+
const auto lastSegment = last();
145144

146145
std::advance(segments.first, 1);
147-
std::advance(segments.second, 1);
148146

149147
// Segment to be removed
150148
uint64_t rem_byte_size;
151149
uint64_t rem_docs_count;
152150
uint64_t rem_live_docs_count;
153151
accessor_(removeSegment, rem_byte_size, rem_docs_count, rem_live_docs_count);
154-
auto rem_del_count = rem_docs_count - rem_live_docs_count;
152+
153+
uint64_t last_seg_byte_size;
154+
uint64_t ignore;
155+
accessor_(lastSegment, last_seg_byte_size, ignore, ignore);
156+
157+
mergeBytes -= rem_byte_size;
158+
skew = static_cast<double>(last_seg_byte_size) / mergeBytes;
159+
mergeScore = skew;
160+
161+
return true;
162+
}
163+
164+
bool push_back() noexcept {
165+
if (!initialized)
166+
return false;
167+
168+
const auto addSegment = segments.second + 1;
169+
170+
std::advance(segments.second, 1);
155171

156172
// Segment to be added
157173
uint64_t add_byte_size;
158174
uint64_t add_docs_count;
159175
uint64_t add_live_docs_count;
160176
accessor_(addSegment, add_byte_size, add_docs_count, add_live_docs_count);
161-
auto add_del_count = add_docs_count - add_live_docs_count;
162177

163-
mergeBytes = mergeBytes - rem_byte_size + add_byte_size;
178+
mergeBytes += add_byte_size;
164179
skew = static_cast<double>(add_byte_size) / mergeBytes;
165-
delCount = delCount - rem_del_count + add_del_count;
166-
mergeScore = skew + (1.0 / (1 + delCount));
167-
cost = mergeBytes * mergeScore;
180+
mergeScore = skew;
181+
182+
return true;
168183
}
169184

170185
iterator_t first() const noexcept { return segments.first; }
171186
iterator_t last() const noexcept { return segments.second; }
172187

173188
size_t mergeBytes { 0 };
174189
double skew { 0.0 };
175-
size_t delCount { 0 };
176190
double mergeScore { 0.0 };
177-
double cost { 0.0 };
178191
bool initialized { false };
179192

180193
range_t segments;
181194
std::function<void(iterator_t, uint64_t&, uint64_t&, uint64_t&)> accessor_;
182195
};
183196

197+
template<typename Segment>
198+
bool findBestCleanupCandidate(
199+
std::vector<Segment>& segments,
200+
const std::function<
201+
void(const typename std::vector<Segment>::const_iterator,
202+
uint64_t& /* byte_size */,
203+
uint64_t& /* docs_count */,
204+
uint64_t& /* live_docs_count */)>& getSegmentAttributes,
205+
tier::ConsolidationCandidate<Segment>& best) {
206+
207+
auto segmentSortFunc = [](const Segment& left, const Segment& right) {
208+
209+
auto lMeta = left.meta;
210+
auto rMeta = right.meta;
211+
auto lLivePerc = static_cast<double>(lMeta->live_docs_count) / lMeta->docs_count;
212+
auto rLivePerc = static_cast<double>(rMeta->live_docs_count) / rMeta->docs_count;
213+
214+
return (lLivePerc < rLivePerc);
215+
};
216+
217+
std::sort(segments.begin(), segments.end(), segmentSortFunc);
218+
219+
auto count = 0;
220+
auto total_docs_count = 0;
221+
auto total_live_docs_count = 0;
222+
double livePerc;
223+
224+
for (auto itr = segments.begin(); itr != segments.end(); itr++) {
225+
auto meta = itr->meta;
226+
total_docs_count += meta->docs_count;
227+
total_live_docs_count += meta->live_docs_count;
228+
229+
livePerc = static_cast<double>(total_live_docs_count) / total_docs_count;
230+
if (livePerc > tier::ConsolidationConfig::maxLivePercentage)
231+
break;
232+
233+
++count;
234+
}
235+
236+
if (count < 1)
237+
return false;
238+
239+
best = ConsolidationCandidate<Segment>(segments.begin(), segments.begin() + count - 1, getSegmentAttributes);
240+
return true;
241+
}
242+
184243
//
185244
// This function receives a sorted vector of segments
186245
// and finds the best contiguous subset of segments to
@@ -197,13 +256,65 @@ namespace tier {
197256
//
198257
template<typename Segment>
199258
bool findBestConsolidationCandidate(
259+
std::vector<Segment>& sorted_segments,
260+
size_t max_segments_bytes,
261+
const std::function<
262+
void(const typename std::vector<Segment>::const_iterator,
263+
uint64_t& /* byte_size */,
264+
uint64_t& /* docs_count */,
265+
uint64_t& /* live_docs_count */)>& getSegmentAttributes,
266+
tier::ConsolidationCandidate<Segment>& best) {
267+
268+
// sort segments in increasing order of the segment byte size
269+
std::sort(sorted_segments.begin(), sorted_segments.end());
270+
271+
// We start with a min. window size of 2
272+
// since a window of size 1 will always
273+
// give us a skew of 1.0.
274+
uint64_t minWindowSize { tier::ConsolidationConfig::candidate_size };
275+
auto front = sorted_segments.begin();
276+
auto rear = front + minWindowSize - 1;
277+
tier::ConsolidationCandidate<Segment> candidate(front, rear, getSegmentAttributes);
278+
279+
double prev_score { 1.0 };
280+
281+
while ((candidate.first() + 1) < sorted_segments.end()) {
282+
283+
if (!best.initialized || (best.mergeScore > candidate.mergeScore && candidate.mergeBytes <= max_segments_bytes))
284+
best = candidate;
285+
286+
if (std::distance(candidate.first(), candidate.last()) < (minWindowSize - 1)) {
287+
candidate.push_back();
288+
continue;
289+
}
290+
291+
if (candidate.mergeScore > prev_score ||
292+
candidate.mergeBytes > max_segments_bytes ||
293+
candidate.last() == (sorted_segments.end() - 1)) {
294+
prev_score = candidate.mergeScore;
295+
candidate.pop_front();
296+
}
297+
else if (candidate.mergeScore <= prev_score && candidate.last() < (sorted_segments.end() - 1) &&
298+
candidate.mergeBytes <= max_segments_bytes) {
299+
prev_score = candidate.mergeScore;
300+
candidate.push_back();
301+
}
302+
}
303+
304+
return (best.initialized &&
305+
best.mergeScore <= tier::ConsolidationConfig::maxMergeScore);
306+
}
307+
308+
template<typename Segment>
309+
bool findBestConsolidationCandidate1(
200310
const std::vector<Segment>& sorted_segments,
201311
const std::function<
202312
void(const typename std::vector<Segment>::const_iterator,
203313
uint64_t& /* byte_size */,
204314
uint64_t& /* docs_count */,
205315
uint64_t& /* live_docs_count */)>& getSegmentAttributes,
206316
tier::ConsolidationCandidate<Segment>& best) {
317+
207318
if (sorted_segments.size() < tier::ConsolidationConfig::candidate_size)
208319
return false;
209320

0 commit comments

Comments
 (0)