Skip to content

Commit ae1f202

Browse files
committed
Added more unit tests for testing consolidation
1 parent afe615c commit ae1f202

File tree

3 files changed

+364
-58
lines changed

3 files changed

+364
-58
lines changed

core/utils/index_utils.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ namespace tier {
4141
tier::SegmentAttributes& attrs) {
4242

4343
auto* meta = segment.meta;
44-
attrs.byte_size = meta->byte_size;
45-
attrs.docs_count = meta->docs_count;
46-
attrs.live_docs_count = meta->live_docs_count;
44+
attrs.byteSize = meta->byte_size;
45+
attrs.docsCount = meta->docs_count;
46+
attrs.liveDocsCount = meta->live_docs_count;
4747
}
4848
}
4949

core/utils/index_utils.hpp

Lines changed: 64 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,13 @@ namespace tier {
7171
};
7272

7373
struct SegmentAttributes {
74-
uint64_t byte_size { 0 };
75-
uint64_t docs_count { 0 };
76-
uint64_t live_docs_count { 0 };
74+
uint64_t byteSize { 0 };
75+
uint64_t docsCount { 0 };
76+
uint64_t liveDocsCount { 0 };
7777

7878
SegmentAttributes() = default;
7979
SegmentAttributes(uint64_t b, uint64_t d, uint64_t l) :
80-
byte_size(b), docs_count(d), live_docs_count(l) {}
80+
byteSize(b), docsCount(d), liveDocsCount(l) {}
8181
};
8282

8383
// interface to fetch the required attributes from
@@ -115,11 +115,11 @@ namespace tier {
115115
do
116116
{
117117
accessor_(*itr, attrs);
118-
mergeBytes += attrs.byte_size;
118+
mergeBytes += attrs.byteSize;
119119

120120
} while (itr++ != end);
121121

122-
skew = static_cast<double>(attrs.byte_size) / mergeBytes;
122+
skew = static_cast<double>(attrs.byteSize) / mergeBytes;
123123
mergeScore = skew;
124124
}
125125

@@ -136,14 +136,14 @@ namespace tier {
136136
std::advance(segments.first, 1);
137137

138138
// Segment to be removed
139-
SegmentAttributes rem_seg_attrs;
140-
accessor_(*removeSegment, rem_seg_attrs);
139+
SegmentAttributes remSegAttrs;
140+
accessor_(*removeSegment, remSegAttrs);
141141

142-
SegmentAttributes last_seg_attrs;
143-
accessor_(*lastSegment, last_seg_attrs);
142+
SegmentAttributes lastSegAttrs;
143+
accessor_(*lastSegment, lastSegAttrs);
144144

145-
mergeBytes -= rem_seg_attrs.byte_size;
146-
skew = static_cast<double>(last_seg_attrs.byte_size) / mergeBytes;
145+
mergeBytes -= remSegAttrs.byteSize;
146+
skew = static_cast<double>(lastSegAttrs.byteSize) / mergeBytes;
147147
mergeScore = skew;
148148

149149
return true;
@@ -164,8 +164,8 @@ namespace tier {
164164
SegmentAttributes attrs;
165165
accessor_(*addSegment, attrs);
166166

167-
mergeBytes += attrs.byte_size;
168-
skew = static_cast<double>(attrs.byte_size) / mergeBytes;
167+
mergeBytes += attrs.byteSize;
168+
skew = static_cast<double>(attrs.byteSize) / mergeBytes;
169169
mergeScore = skew;
170170

171171
return true;
@@ -188,38 +188,38 @@ namespace tier {
188188
std::vector<Segment>& segments,
189189
const std::function<
190190
void(const Segment&,
191-
tier::SegmentAttributes& /* byte_size */
191+
tier::SegmentAttributes&
192192
)>& getSegmentAttributes,
193193
tier::ConsolidationCandidate<Segment>& best) {
194194

195195
auto segmentSortFunc = [&](const Segment& left, const Segment& right) {
196196

197197
tier::SegmentAttributes attrs;
198198
getSegmentAttributes(left, attrs);
199-
auto lLivePerc = static_cast<double>(attrs.live_docs_count) / attrs.docs_count;
199+
auto lLivePerc = static_cast<double>(attrs.liveDocsCount) / attrs.docsCount;
200200

201201
getSegmentAttributes(right, attrs);
202-
auto rLivePerc = static_cast<double>(attrs.live_docs_count) / attrs.docs_count;
202+
auto rLivePerc = static_cast<double>(attrs.liveDocsCount) / attrs.docsCount;
203203

204204
return lLivePerc < rLivePerc;
205205
};
206206

207207
std::sort(segments.begin(), segments.end(), segmentSortFunc);
208208

209209
auto count = 0;
210-
auto total_docs_count = 0;
211-
auto total_live_docs_count = 0;
210+
auto totalDocsCount = 0;
211+
auto totalLiveDocsCount = 0;
212212
double livePerc;
213213

214214
for (auto itr = segments.begin(); itr != segments.end(); itr++) {
215215

216216
tier::SegmentAttributes attrs;
217217
getSegmentAttributes(*itr, attrs);
218218

219-
total_docs_count += attrs.docs_count;
220-
total_live_docs_count += attrs.live_docs_count;
219+
totalDocsCount += attrs.docsCount;
220+
totalLiveDocsCount += attrs.liveDocsCount;
221221

222-
livePerc = static_cast<double>(total_live_docs_count) / total_docs_count;
222+
livePerc = static_cast<double>(totalLiveDocsCount) / totalDocsCount;
223223
if (livePerc > tier::ConsolidationConfig::maxLivePercentage)
224224
break;
225225

@@ -234,72 +234,85 @@ namespace tier {
234234
}
235235

236236
//
237-
// This function receives a sorted vector of segments
238-
// and finds the best contiguous subset of segments to
239-
// merge together.
237+
// This function receives a set of segments and finds
238+
// the best subset to merge together.
240239
// The best subset is defined as the one with the lowest
241-
// merge cost and the merge cost is computed inside the
242-
// ConslidateCandidate struct.
240+
// merge cost (i.e. skew). The merge cost is computed inside
241+
// the ConsolidationCandidate struct upon candidate init,
242+
// push_back() and pop_front() operations.
243243
//
244-
// findBestConsolidationCandidate merely sets a rolling
245-
// window of size 4 on a subset of segments, it lets
246-
// ConsolidationCandidate compute the cost of merge for that
247-
// subset, repeats this process for all contiguous subsets
248-
// and identifies the best candidate.
244+
// findBestConsolidationCandidate sorts the set of segments
245+
// in increasing order of segment byte size and then finds
246+
// the largest possible subset of segments whose consolidated
247+
// size is within the maxSegmentsBytes range and has the
248+
// lowest skew.
249+
//
250+
// Currently it is only executed with struct tier::SegmentStats
251+
// as the template argument in ArangoSearch. However, we leverage
252+
// this templatized design for writing unit tests.
253+
//
254+
// findBestConsolidationCandidate does not use the live %
255+
// to find the best candidate. It only needs the segment
256+
// byte size.
249257
//
250258
template<typename Segment>
251259
bool findBestConsolidationCandidate(
252-
std::vector<Segment>& sorted_segments,
253-
size_t max_segments_bytes,
260+
std::vector<Segment>& segments,
261+
size_t maxSegmentsBytes,
254262
const std::function<
255263
void(const Segment&,
256264
SegmentAttributes&
257265
)>& getSegmentAttributes,
258266
tier::ConsolidationCandidate<Segment>& best) {
259267

268+
// comparator: order segments by byte size (ascending); ties go to the higher live-docs ratio
260269
auto comp = [&](const Segment& lhs, const Segment& rhs) {
261270

262-
SegmentAttributes l_attrs;
263-
SegmentAttributes r_attrs;
271+
SegmentAttributes lAttrs;
272+
SegmentAttributes rAttrs;
273+
274+
getSegmentAttributes(lhs, lAttrs);
275+
getSegmentAttributes(rhs, rAttrs);
264276

265-
getSegmentAttributes(lhs, l_attrs);
266-
getSegmentAttributes(rhs, r_attrs);
277+
if (lAttrs.byteSize == rAttrs.byteSize) {
267278

268-
if (l_attrs.byte_size == r_attrs.byte_size) {
269-
270-
double lfill_factor = static_cast<double>(l_attrs.live_docs_count) / l_attrs.docs_count;
271-
double rfill_factor = static_cast<double>(r_attrs.live_docs_count) / r_attrs.docs_count;
279+
double lfill_factor = static_cast<double>(lAttrs.liveDocsCount) / lAttrs.docsCount;
280+
double rfill_factor = static_cast<double>(rAttrs.liveDocsCount) / rAttrs.docsCount;
272281
return lfill_factor > rfill_factor;
273282
}
274283

275-
return l_attrs.byte_size < r_attrs.byte_size;
284+
return lAttrs.byteSize < rAttrs.byteSize;
276285
};
277286

278287
// sort segments in increasing order of the segment byte size
279-
std::sort(sorted_segments.begin(), sorted_segments.end(), comp);
288+
std::sort(segments.begin(), segments.end(), comp);
280289

281290
// We start with a min. window size of 2
282291
// since a window of size 1 will always
283292
// give us a skew of 1.0.
284293
uint64_t minWindowSize { tier::ConsolidationConfig::candidate_size };
285-
auto front = sorted_segments.begin();
294+
auto front = segments.begin();
286295
auto rear = front + minWindowSize - 1;
287296
tier::ConsolidationCandidate<Segment> candidate(front, rear, getSegmentAttributes);
288297

289298
// Algorithm:
290-
// We
299+
// We start by setting the smallest possible window on the list of
300+
// sorted segments. We move the right end ahead to add more segments to
301+
// the window and we incrementally compute the merge cost for each subset.
302+
// We move the left end ahead to remove segments from the window and we
303+
// only do this when we're over the maxSegmentsBytes limit.
291304
while ((candidate.first() + minWindowSize - 1) <= candidate.last() &&
292-
candidate.last() < sorted_segments.end()) {
305+
candidate.last() < segments.end()) {
293306

294-
if (candidate.mergeBytes > max_segments_bytes) {
307+
if (candidate.mergeBytes > maxSegmentsBytes) {
295308
candidate.pop_front();
296309
continue;
297310
}
298311

299312
if (!best.initialized || best.mergeScore > candidate.mergeScore)
300313
best = candidate;
301314

302-
if (candidate.last() == (sorted_segments.end() - 1))
315+
if (candidate.last() == (segments.end() - 1))
303316
break;
304317

305318
candidate.push_back();
@@ -353,6 +366,8 @@ struct ConsolidateDocsFill {
353366

354367
ConsolidationPolicy MakePolicy(const ConsolidateDocsFill& options);
355368

369+
// [TODO] Currently unused as the new algorithm uses a different
370+
// approach. Only max_segments_bytes is in use.
356371
struct ConsolidateTier {
357372
// minimum allowed number of segments to consolidate at once
358373
size_t min_segments = 1;

0 commit comments

Comments
 (0)