Skip to content

Commit 77307a6

Browse files
pks-tgitster
authored andcommitted
reftable/block: refactor binary search over restart points
When seeking a record in our block reader we perform a binary search over the block's restart points so that we don't have to do a linear scan over the whole block. The logic to do so is quite intricate though, which makes it hard to understand. Improve documentation and rename some of the functions and variables so that the code becomes easier to understand overall. This refactoring should not result in any change in behaviour. Signed-off-by: Patrick Steinhardt <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 2176792 commit 77307a6

File tree

1 file changed

+73
-27
lines changed

1 file changed

+73
-27
lines changed

reftable/block.c

Lines changed: 73 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -273,34 +273,36 @@ void block_reader_start(struct block_reader *br, struct block_iter *it)
273273
it->next_off = br->header_off + 4;
274274
}
275275

276-
struct restart_find_args {
276+
struct restart_needle_less_args {
277277
int error;
278-
struct strbuf key;
279-
struct block_reader *r;
278+
struct strbuf needle;
279+
struct block_reader *reader;
280280
};
281281

282-
static int restart_key_less(size_t idx, void *args)
282+
static int restart_needle_less(size_t idx, void *_args)
283283
{
284-
struct restart_find_args *a = args;
285-
uint32_t off = block_reader_restart_offset(a->r, idx);
284+
struct restart_needle_less_args *args = _args;
285+
uint32_t off = block_reader_restart_offset(args->reader, idx);
286286
struct string_view in = {
287-
.buf = a->r->block.data + off,
288-
.len = a->r->block_len - off,
287+
.buf = args->reader->block.data + off,
288+
.len = args->reader->block_len - off,
289289
};
290-
291-
/* the restart key is verbatim in the block, so this could avoid the
292-
alloc for decoding the key */
293-
struct strbuf rkey = STRBUF_INIT;
290+
struct strbuf kth_restart_key = STRBUF_INIT;
294291
uint8_t unused_extra;
295-
int n = reftable_decode_key(&rkey, &unused_extra, in);
296-
int result;
292+
int result, n;
293+
294+
/*
295+
* TODO: The restart key is verbatim in the block, so we can in theory
296+
* avoid decoding the key and thus save some allocations.
297+
*/
298+
n = reftable_decode_key(&kth_restart_key, &unused_extra, in);
297299
if (n < 0) {
298-
a->error = 1;
300+
args->error = 1;
299301
return -1;
300302
}
301303

302-
result = strbuf_cmp(&a->key, &rkey);
303-
strbuf_release(&rkey);
304+
result = strbuf_cmp(&args->needle, &kth_restart_key);
305+
strbuf_release(&kth_restart_key);
304306
return result < 0;
305307
}
306308

@@ -376,9 +378,9 @@ void block_iter_close(struct block_iter *it)
376378
int block_reader_seek(struct block_reader *br, struct block_iter *it,
377379
struct strbuf *want)
378380
{
379-
struct restart_find_args args = {
380-
.key = *want,
381-
.r = br,
381+
struct restart_needle_less_args args = {
382+
.needle = *want,
383+
.reader = br,
382384
};
383385
struct block_iter next = BLOCK_ITER_INIT;
384386
struct reftable_record rec;
@@ -390,7 +392,38 @@ int block_reader_seek(struct block_reader *br, struct block_iter *it,
390392
goto done;
391393
}
392394

393-
i = binsearch(br->restart_count, &restart_key_less, &args);
395+
/*
396+
* Perform a binary search over the block's restart points, which
397+
* avoids doing a linear scan over the whole block. Like this, we
398+
* identify the section of the block that should contain our key.
399+
*
400+
* Note that we explicitly search for the first restart point _greater_
401+
* than the sought-after record, not _greater or equal_ to it. In case
402+
* the sought-after record is located directly at the restart point we
403+
* would otherwise start doing the linear search at the preceding
404+
* restart point. While that works alright, we would end up scanning
405+
* too many record.
406+
*/
407+
i = binsearch(br->restart_count, &restart_needle_less, &args);
408+
409+
/*
410+
* Now there are multiple cases:
411+
*
412+
* - `i == 0`: The wanted record is smaller than the record found at
413+
* the first restart point. As the first restart point is the first
414+
* record in the block, our wanted record cannot be located in this
415+
* block at all. We still need to position the iterator so that the
416+
* next call to `block_iter_next()` will yield an end-of-iterator
417+
* signal.
418+
*
419+
* - `i == restart_count`: The wanted record was not found at any of
420+
* the restart points. As there is no restart point at the end of
421+
* the section the record may thus be contained in the last block.
422+
*
423+
* - `i > 0`: The wanted record must be contained in the section
424+
* before the found restart point. We thus do a linear search
425+
* starting from the preceding restart point.
426+
*/
394427
if (i > 0)
395428
it->next_off = block_reader_restart_offset(br, i - 1);
396429
else
@@ -399,21 +432,34 @@ int block_reader_seek(struct block_reader *br, struct block_iter *it,
399432

400433
reftable_record_init(&rec, block_reader_type(br));
401434

402-
/* We're looking for the last entry less/equal than the wanted key, so
403-
we have to go one entry too far and then back up.
404-
*/
435+
/*
436+
* We're looking for the last entry less than the wanted key so that
437+
* the next call to `block_reader_next()` would yield the wanted
438+
* record. We thus don't want to position our reader at the sought
439+
* after record, but one before. To do so, we have to go one entry too
440+
* far and then back up.
441+
*/
405442
while (1) {
406443
block_iter_copy_from(&next, it);
407444
err = block_iter_next(&next, &rec);
408445
if (err < 0)
409446
goto done;
410-
411-
reftable_record_key(&rec, &it->last_key);
412-
if (err > 0 || strbuf_cmp(&it->last_key, want) >= 0) {
447+
if (err > 0) {
413448
err = 0;
414449
goto done;
415450
}
416451

452+
/*
453+
* Check whether the current key is greater or equal to the
454+
* sought-after key. In case it is greater we know that the
455+
* record does not exist in the block and can thus abort early.
456+
* In case it is equal to the sought-after key we have found
457+
* the desired record.
458+
*/
459+
reftable_record_key(&rec, &it->last_key);
460+
if (strbuf_cmp(&it->last_key, want) >= 0)
461+
goto done;
462+
417463
block_iter_copy_from(it, &next);
418464
}
419465

0 commit comments

Comments
 (0)