Skip to content

Commit 7fbe3ea

Browse files
committed
Merge branch 'ps/reftable-binsearch-updates'
Reftable code clean-up and some bugfixes. * ps/reftable-binsearch-updates: reftable/block: avoid decoding keys when searching restart points reftable/record: extract function to decode key lengths reftable/block: fix error handling when searching restart points reftable/block: refactor binary search over restart points reftable/refname: refactor binary search over refnames reftable/basics: improve `binsearch()` test reftable/basics: fix return type of `binsearch()` to be `size_t`
2 parents 847af43 + d51d8cc commit 7fbe3ea

File tree

7 files changed

+182
-97
lines changed

7 files changed

+182
-97
lines changed

reftable/basics.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ void put_be16(uint8_t *out, uint16_t i)
2727
out[1] = (uint8_t)(i & 0xff);
2828
}
2929

30-
int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
30+
size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
3131
{
3232
size_t lo = 0;
3333
size_t hi = sz;
@@ -39,8 +39,11 @@ int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args)
3939
*/
4040
while (hi - lo > 1) {
4141
size_t mid = lo + (hi - lo) / 2;
42+
int ret = f(mid, args);
43+
if (ret < 0)
44+
return sz;
4245

43-
if (f(mid, args))
46+
if (ret > 0)
4447
hi = mid;
4548
else
4649
lo = mid;

reftable/basics.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,14 @@ uint32_t get_be24(uint8_t *in);
2222
void put_be16(uint8_t *out, uint16_t i);
2323

2424
/*
25-
* find smallest index i in [0, sz) at which f(i) is true, assuming
26-
* that f is ascending. Return sz if f(i) is false for all indices.
25+
* find smallest index i in [0, sz) at which `f(i) > 0`, assuming that f is
26+
* ascending. Return sz if `f(i) == 0` for all indices. The search is aborted
27+
* and `sz` is returned in case `f(i) < 0`.
2728
*
2829
* Contrary to bsearch(3), this returns something useful if the argument is not
2930
* found.
3031
*/
31-
int binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
32+
size_t binsearch(size_t sz, int (*f)(size_t k, void *args), void *args);
3233

3334
/*
3435
* Frees a NULL terminated array of malloced strings. The array itself is also

reftable/basics_test.c

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,40 +12,47 @@ license that can be found in the LICENSE file or at
1212
#include "test_framework.h"
1313
#include "reftable-tests.h"
1414

15-
struct binsearch_args {
16-
int key;
17-
int *arr;
15+
struct integer_needle_lesseq_args {
16+
int needle;
17+
int *haystack;
1818
};
1919

20-
static int binsearch_func(size_t i, void *void_args)
20+
static int integer_needle_lesseq(size_t i, void *_args)
2121
{
22-
struct binsearch_args *args = void_args;
23-
24-
return args->key < args->arr[i];
22+
struct integer_needle_lesseq_args *args = _args;
23+
return args->needle <= args->haystack[i];
2524
}
2625

2726
static void test_binsearch(void)
2827
{
29-
int arr[] = { 2, 4, 6, 8, 10 };
30-
size_t sz = ARRAY_SIZE(arr);
31-
struct binsearch_args args = {
32-
.arr = arr,
28+
int haystack[] = { 2, 4, 6, 8, 10 };
29+
struct {
30+
int needle;
31+
size_t expected_idx;
32+
} testcases[] = {
33+
{-9000, 0},
34+
{-1, 0},
35+
{0, 0},
36+
{2, 0},
37+
{3, 1},
38+
{4, 1},
39+
{7, 3},
40+
{9, 4},
41+
{10, 4},
42+
{11, 5},
43+
{9000, 5},
3344
};
45+
size_t i = 0;
3446

35-
int i = 0;
36-
for (i = 1; i < 11; i++) {
37-
int res;
38-
args.key = i;
39-
res = binsearch(sz, &binsearch_func, &args);
47+
for (i = 0; i < ARRAY_SIZE(testcases); i++) {
48+
struct integer_needle_lesseq_args args = {
49+
.haystack = haystack,
50+
.needle = testcases[i].needle,
51+
};
52+
size_t idx;
4053

41-
if (res < sz) {
42-
EXPECT(args.key < arr[res]);
43-
if (res > 0) {
44-
EXPECT(args.key >= arr[res - 1]);
45-
}
46-
} else {
47-
EXPECT(args.key == 10 || args.key == 11);
48-
}
54+
idx = binsearch(ARRAY_SIZE(haystack), &integer_needle_lesseq, &args);
55+
EXPECT(idx == testcases[i].expected_idx);
4956
}
5057
}
5158

reftable/block.c

Lines changed: 86 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -273,35 +273,46 @@ void block_reader_start(struct block_reader *br, struct block_iter *it)
273273
it->next_off = br->header_off + 4;
274274
}
275275

276-
struct restart_find_args {
276+
struct restart_needle_less_args {
277277
int error;
278-
struct strbuf key;
279-
struct block_reader *r;
278+
struct strbuf needle;
279+
struct block_reader *reader;
280280
};
281281

282-
static int restart_key_less(size_t idx, void *args)
282+
static int restart_needle_less(size_t idx, void *_args)
283283
{
284-
struct restart_find_args *a = args;
285-
uint32_t off = block_reader_restart_offset(a->r, idx);
284+
struct restart_needle_less_args *args = _args;
285+
uint32_t off = block_reader_restart_offset(args->reader, idx);
286286
struct string_view in = {
287-
.buf = a->r->block.data + off,
288-
.len = a->r->block_len - off,
287+
.buf = args->reader->block.data + off,
288+
.len = args->reader->block_len - off,
289289
};
290+
uint64_t prefix_len, suffix_len;
291+
uint8_t extra;
292+
int n;
293+
294+
/*
295+
* Records at restart points are stored without prefix compression, so
296+
* there is no need to fully decode the record key here. This removes
297+
* the need for allocating memory.
298+
*/
299+
n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra);
300+
if (n < 0 || prefix_len) {
301+
args->error = 1;
302+
return -1;
303+
}
290304

291-
/* the restart key is verbatim in the block, so this could avoid the
292-
alloc for decoding the key */
293-
struct strbuf rkey = STRBUF_INIT;
294-
uint8_t unused_extra;
295-
int n = reftable_decode_key(&rkey, &unused_extra, in);
296-
int result;
297-
if (n < 0) {
298-
a->error = 1;
305+
string_view_consume(&in, n);
306+
if (suffix_len > in.len) {
307+
args->error = 1;
299308
return -1;
300309
}
301310

302-
result = strbuf_cmp(&a->key, &rkey);
303-
strbuf_release(&rkey);
304-
return result < 0;
311+
n = memcmp(args->needle.buf, in.buf,
312+
args->needle.len < suffix_len ? args->needle.len : suffix_len);
313+
if (n)
314+
return n < 0;
315+
return args->needle.len < suffix_len;
305316
}
306317

307318
void block_iter_copy_from(struct block_iter *dest, struct block_iter *src)
@@ -376,20 +387,51 @@ void block_iter_close(struct block_iter *it)
376387
int block_reader_seek(struct block_reader *br, struct block_iter *it,
377388
struct strbuf *want)
378389
{
379-
struct restart_find_args args = {
380-
.key = *want,
381-
.r = br,
390+
struct restart_needle_less_args args = {
391+
.needle = *want,
392+
.reader = br,
382393
};
383394
struct block_iter next = BLOCK_ITER_INIT;
384395
struct reftable_record rec;
385-
int err = 0, i;
386-
396+
int err = 0;
397+
size_t i;
398+
399+
/*
400+
* Perform a binary search over the block's restart points, which
401+
* avoids doing a linear scan over the whole block. Like this, we
402+
* identify the section of the block that should contain our key.
403+
*
404+
* Note that we explicitly search for the first restart point _greater_
405+
* than the sought-after record, not _greater or equal_ to it. In case
406+
* the sought-after record is located directly at the restart point we
407+
* would otherwise start doing the linear search at the preceding
408+
* restart point. While that works alright, we would end up scanning
409+
* too many record.
410+
*/
411+
i = binsearch(br->restart_count, &restart_needle_less, &args);
387412
if (args.error) {
388413
err = REFTABLE_FORMAT_ERROR;
389414
goto done;
390415
}
391416

392-
i = binsearch(br->restart_count, &restart_key_less, &args);
417+
/*
418+
* Now there are multiple cases:
419+
*
420+
* - `i == 0`: The wanted record is smaller than the record found at
421+
* the first restart point. As the first restart point is the first
422+
* record in the block, our wanted record cannot be located in this
423+
* block at all. We still need to position the iterator so that the
424+
* next call to `block_iter_next()` will yield an end-of-iterator
425+
* signal.
426+
*
427+
* - `i == restart_count`: The wanted record was not found at any of
428+
* the restart points. As there is no restart point at the end of
429+
* the section the record may thus be contained in the last block.
430+
*
431+
* - `i > 0`: The wanted record must be contained in the section
432+
* before the found restart point. We thus do a linear search
433+
* starting from the preceding restart point.
434+
*/
393435
if (i > 0)
394436
it->next_off = block_reader_restart_offset(br, i - 1);
395437
else
@@ -398,21 +440,34 @@ int block_reader_seek(struct block_reader *br, struct block_iter *it,
398440

399441
reftable_record_init(&rec, block_reader_type(br));
400442

401-
/* We're looking for the last entry less/equal than the wanted key, so
402-
we have to go one entry too far and then back up.
403-
*/
443+
/*
444+
* We're looking for the last entry less than the wanted key so that
445+
* the next call to `block_reader_next()` would yield the wanted
446+
* record. We thus don't want to position our reader at the sought
447+
* after record, but one before. To do so, we have to go one entry too
448+
* far and then back up.
449+
*/
404450
while (1) {
405451
block_iter_copy_from(&next, it);
406452
err = block_iter_next(&next, &rec);
407453
if (err < 0)
408454
goto done;
409-
410-
reftable_record_key(&rec, &it->last_key);
411-
if (err > 0 || strbuf_cmp(&it->last_key, want) >= 0) {
455+
if (err > 0) {
412456
err = 0;
413457
goto done;
414458
}
415459

460+
/*
461+
* Check whether the current key is greater or equal to the
462+
* sought-after key. In case it is greater we know that the
463+
* record does not exist in the block and can thus abort early.
464+
* In case it is equal to the sought-after key we have found
465+
* the desired record.
466+
*/
467+
reftable_record_key(&rec, &it->last_key);
468+
if (strbuf_cmp(&it->last_key, want) >= 0)
469+
goto done;
470+
416471
block_iter_copy_from(it, &next);
417472
}
418473

reftable/record.c

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -159,26 +159,42 @@ int reftable_encode_key(int *restart, struct string_view dest,
159159
return start.len - dest.len;
160160
}
161161

162-
int reftable_decode_key(struct strbuf *last_key, uint8_t *extra,
163-
struct string_view in)
162+
int reftable_decode_keylen(struct string_view in,
163+
uint64_t *prefix_len,
164+
uint64_t *suffix_len,
165+
uint8_t *extra)
164166
{
165-
int start_len = in.len;
166-
uint64_t prefix_len = 0;
167-
uint64_t suffix_len = 0;
167+
size_t start_len = in.len;
168168
int n;
169169

170-
n = get_var_int(&prefix_len, &in);
170+
n = get_var_int(prefix_len, &in);
171171
if (n < 0)
172172
return -1;
173173
string_view_consume(&in, n);
174174

175-
n = get_var_int(&suffix_len, &in);
175+
n = get_var_int(suffix_len, &in);
176176
if (n <= 0)
177177
return -1;
178178
string_view_consume(&in, n);
179179

180-
*extra = (uint8_t)(suffix_len & 0x7);
181-
suffix_len >>= 3;
180+
*extra = (uint8_t)(*suffix_len & 0x7);
181+
*suffix_len >>= 3;
182+
183+
return start_len - in.len;
184+
}
185+
186+
int reftable_decode_key(struct strbuf *last_key, uint8_t *extra,
187+
struct string_view in)
188+
{
189+
int start_len = in.len;
190+
uint64_t prefix_len = 0;
191+
uint64_t suffix_len = 0;
192+
int n;
193+
194+
n = reftable_decode_keylen(in, &prefix_len, &suffix_len, extra);
195+
if (n < 0)
196+
return -1;
197+
string_view_consume(&in, n);
182198

183199
if (in.len < suffix_len ||
184200
prefix_len > last_key->len)

reftable/record.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ int reftable_encode_key(int *is_restart, struct string_view dest,
8686
struct strbuf prev_key, struct strbuf key,
8787
uint8_t extra);
8888

89+
/* Decode a record's key lengths. */
90+
int reftable_decode_keylen(struct string_view in,
91+
uint64_t *prefix_len,
92+
uint64_t *suffix_len,
93+
uint8_t *extra);
94+
8995
/*
9096
* Decode into `last_key` and `extra` from `in`. `last_key` is expected to
9197
* contain the decoded key of the preceding record, if any.

0 commit comments

Comments
 (0)