Skip to content

Commit cf4a3bd

Browse files
committed
Merge branch 'ps/reftable-multi-level-indices-fix'
Write multi-level indices for reftable has been corrected. * ps/reftable-multi-level-indices-fix: reftable: document reading and writing indices reftable/writer: fix writing multi-level indices reftable/writer: simplify writing index records reftable/writer: use correct type to iterate through index entries reftable/reader: be more careful about errors in indexed seeks
2 parents c875e0b + 4950aca commit cf4a3bd

File tree

3 files changed

+122
-27
lines changed

3 files changed

+122
-27
lines changed

reftable/reader.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,8 +508,38 @@ static int reader_seek_indexed(struct reftable_reader *r,
508508
if (err < 0)
509509
goto done;
510510

511+
/*
512+
* The index may consist of multiple levels, where each level may have
513+
* multiple index blocks. We start by doing a linear search in the
514+
* highest layer that identifies the relevant index block as well as
515+
* the record inside that block that corresponds to our wanted key.
516+
*/
511517
err = reader_seek_linear(&index_iter, &want_index);
518+
if (err < 0)
519+
goto done;
520+
521+
/*
522+
* Traverse down the levels until we find a non-index entry.
523+
*/
512524
while (1) {
525+
/*
526+
* In case we seek a record that does not exist the index iter
527+
* will tell us that the iterator is over. This works because
528+
* the last index entry of the current level will contain the
529+
* last key it knows about. So in case our seeked key is larger
530+
* than the last indexed key we know that it won't exist.
531+
*
532+
* There is one subtlety in the layout of the index section
533+
* that makes this work as expected: the highest-level index is
534+
* at end of the section and will point backwards and thus we
535+
* start reading from the end of the index section, not the
536+
* beginning.
537+
*
538+
* If that wasn't the case and the order was reversed then the
539+
* linear seek would seek into the lower levels and traverse
540+
* all levels of the index only to find out that the key does
541+
* not exist.
542+
*/
513543
err = table_iter_next(&index_iter, &index_result);
514544
table_iter_block_done(&index_iter);
515545
if (err != 0)

reftable/readwrite_test.c

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -866,6 +866,61 @@ static void test_write_multiple_indices(void)
866866
strbuf_release(&buf);
867867
}
868868

869+
static void test_write_multi_level_index(void)
870+
{
871+
struct reftable_write_options opts = {
872+
.block_size = 100,
873+
};
874+
struct strbuf writer_buf = STRBUF_INIT, buf = STRBUF_INIT;
875+
struct reftable_block_source source = { 0 };
876+
struct reftable_iterator it = { 0 };
877+
const struct reftable_stats *stats;
878+
struct reftable_writer *writer;
879+
struct reftable_reader *reader;
880+
int err;
881+
882+
writer = reftable_new_writer(&strbuf_add_void, &noop_flush, &writer_buf, &opts);
883+
reftable_writer_set_limits(writer, 1, 1);
884+
for (size_t i = 0; i < 200; i++) {
885+
struct reftable_ref_record ref = {
886+
.update_index = 1,
887+
.value_type = REFTABLE_REF_VAL1,
888+
.value.val1 = {i},
889+
};
890+
891+
strbuf_reset(&buf);
892+
strbuf_addf(&buf, "refs/heads/%03" PRIuMAX, (uintmax_t)i);
893+
ref.refname = buf.buf,
894+
895+
err = reftable_writer_add_ref(writer, &ref);
896+
EXPECT_ERR(err);
897+
}
898+
reftable_writer_close(writer);
899+
900+
/*
901+
* The written refs should be sufficiently large to result in a
902+
* multi-level index.
903+
*/
904+
stats = reftable_writer_stats(writer);
905+
EXPECT(stats->ref_stats.max_index_level == 2);
906+
907+
block_source_from_strbuf(&source, &writer_buf);
908+
err = reftable_new_reader(&reader, &source, "filename");
909+
EXPECT_ERR(err);
910+
911+
/*
912+
* Seeking the last ref should work as expected.
913+
*/
914+
err = reftable_reader_seek_ref(reader, &it, "refs/heads/199");
915+
EXPECT_ERR(err);
916+
917+
reftable_iterator_destroy(&it);
918+
reftable_writer_free(writer);
919+
reftable_reader_free(reader);
920+
strbuf_release(&writer_buf);
921+
strbuf_release(&buf);
922+
}
923+
869924
static void test_corrupt_table_empty(void)
870925
{
871926
struct strbuf buf = STRBUF_INIT;
@@ -916,5 +971,6 @@ int readwrite_test_main(int argc, const char *argv[])
916971
RUN_TEST(test_write_object_id_length);
917972
RUN_TEST(test_write_object_id_min_length);
918973
RUN_TEST(test_write_multiple_indices);
974+
RUN_TEST(test_write_multi_level_index);
919975
return 0;
920976
}

reftable/writer.c

Lines changed: 36 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -379,20 +379,39 @@ int reftable_writer_add_logs(struct reftable_writer *w,
379379

380380
static int writer_finish_section(struct reftable_writer *w)
381381
{
382+
struct reftable_block_stats *bstats = NULL;
382383
uint8_t typ = block_writer_type(w->block_writer);
383384
uint64_t index_start = 0;
384385
int max_level = 0;
385-
int threshold = w->opts.unpadded ? 1 : 3;
386+
size_t threshold = w->opts.unpadded ? 1 : 3;
386387
int before_blocks = w->stats.idx_stats.blocks;
387-
int err = writer_flush_block(w);
388-
int i = 0;
389-
struct reftable_block_stats *bstats = NULL;
388+
int err;
389+
390+
err = writer_flush_block(w);
390391
if (err < 0)
391392
return err;
392393

394+
/*
395+
* When the section we are about to index has a lot of blocks then the
396+
* index itself may span across multiple blocks, as well. This would
397+
* require a linear scan over index blocks only to find the desired
398+
* indexed block, which is inefficient. Instead, we write a multi-level
399+
* index where index records of level N+1 will refer to index blocks of
400+
* level N. This isn't constant time, either, but at least logarithmic.
401+
*
402+
* This loop handles writing this multi-level index. Note that we write
403+
* the lowest-level index pointing to the indexed blocks first. We then
404+
* continue writing additional index levels until the current level has
405+
* less blocks than the threshold so that the highest level will be at
406+
* the end of the index section.
407+
*
408+
* Readers are thus required to start reading the index section from
409+
* its end, which is why we set `index_start` to the beginning of the
410+
* last index section.
411+
*/
393412
while (w->index_len > threshold) {
394413
struct reftable_index_record *idx = NULL;
395-
int idx_len = 0;
414+
size_t i, idx_len;
396415

397416
max_level++;
398417
index_start = w->next;
@@ -411,33 +430,26 @@ static int writer_finish_section(struct reftable_writer *w)
411430
.idx = idx[i],
412431
},
413432
};
414-
if (block_writer_add(w->block_writer, &rec) == 0) {
415-
continue;
416-
}
417433

418-
err = writer_flush_block(w);
434+
err = writer_add_record(w, &rec);
419435
if (err < 0)
420436
return err;
437+
}
421438

422-
writer_reinit_block_writer(w, BLOCK_TYPE_INDEX);
439+
err = writer_flush_block(w);
440+
if (err < 0)
441+
return err;
423442

424-
err = block_writer_add(w->block_writer, &rec);
425-
if (err != 0) {
426-
/* write into fresh block should always succeed
427-
*/
428-
abort();
429-
}
430-
}
431-
for (i = 0; i < idx_len; i++) {
443+
for (i = 0; i < idx_len; i++)
432444
strbuf_release(&idx[i].last_key);
433-
}
434445
reftable_free(idx);
435446
}
436447

437-
err = writer_flush_block(w);
438-
if (err < 0)
439-
return err;
440-
448+
/*
449+
* The index may still contain a number of index blocks lower than the
450+
* threshold. Clear it so that these entries don't leak into the next
451+
* index section.
452+
*/
441453
writer_clear_index(w);
442454

443455
bstats = writer_reftable_block_stats(w, typ);
@@ -630,11 +642,8 @@ int reftable_writer_close(struct reftable_writer *w)
630642

631643
static void writer_clear_index(struct reftable_writer *w)
632644
{
633-
int i = 0;
634-
for (i = 0; i < w->index_len; i++) {
645+
for (size_t i = 0; i < w->index_len; i++)
635646
strbuf_release(&w->index[i].last_key);
636-
}
637-
638647
FREE_AND_NULL(w->index);
639648
w->index_len = 0;
640649
w->index_cap = 0;

0 commit comments

Comments
 (0)