Skip to content

Commit 4d5a46e

Browse files
committed
Merge branch 'ps/reftable-optimize-io'
Low-level I/O optimization for reftable. * ps/reftable-optimize-io: reftable/stack: fix race in up-to-date check reftable/stack: unconditionally reload stack after commit reftable/blocksource: use mmap to read tables reftable/blocksource: refactor code to match our coding style reftable/stack: use stat info to avoid re-reading stack list reftable/stack: refactor reloading to use file descriptor reftable/stack: refactor stack reloading to have common exit path
2 parents b50a608 + 4f36b85 commit 4d5a46e

File tree

3 files changed

+172
-70
lines changed

3 files changed

+172
-70
lines changed

reftable/blocksource.c

Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ struct reftable_block_source malloc_block_source(void)
7676
}
7777

7878
struct file_block_source {
79-
int fd;
8079
uint64_t size;
80+
unsigned char *data;
8181
};
8282

8383
static uint64_t file_size(void *b)
@@ -87,19 +87,12 @@ static uint64_t file_size(void *b)
8787

8888
static void file_return_block(void *b, struct reftable_block *dest)
8989
{
90-
if (dest->len)
91-
memset(dest->data, 0xff, dest->len);
92-
reftable_free(dest->data);
9390
}
9491

95-
static void file_close(void *b)
92+
static void file_close(void *v)
9693
{
97-
int fd = ((struct file_block_source *)b)->fd;
98-
if (fd > 0) {
99-
close(fd);
100-
((struct file_block_source *)b)->fd = 0;
101-
}
102-
94+
struct file_block_source *b = v;
95+
munmap(b->data, b->size);
10396
reftable_free(b);
10497
}
10598

@@ -108,9 +101,7 @@ static int file_read_block(void *v, struct reftable_block *dest, uint64_t off,
108101
{
109102
struct file_block_source *b = v;
110103
assert(off + size <= b->size);
111-
dest->data = reftable_malloc(size);
112-
if (pread_in_full(b->fd, dest->data, size, off) != size)
113-
return -1;
104+
dest->data = b->data + off;
114105
dest->len = size;
115106
return size;
116107
}
@@ -125,26 +116,26 @@ static struct reftable_block_source_vtable file_vtable = {
125116
int reftable_block_source_from_file(struct reftable_block_source *bs,
126117
const char *name)
127118
{
128-
struct stat st = { 0 };
129-
int err = 0;
130-
int fd = open(name, O_RDONLY);
131-
struct file_block_source *p = NULL;
119+
struct file_block_source *p;
120+
struct stat st;
121+
int fd;
122+
123+
fd = open(name, O_RDONLY);
132124
if (fd < 0) {
133-
if (errno == ENOENT) {
125+
if (errno == ENOENT)
134126
return REFTABLE_NOT_EXIST_ERROR;
135-
}
136127
return -1;
137128
}
138129

139-
err = fstat(fd, &st);
140-
if (err < 0) {
130+
if (fstat(fd, &st) < 0) {
141131
close(fd);
142132
return REFTABLE_IO_ERROR;
143133
}
144134

145-
p = reftable_calloc(sizeof(struct file_block_source));
135+
p = reftable_calloc(sizeof(*p));
146136
p->size = st.st_size;
147-
p->fd = fd;
137+
p->data = xmmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
138+
close(fd);
148139

149140
assert(!bs->ops);
150141
bs->ops = &file_vtable;

reftable/stack.c

Lines changed: 154 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ int reftable_new_stack(struct reftable_stack **dest, const char *dir,
6666
strbuf_addstr(&list_file_name, "/tables.list");
6767

6868
p->list_file = strbuf_detach(&list_file_name, NULL);
69+
p->list_fd = -1;
6970
p->reftable_dir = xstrdup(dir);
7071
p->config = config;
7172

@@ -175,6 +176,12 @@ void reftable_stack_destroy(struct reftable_stack *st)
175176
st->readers_len = 0;
176177
FREE_AND_NULL(st->readers);
177178
}
179+
180+
if (st->list_fd >= 0) {
181+
close(st->list_fd);
182+
st->list_fd = -1;
183+
}
184+
178185
FREE_AND_NULL(st->list_file);
179186
FREE_AND_NULL(st->reftable_dir);
180187
reftable_free(st);
@@ -304,69 +311,134 @@ static int tv_cmp(struct timeval *a, struct timeval *b)
304311
static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
305312
int reuse_open)
306313
{
307-
struct timeval deadline = { 0 };
308-
int err = gettimeofday(&deadline, NULL);
314+
char **names = NULL, **names_after = NULL;
315+
struct timeval deadline;
309316
int64_t delay = 0;
310-
int tries = 0;
311-
if (err < 0)
312-
return err;
317+
int tries = 0, err;
318+
int fd = -1;
313319

320+
err = gettimeofday(&deadline, NULL);
321+
if (err < 0)
322+
goto out;
314323
deadline.tv_sec += 3;
324+
315325
while (1) {
316-
char **names = NULL;
317-
char **names_after = NULL;
318-
struct timeval now = { 0 };
319-
int err = gettimeofday(&now, NULL);
320-
int err2 = 0;
321-
if (err < 0) {
322-
return err;
323-
}
326+
struct timeval now;
324327

325-
/* Only look at deadlines after the first few times. This
326-
simplifies debugging in GDB */
328+
err = gettimeofday(&now, NULL);
329+
if (err < 0)
330+
goto out;
331+
332+
/*
333+
* Only look at deadlines after the first few times. This
334+
* simplifies debugging in GDB.
335+
*/
327336
tries++;
328-
if (tries > 3 && tv_cmp(&now, &deadline) >= 0) {
329-
break;
330-
}
337+
if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
338+
goto out;
331339

332-
err = read_lines(st->list_file, &names);
333-
if (err < 0) {
334-
free_names(names);
335-
return err;
336-
}
337-
err = reftable_stack_reload_once(st, names, reuse_open);
338-
if (err == 0) {
339-
free_names(names);
340-
break;
341-
}
342-
if (err != REFTABLE_NOT_EXIST_ERROR) {
343-
free_names(names);
344-
return err;
345-
}
340+
fd = open(st->list_file, O_RDONLY);
341+
if (fd < 0) {
342+
if (errno != ENOENT) {
343+
err = REFTABLE_IO_ERROR;
344+
goto out;
345+
}
346346

347-
/* err == REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
348-
writer. Check if there was one by checking if the name list
349-
changed.
350-
*/
351-
err2 = read_lines(st->list_file, &names_after);
352-
if (err2 < 0) {
353-
free_names(names);
354-
return err2;
347+
names = reftable_calloc(sizeof(char *));
348+
} else {
349+
err = fd_read_lines(fd, &names);
350+
if (err < 0)
351+
goto out;
355352
}
356353

354+
err = reftable_stack_reload_once(st, names, reuse_open);
355+
if (!err)
356+
break;
357+
if (err != REFTABLE_NOT_EXIST_ERROR)
358+
goto out;
359+
360+
/*
361+
* REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
362+
* writer. Check if there was one by checking if the name list
363+
* changed.
364+
*/
365+
err = read_lines(st->list_file, &names_after);
366+
if (err < 0)
367+
goto out;
357368
if (names_equal(names_after, names)) {
358-
free_names(names);
359-
free_names(names_after);
360-
return err;
369+
err = REFTABLE_NOT_EXIST_ERROR;
370+
goto out;
361371
}
372+
362373
free_names(names);
374+
names = NULL;
363375
free_names(names_after);
376+
names_after = NULL;
377+
close(fd);
378+
fd = -1;
364379

365380
delay = delay + (delay * rand()) / RAND_MAX + 1;
366381
sleep_millisec(delay);
367382
}
368383

369-
return 0;
384+
out:
385+
/*
386+
* Invalidate the stat cache. It is sufficient to only close the file
387+
* descriptor and keep the cached stat info because we never use the
388+
* latter when the former is negative.
389+
*/
390+
if (st->list_fd >= 0) {
391+
close(st->list_fd);
392+
st->list_fd = -1;
393+
}
394+
395+
/*
396+
* Cache stat information in case it provides a useful signal to us.
397+
* According to POSIX, "The st_ino and st_dev fields taken together
398+
* uniquely identify the file within the system." That being said,
399+
* Windows is not POSIX compliant and we do not have these fields
400+
* available. So the information we have there is insufficient to
401+
* determine whether two file descriptors point to the same file.
402+
*
403+
* While we could fall back to using other signals like the file's
404+
* mtime, those are not sufficient to avoid races. We thus refrain from
405+
* using the stat cache on such systems and fall back to the secondary
406+
* caching mechanism, which is to check whether contents of the file
407+
* have changed.
408+
*
409+
* On other systems which are POSIX compliant we must keep the file
410+
* descriptor open. This is to avoid a race condition where two
411+
* processes access the reftable stack at the same point in time:
412+
*
413+
* 1. A reads the reftable stack and caches its stat info.
414+
*
415+
* 2. B updates the stack, appending a new table to "tables.list".
416+
* This will both use a new inode and result in a different file
417+
* size, thus invalidating A's cache in theory.
418+
*
419+
* 3. B decides to auto-compact the stack and merges two tables. The
420+
* file size now matches what A has cached again. Furthermore, the
421+
* filesystem may decide to recycle the inode number of the file
422+
* we have replaced in (2) because it is not in use anymore.
423+
*
424+
* 4. A reloads the reftable stack. Neither the inode number nor the
425+
* file size changed. If the timestamps did not change either then
426+
* we think the cached copy of our stack is up-to-date.
427+
*
428+
* By keeping the file descriptor open the inode number cannot be
429+
* recycled, mitigating the race.
430+
*/
431+
if (!err && fd >= 0 && !fstat(fd, &st->list_st) &&
432+
st->list_st.st_dev && st->list_st.st_ino) {
433+
st->list_fd = fd;
434+
fd = -1;
435+
}
436+
437+
if (fd >= 0)
438+
close(fd);
439+
free_names(names);
440+
free_names(names_after);
441+
return err;
370442
}
371443

372444
/* -1 = error
@@ -375,8 +447,44 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
375447
static int stack_uptodate(struct reftable_stack *st)
376448
{
377449
char **names = NULL;
378-
int err = read_lines(st->list_file, &names);
450+
int err;
379451
int i = 0;
452+
453+
/*
454+
* When we have cached stat information available then we use it to
455+
* verify whether the file has been rewritten.
456+
*
457+
* Note that we explicitly do not want to use `stat_validity_check()`
458+
* and friends here because they may end up not comparing the `st_dev`
459+
* and `st_ino` fields. These functions thus cannot guarantee that we
460+
* indeed still have the same file.
461+
*/
462+
if (st->list_fd >= 0) {
463+
struct stat list_st;
464+
465+
if (stat(st->list_file, &list_st) < 0) {
466+
/*
467+
* It's fine for "tables.list" to not exist. In that
468+
* case, we have to refresh when the loaded stack has
469+
* any readers.
470+
*/
471+
if (errno == ENOENT)
472+
return !!st->readers_len;
473+
return REFTABLE_IO_ERROR;
474+
}
475+
476+
/*
477+
* When "tables.list" refers to the same file we can assume
478+
* that it didn't change. This is because we always use
479+
* rename(3P) to update the file and never write to it
480+
* directly.
481+
*/
482+
if (st->list_st.st_dev == list_st.st_dev &&
483+
st->list_st.st_ino == list_st.st_ino)
484+
return 0;
485+
}
486+
487+
err = read_lines(st->list_file, &names);
380488
if (err < 0)
381489
return err;
382490

@@ -559,7 +667,7 @@ int reftable_addition_commit(struct reftable_addition *add)
559667
add->new_tables = NULL;
560668
add->new_tables_len = 0;
561669

562-
err = reftable_stack_reload(add->stack);
670+
err = reftable_stack_reload_maybe_reuse(add->stack, 1);
563671
if (err)
564672
goto done;
565673

reftable/stack.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@ license that can be found in the LICENSE file or at
1414
#include "reftable-stack.h"
1515

1616
struct reftable_stack {
17+
struct stat list_st;
1718
char *list_file;
19+
int list_fd;
20+
1821
char *reftable_dir;
1922
int disable_auto_compact;
2023

0 commit comments

Comments
 (0)