Skip to content

Commit 8a24a10

Browse files
committed
flux-fsck: parallelize valref blobref checks
Problem: The values in a valref blobref array are checked synchronously, one by one, which can be quite slow. Parallelize the valref blobref array verification by making each blobref lookup asynchronous. Cap the number of in-flight lookups at 1000 RPCs. This adjustment leads to slightly different output, as multiple blobref errors in a single valref can now be reported; before, only the first invalid blobref resulted in error output. For this reason, output detailed per-blobref errors only in verbose mode, and output a single-line error in non-verbose mode. Adjust tests to account for these output changes.
1 parent f7c6529 commit 8a24a10

File tree

2 files changed

+106
-21
lines changed

2 files changed

+106
-21
lines changed

src/cmd/builtin/fsck.c

Lines changed: 92 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,17 @@
2828

2929
#include "builtin.h"
3030

31+
#define BLOBREF_ASYNC_MAX 1000
32+
3133
static void fsck_treeobj (flux_t *h,
3234
const char *path,
3335
json_t *treeobj);
3436

37+
static void valref_load (flux_t *h,
38+
json_t *treeobj,
39+
int index,
40+
void *arg);
41+
3542
static bool verbose;
3643
static bool quiet;
3744
static int errorcount;
@@ -54,37 +61,104 @@ void read_error (const char *fmt, ...)
5461
va_end (ap);
5562
}
5663

57-
static void fsck_valref (flux_t *h,
58-
const char *path,
59-
json_t *treeobj)
64+
struct fsck_valref_data
65+
{
66+
flux_t *h;
67+
json_t *treeobj;
68+
int index;
69+
int count;
70+
int in_flight;
71+
const char *path;
72+
int errorcount;
73+
int errnum;
74+
};
75+
76+
/* Continuation invoked when one asynchronous blobref lookup completes.
 *
 * f   - the completed content load future; its "index" aux item holds
 *       the blobref index that was requested.
 * arg - the struct fsck_valref_data shared by all lookups of the valref.
 *
 * On failure the error is counted and its errno recorded in fvd->errnum
 * (the last failure wins); in verbose mode a per-blobref message is
 * emitted.  If blobrefs remain to be checked, the next lookup is issued
 * so the number of in-flight requests stays constant.  The future is
 * always destroyed before returning.
 */
static void valref_load_continuation (flux_future_t *f, void *arg)
{
    struct fsck_valref_data *fvd = arg;
    const void *buf;
    size_t buflen;

    if (content_load_get (f, &buf, &buflen) < 0) {
        /* Capture errno right away: the aux lookup and the error
         * output below may overwrite it before it is recorded.
         */
        int saved_errno = errno;

        if (verbose) {
            int *index = flux_future_aux_get (f, "index");
            if (saved_errno == ENOENT)
                read_error ("%s: missing blobref index=%d",
                            fvd->path,
                            (*index));
            else
                read_error ("%s: error retrieving blobref index=%d: %s",
                            fvd->path,
                            (*index),
                            future_strerror (f, saved_errno));
        }
        fvd->errorcount++;
        fvd->errnum = saved_errno; /* we'll report the last errno */
    }
    fvd->in_flight--;

    /* Replace the completed lookup with the next pending one, if any. */
    if (fvd->index < fvd->count) {
        valref_load (fvd->h, fvd->treeobj, fvd->index, fvd);
        fvd->in_flight++;
        fvd->index++;
    }

    flux_future_destroy (f);
}
108+
109+
/* Issue one asynchronous lookup for the blobref at position 'index'
 * of 'treeobj', bypassing the content cache.
 *
 * 'arg' is handed unchanged to valref_load_continuation().  The index
 * is attached to the future as the "index" aux item (freed together
 * with the future) so the continuation can report which blobref
 * failed.  Any failure to set up the request is fatal.
 */
static void valref_load (flux_t *h, json_t *treeobj, int index, void *arg)
{
    flux_future_t *f;
    int *idx_copy;

    f = content_load_byblobref (h,
                                treeobj_get_blobref (treeobj, index),
                                CONTENT_FLAG_CACHE_BYPASS);
    if (!f)
        log_err_exit ("cannot retrieve valref blob");
    if (flux_future_then (f, -1, valref_load_continuation, arg) < 0)
        log_err_exit ("cannot retrieve valref blob");

    idx_copy = malloc (sizeof (*idx_copy));
    if (!idx_copy)
        log_err_exit ("cannot allocate index memory");
    *idx_copy = index;

    if (flux_future_aux_set (f, "index", idx_copy, free) < 0)
        log_err_exit ("could not save index value");
}
124+
125+
/* Verify that every blobref referenced by the valref 'treeobj' stored
 * at 'path' can be loaded.
 *
 * Up to BLOBREF_ASYNC_MAX lookups are started up front; each completed
 * lookup starts the next one from its continuation, and the reactor is
 * run until it returns.  If any lookup failed, the global errorcount
 * is incremented once for the valref.  In non-verbose mode a single
 * summary line is emitted; in verbose mode the continuation has
 * already reported each failing blobref.
 */
static void fsck_valref (flux_t *h, const char *path, json_t *treeobj)
{
    struct fsck_valref_data fvd = {0};

    fvd.h = h;
    fvd.treeobj = treeobj;
    fvd.count = treeobj_get_count (treeobj);
    fvd.path = path;

    /* N.B. unlike flux-dump(1) out of order returns do not matter
     * here since we only care about verification, not the data.
     */
    while (fvd.in_flight < BLOBREF_ASYNC_MAX
           && fvd.index < fvd.count) {
        valref_load (h, treeobj, fvd.index, &fvd);
        fvd.in_flight++;
        fvd.index++;
    }

    if (flux_reactor_run (flux_get_reactor (h), 0) < 0)
        log_err_exit ("flux_reactor_run");

    if (fvd.errorcount) {
        /* each invalid blobref will be output in verbose mode */
        if (!verbose) {
            /* Use the errno recorded by the continuation; the global
             * errno is not meaningful after the reactor has run.
             */
            if (fvd.errnum == ENOENT)
                read_error ("%s: missing blobref(s)", path);
            else
                read_error ("%s: error retrieving blobref(s): %s",
                            path,
                            strerror (fvd.errnum));
        }
        errorcount++;
    }
}
87160

161+
88162
static void fsck_val (flux_t *h,
89163
const char *path,
90164
json_t *treeobj)

t/t2816-fsck-cmd.t

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,14 @@ test_expect_success 'flux-fsck detects errors' '
6767
test_must_fail flux fsck 2> fsckerrors1.out &&
6868
count=$(cat fsckerrors1.out | wc -l) &&
6969
test $count -eq 3 &&
70-
grep "dir\.b" fsckerrors1.out | grep "missing blobref" | grep "index=1" &&
70+
grep "dir\.b" fsckerrors1.out | grep "missing blobref(s)" &&
7171
grep "Total errors: 1" fsckerrors1.out
7272
'
73+
# With --verbose, fsck must fail, name the missing blobref index for
# dir.b, and still report a single total error.
test_expect_success 'flux-fsck --verbose outputs details' '
74+
test_must_fail flux fsck --verbose 2> fsckerrors1V.out &&
75+
grep "dir\.b" fsckerrors1V.out | grep "missing blobref" | grep "index=1" &&
76+
grep "Total errors: 1" fsckerrors1V.out
77+
'
7378
test_expect_success 'flux-fsck no output with --quiet' '
7479
test_must_fail flux fsck --quiet 2> fsckerrors2.out &&
7580
count=$(cat fsckerrors2.out | wc -l) &&
@@ -90,10 +95,16 @@ test_expect_success 'flux-fsck detects errors' '
9095
test_must_fail flux fsck 2> fsckerrors3.out &&
9196
count=$(cat fsckerrors3.out | wc -l) &&
9297
test $count -eq 4 &&
93-
grep "dir\.b" fsckerrors3.out | grep "missing blobref" | grep "index=1" &&
94-
grep "dir\.c" fsckerrors3.out | grep "missing blobref" | grep "index=2" &&
98+
grep "dir\.b" fsckerrors3.out | grep "missing blobref(s)" &&
99+
grep "dir\.c" fsckerrors3.out | grep "missing blobref(s)" &&
95100
grep "Total errors: 2" fsckerrors3.out
96101
'
102+
# With --verbose, fsck must fail, name the missing blobref index for
# both dir.b and dir.c, and report two total errors.
test_expect_success 'flux-fsck --verbose outputs details' '
103+
test_must_fail flux fsck --verbose 2> fsckerrors3V.out &&
104+
grep "dir\.b" fsckerrors3V.out | grep "missing blobref" | grep "index=1" &&
105+
grep "dir\.c" fsckerrors3V.out | grep "missing blobref" | grep "index=2" &&
106+
grep "Total errors: 2" fsckerrors3V.out
107+
'
97108
test_expect_success 'flux-fsck no output with --quiet' '
98109
test_must_fail flux fsck --quiet 2> fsckerrors4.out &&
99110
count=$(cat fsckerrors4.out | wc -l) &&

0 commit comments

Comments
 (0)