Skip to content

Commit e6ca1cd

Browse files
authored
Merge pull request #6828 from chu11/issue6782_flux_fsck_perf
flux-fsck: parallelize valref blobref checks
2 parents fd5c6d2 + 07acd81 commit e6ca1cd

File tree

2 files changed

+143
-28
lines changed

2 files changed

+143
-28
lines changed

src/cmd/builtin/fsck.c

Lines changed: 92 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,17 @@
2828

2929
#include "builtin.h"
3030

31+
#define BLOBREF_ASYNC_MAX 1000
32+
3133
static void fsck_treeobj (flux_t *h,
3234
const char *path,
3335
json_t *treeobj);
3436

37+
static void valref_load (flux_t *h,
38+
json_t *treeobj,
39+
int index,
40+
void *arg);
41+
3542
static bool verbose;
3643
static bool quiet;
3744
static int errorcount;
@@ -54,37 +61,104 @@ void read_error (const char *fmt, ...)
5461
va_end (ap);
5562
}
5663

57-
static void fsck_valref (flux_t *h,
58-
const char *path,
59-
json_t *treeobj)
64+
struct fsck_valref_data
65+
{
66+
flux_t *h;
67+
json_t *treeobj;
68+
int index;
69+
int count;
70+
int in_flight;
71+
const char *path;
72+
int errorcount;
73+
int errnum;
74+
};
75+
76+
/* Continuation invoked when one valref blob load completes.
 *
 * f   - future returned by content_load_byblobref(); destroyed here.
 * arg - the struct fsck_valref_data shared by all loads of this valref.
 *
 * On failure the error is counted and its errno recorded in fvd->errnum;
 * in verbose mode the failing blobref index is reported as well.  Whether
 * or not the load succeeded, one more load is issued if any blobrefs
 * remain, which refills the slot this load occupied.
 */
static void valref_load_continuation (flux_future_t *f, void *arg)
{
    struct fsck_valref_data *fvd = arg;
    const void *buf;
    size_t buflen;

    if (content_load_get (f, &buf, &buflen) < 0) {
        /* Capture errno immediately: the aux lookup and read_error()
         * below may modify it before it is examined or recorded.
         */
        int saved_errno = errno;

        if (verbose) {
            int *indexp = flux_future_aux_get (f, "index");
            /* valref_load() always attaches "index"; fall back to -1
             * rather than dereferencing NULL if the lookup fails.
             */
            int index = indexp ? *indexp : -1;

            if (saved_errno == ENOENT)
                read_error ("%s: missing blobref index=%d",
                            fvd->path,
                            index);
            else
                read_error ("%s: error retrieving blobref index=%d: %s",
                            fvd->path,
                            index,
                            future_strerror (f, saved_errno));
        }
        fvd->errorcount++;
        fvd->errnum = saved_errno; /* we'll report the last errno */
    }
    fvd->in_flight--;

    if (fvd->index < fvd->count) {
        valref_load (fvd->h, fvd->treeobj, fvd->index, fvd);
        fvd->in_flight++;
        fvd->index++;
    }

    flux_future_destroy (f);
}
108+
109+
/* Start an asynchronous load of blobref number 'index' of 'treeobj'.
 *
 * 'arg' is passed through to valref_load_continuation().  The index is
 * stored on the future under the aux key "index" (heap-allocated, freed
 * with the future) so the continuation can name the failing blobref.
 * Any failure to set up the request is fatal.
 */
static void valref_load (flux_t *h, json_t *treeobj, int index, void *arg)
{
    const char *blobref = treeobj_get_blobref (treeobj, index);
    flux_future_t *future;
    int *saved_index;

    future = content_load_byblobref (h, blobref, CONTENT_FLAG_CACHE_BYPASS);
    if (!future)
        log_err_exit ("cannot retrieve valref blob");
    if (flux_future_then (future, -1, valref_load_continuation, arg) < 0)
        log_err_exit ("cannot retrieve valref blob");

    saved_index = malloc (sizeof (*saved_index));
    if (!saved_index)
        log_err_exit ("cannot allocate index memory");
    *saved_index = index;

    if (flux_future_aux_set (future, "index", saved_index, free) < 0)
        log_err_exit ("could not save index value");
}
124+
125+
/* Verify that every blobref referenced by the valref 'treeobj' stored
 * at 'path' can be loaded.
 *
 * Up to BLOBREF_ASYNC_MAX loads are issued up front; each completed load
 * issues the next one from its continuation.  The reactor is then run
 * (presumably until no loads remain pending - confirm against
 * flux_reactor_run(3)).  If any blobref failed, the global errorcount is
 * incremented once for the whole valref.  In verbose mode each failing
 * blobref has already been reported by the continuation; otherwise a
 * single summary line is printed here.
 */
static void fsck_valref (flux_t *h, const char *path, json_t *treeobj)
{
    struct fsck_valref_data fvd = {0};

    fvd.h = h;
    fvd.treeobj = treeobj;
    fvd.count = treeobj_get_count (treeobj);
    fvd.path = path;

    /* N.B. unlike flux-dump(1) out of order returns do not matter
     * here since we only care about verification, not the data.
     */
    while (fvd.in_flight < BLOBREF_ASYNC_MAX
           && fvd.index < fvd.count) {
        valref_load (h, treeobj, fvd.index, &fvd);
        fvd.in_flight++;
        fvd.index++;
    }

    if (flux_reactor_run (flux_get_reactor (h), 0) < 0)
        log_err_exit ("flux_reactor_run");

    if (fvd.errorcount) {
        /* each invalid blobref will be output in verbose mode */
        if (!verbose) {
            /* Use the errno recorded by the continuation; the global
             * errno is stale by the time the reactor has returned.
             */
            if (fvd.errnum == ENOENT)
                read_error ("%s: missing blobref(s)", path);
            else
                read_error ("%s: error retrieving blobref(s): %s",
                            path,
                            strerror (fvd.errnum));
        }
        errorcount++;
    }
}
87160

161+
88162
static void fsck_val (flux_t *h,
89163
const char *path,
90164
json_t *treeobj)

t/t2816-fsck-cmd.t

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,24 +41,49 @@ test_expect_success 'unload kvs' '
4141
'
4242
test_expect_success 'flux-fsck works' '
4343
flux fsck 2> simple.out &&
44-
grep "Checking integrity" simple.out &&
45-
grep "Total errors: 0" simple.out
44+
grep "Checking integrity" simple.out &&
45+
grep "Total errors: 0" simple.out
4646
'
4747
test_expect_success 'flux-fsck verbose works' '
4848
flux fsck --verbose 2> verbose.out &&
4949
grep "dir\.a" verbose.out &&
5050
grep "dir\.b" verbose.out &&
5151
grep "alink" verbose.out
5252
'
53+
# Cover value with a very large number of appends
54+
# N.B. from 1000 to 3000 instead of 0 to 2000, easier to debug errors
55+
# using fold(1) (i.e. all numbers same width)
56+
test_expect_success LONGTEST 'load kvs and create some kvs content' '
57+
flux module load kvs &&
58+
for i in `seq 1000 3000`; do
59+
flux kvs put --append bigval=${i}
60+
done &&
61+
flux kvs get bigval > bigval.exp
62+
'
63+
test_expect_success LONGTEST 'call sync to ensure we have checkpointed' '
64+
flux kvs sync
65+
'
66+
test_expect_success LONGTEST 'unload kvs' '
67+
flux module remove kvs
68+
'
69+
test_expect_success LONGTEST 'flux-fsck works' '
70+
flux fsck --verbose 2> bigval.out &&
71+
grep "Checking integrity" bigval.out &&
72+
grep "bigval" bigval.out &&
73+
grep "Total errors: 0" bigval.out
74+
'
5375
test_expect_success 'load kvs' '
5476
flux module load kvs
5577
'
5678
# unfortunately we don't have a `flux content remove` command, so we'll corrupt
5779
# a valref by overwriting a treeobj with a bad reference
5880
test_expect_success 'make a reference invalid' '
59-
cat dirb.out | jq -c .data[2]=\"sha1-1234567890123456789012345678901234567890\" > dirbbad.out &&
81+
cat dirb.out | jq -c .data[1]=\"sha1-1234567890123456789012345678901234567890\" > dirbbad.out &&
6082
flux kvs put --treeobj dir.b="$(cat dirbbad.out)"
6183
'
84+
test_expect_success 'call sync to ensure we have checkpointed' '
85+
flux kvs sync
86+
'
6287
test_expect_success 'unload kvs' '
6388
flux module remove kvs
6489
'
@@ -67,8 +92,13 @@ test_expect_success 'flux-fsck detects errors' '
6792
test_must_fail flux fsck 2> fsckerrors1.out &&
6893
count=$(cat fsckerrors1.out | wc -l) &&
6994
test $count -eq 3 &&
70-
grep "dir\.b" fsckerrors1.out | grep "missing blobref" &&
71-
grep "Total errors: 1" fsckerrors1.out
95+
grep "dir\.b" fsckerrors1.out | grep "missing blobref(s)" &&
96+
grep "Total errors: 1" fsckerrors1.out
97+
'
98+
test_expect_success 'flux-fsck --verbose outputs details' '
99+
test_must_fail flux fsck --verbose 2> fsckerrors1V.out &&
100+
grep "dir\.b" fsckerrors1V.out | grep "missing blobref" | grep "index=1" &&
101+
grep "Total errors: 1" fsckerrors1V.out
72102
'
73103
test_expect_success 'flux-fsck no output with --quiet' '
74104
test_must_fail flux fsck --quiet 2> fsckerrors2.out &&
@@ -79,8 +109,12 @@ test_expect_success 'load kvs' '
79109
flux module load kvs
80110
'
81111
test_expect_success 'make a reference invalid' '
82-
cat dirc.out | jq -c .data[2]=\"sha1-1234567890123456789012345678901234567890\" > dircbad.out &&
83-
flux kvs put --treeobj dir.c="$(cat dircbad.out)"
112+
cat dirc.out | jq -c .data[1]=\"sha1-1234567890123456789012345678901234567890\" > dircbad1.out &&
113+
cat dircbad1.out | jq -c .data[2]=\"sha1-1234567890123456789012345678901234567890\" > dircbad2.out &&
114+
flux kvs put --treeobj dir.c="$(cat dircbad2.out)"
115+
'
116+
test_expect_success 'call sync to ensure we have checkpointed' '
117+
flux kvs sync
84118
'
85119
test_expect_success 'unload kvs' '
86120
flux module remove kvs
@@ -90,9 +124,16 @@ test_expect_success 'flux-fsck detects errors' '
90124
test_must_fail flux fsck 2> fsckerrors3.out &&
91125
count=$(cat fsckerrors3.out | wc -l) &&
92126
test $count -eq 4 &&
93-
grep "dir\.b" fsckerrors3.out | grep "missing blobref" &&
94-
grep "dir\.c" fsckerrors3.out | grep "missing blobref" &&
95-
grep "Total errors: 2" fsckerrors3.out
127+
grep "dir\.b" fsckerrors3.out | grep "missing blobref(s)" &&
128+
grep "dir\.c" fsckerrors3.out | grep "missing blobref(s)" &&
129+
grep "Total errors: 2" fsckerrors3.out
130+
'
131+
test_expect_success 'flux-fsck --verbose outputs details' '
132+
test_must_fail flux fsck --verbose 2> fsckerrors3V.out &&
133+
grep "dir\.b" fsckerrors3V.out | grep "missing blobref" | grep "index=1" &&
134+
grep "dir\.c" fsckerrors3V.out | grep "missing blobref" | grep "index=1" &&
135+
grep "dir\.c" fsckerrors3V.out | grep "missing blobref" | grep "index=2" &&
136+
grep "Total errors: 2" fsckerrors3V.out
96137
'
97138
test_expect_success 'flux-fsck no output with --quiet' '
98139
test_must_fail flux fsck --quiet 2> fsckerrors4.out &&

0 commit comments

Comments
 (0)