
Commit f540876

Kent Overstreet authored and committed
bcachefs: Fix striping behaviour
For striping across devices, we maintain "clocks", and we advance them by the inverse of "how much free space this device has left", so that we round robin biased in favor of devices with more free space.

This code originally tried to do EWMA-ish stuff when it was written, ~10 years ago, and was never properly cleaned up when it was realized that an EWMA is not the right approach here.

That left a bug in the rescale that keeps all the clocks in the correct range and prevents overflow. It was assumed that we'd always be allocating from the device with the smallest clock hand, but that's actually not correct: with the target options, allocations are first tried from a subset of devices, and then from the entire filesystem if that fails.

Thus, the rescale from the first allocation - allocating from a subset of devices - can pick the wrong rescale value and cause the rest of the clocks to go to 0, losing information. This results in incorrect striping behaviour when the desired number of replicas doesn't fit on the foreground target.

Link: https://www.reddit.com/r/bcachefs/comments/1jn3t26/replica_allocation_not_evenly_distributed_among/
Signed-off-by: Kent Overstreet <[email protected]>
1 parent 650f535 commit f540876
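To make the clock-hand scheme concrete, here is a minimal userspace sketch of the idea. The types, constants, and free-space numbers are invented for illustration; this is not the real bcachefs code, which lives in fs/bcachefs/alloc_foreground.c. Each device gets a clock hand, allocation goes to the device with the smallest hand, and that hand is advanced by the inverse of the device's free space, so allocations land roughly in proportion to free space.

/*
 * Toy model of clock-hand striping: invented types and numbers,
 * not the real bcachefs structures.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_DEVS 3

/* Free buckets on each toy device; device 2 has twice the space of device 0 */
static const uint64_t free_space[NR_DEVS] = { 1000, 1500, 2000 };

/* Per-device clock hands */
static uint64_t next_alloc[NR_DEVS];

/* Allocate from the device with the smallest clock hand */
static unsigned pick_dev(void)
{
	unsigned best = 0;

	for (unsigned i = 1; i < NR_DEVS; i++)
		if (next_alloc[i] < next_alloc[best])
			best = i;
	return best;
}

/*
 * Advance the chosen device's hand by the inverse of its free space: devices
 * with more free space advance more slowly, so they are picked more often.
 */
static void stripe_increment(unsigned dev)
{
	uint64_t inv = free_space[dev]
		? (1ULL << 20) / free_space[dev]
		: 1ULL << 20;	/* empty device: maximum increment */

	next_alloc[dev] += inv;
}

int main(void)
{
	unsigned counts[NR_DEVS] = { 0 };

	for (int i = 0; i < 9000; i++) {
		unsigned dev = pick_dev();
		counts[dev]++;
		stripe_increment(dev);
	}

	/* Expect roughly 2000/3000/4000, i.e. proportional to free space */
	for (unsigned i = 0; i < NR_DEVS; i++)
		printf("dev %u: %u allocations\n", i, counts[i]);
	return 0;
}

The bug fixed by this commit is not in the increment itself but in how the hands were periodically rescaled to keep them in range, as the diff below shows.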

File tree

1 file changed: +48 -12 lines changed


fs/bcachefs/alloc_foreground.c

Lines changed: 48 additions & 12 deletions
@@ -606,8 +606,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
 			    unsigned l, unsigned r)
 {
-	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
-		(stripe->next_alloc[l] < stripe->next_alloc[r]));
+	return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
 }
 
 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
@@ -626,25 +625,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
 	return ret;
 }
 
+static const u64 stripe_clock_hand_rescale	= 1ULL << 62; /* trigger rescale at */
+static const u64 stripe_clock_hand_max		= 1ULL << 56; /* max after rescale */
+static const u64 stripe_clock_hand_inv		= 1ULL << 52; /* max increment, if a device is empty */
+
+static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
+{
+	/*
+	 * Avoid underflowing clock hands if at all possible, if clock hands go
+	 * to 0 then we lose information - clock hands can be in a wide range if
+	 * we have devices we rarely try to allocate from, if we generally
+	 * allocate from a specified target but only sometimes have to fall back
+	 * to the whole filesystem.
+	 */
+	u64 scale_max = U64_MAX;	/* maximum we can subtract without underflow */
+	u64 scale_min = 0;		/* minimum we must subtract to avoid overflow */
+
+	for (u64 *v = stripe->next_alloc;
+	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
+		if (*v)
+			scale_max = min(scale_max, *v);
+		if (*v > stripe_clock_hand_max)
+			scale_min = max(scale_min, *v - stripe_clock_hand_max);
+	}
+
+	u64 scale = max(scale_min, scale_max);
+
+	for (u64 *v = stripe->next_alloc;
+	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+		*v = *v < scale ? 0 : *v - scale;
+}
+
 static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
 			       struct dev_stripe_state *stripe,
 			       struct bch_dev_usage *usage)
 {
+	/*
+	 * Stripe state has a per device clock hand: we allocate from the device
+	 * with the smallest clock hand.
+	 *
+	 * When we allocate, we don't do a simple increment; we add the inverse
+	 * of the device's free space. This results in round robin behavior that
+	 * biases in favor of the device(s) with more free space.
+	 */
+
 	u64 *v = stripe->next_alloc + ca->dev_idx;
 	u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
 	u64 free_space_inv = free_space
-		? div64_u64(1ULL << 48, free_space)
-		: 1ULL << 48;
-	u64 scale = *v / 4;
+		? div64_u64(stripe_clock_hand_inv, free_space)
+		: stripe_clock_hand_inv;
 
-	if (*v + free_space_inv >= *v)
-		*v += free_space_inv;
-	else
-		*v = U64_MAX;
+	/* Saturating add, avoid overflow: */
+	u64 sum = *v + free_space_inv;
+	*v = sum >= *v ? sum : U64_MAX;
 
-	for (v = stripe->next_alloc;
-	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
-		*v = *v < scale ? 0 : *v - scale;
+	if (unlikely(*v > stripe_clock_hand_rescale))
+		bch2_stripe_state_rescale(stripe);
 }
 
 void bch2_dev_stripe_increment(struct bch_dev *ca,
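As a rough worked example of the rescale change, here is a userspace sketch with made-up clock-hand values: four toy devices, where devices 0 and 1 are in the foreground target and devices 2 and 3 are outside it. It only mimics the subtraction arithmetic from the diff above; the real code operates on struct dev_stripe_state and only triggers the rescale once a hand passes stripe_clock_hand_rescale.

#include <stdint.h>
#include <stdio.h>

#define NR_DEVS 4

/*
 * Old behaviour: after allocating from device 'dev', subtract a quarter of
 * that device's hand from every hand. If 'dev' (picked from the foreground
 * target) has a much larger hand than the devices outside the target, their
 * hands all clamp to 0 and the differences between them are lost.
 */
static void rescale_old(uint64_t *hands, unsigned dev)
{
	uint64_t scale = hands[dev] / 4;

	for (unsigned i = 0; i < NR_DEVS; i++)
		hands[i] = hands[i] < scale ? 0 : hands[i] - scale;
}

/*
 * New behaviour: subtract the smallest nonzero hand, raised only as far as
 * needed to bring every hand under hand_max, preserving the gaps between
 * hands. In the kernel this only runs once a hand passes a large threshold;
 * here it is called unconditionally just to show the arithmetic.
 */
static void rescale_new(uint64_t *hands, uint64_t hand_max)
{
	uint64_t scale_max = UINT64_MAX;	/* most we can subtract without underflow */
	uint64_t scale_min = 0;			/* least we must subtract to stay under hand_max */

	for (unsigned i = 0; i < NR_DEVS; i++) {
		if (hands[i] && hands[i] < scale_max)
			scale_max = hands[i];
		if (hands[i] > hand_max && hands[i] - hand_max > scale_min)
			scale_min = hands[i] - hand_max;
	}

	uint64_t scale = scale_min > scale_max ? scale_min : scale_max;

	for (unsigned i = 0; i < NR_DEVS; i++)
		hands[i] = hands[i] < scale ? 0 : hands[i] - scale;
}

int main(void)
{
	/* Devices 2 and 3 are outside the target, so their hands lag behind */
	uint64_t old_hands[NR_DEVS] = { 4000, 4100, 100, 300 };
	uint64_t new_hands[NR_DEVS] = { 4000, 4100, 100, 300 };

	rescale_old(old_hands, 0);	/* just allocated from device 0 */
	rescale_new(new_hands, 1ULL << 56);

	for (unsigned i = 0; i < NR_DEVS; i++)
		printf("dev %u: old %4llu  new %4llu\n", i,
		       (unsigned long long)old_hands[i],
		       (unsigned long long)new_hands[i]);
	return 0;
}

With the old arithmetic, scale is 4000 / 4 = 1000, so devices 2 and 3 both clamp to 0 and the next fallback allocation can no longer tell which of them should be preferred; with the new arithmetic, scale is the smallest nonzero hand (100), and the 200-unit gap between devices 2 and 3 survives.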
