Skip to content

Commit 0873882

Browse files
author
Florian Westphal
committed
netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure
nft_rbtree_gc_elem() walks back and removes the end interval element that comes before the expired element. There is a small chance that we've cached this element as 'rbe_ge'. If this happens, we hold and test a pointer that has been queued for freeing. It also causes spurious insertion failures: $ cat test-testcases-sets-0044interval_overlap_0.1/testout.log Error: Could not process rule: File exists add element t s { 0 - 2 } ^^^^^^ Failed to insert 0 - 2 given: table ip t { set s { type inet_service flags interval,timeout timeout 2s gc-interval 2s } } The set (rbtree) is empty. The 'failure' doesn't happen on next attempt. Reason is that when we try to insert, the tree may hold an expired element that collides with the range we're adding. While we do evict/erase this element, we can trip over this check: if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; rbe_ge was erased by the synchronous gc, we should not have done this check. Next attempt won't find it, so retry results in successful insertion. Restart in-kernel to avoid such spurious errors. Such restart are rare, unless userspace intentionally adds very large numbers of elements with very short timeouts while setting a huge gc interval. Even in this case, this cannot loop forever, on each retry an existing element has been removed. As the caller is holding the transaction mutex, its impossible for a second entity to add more expiring elements to the tree. After this it also becomes feasible to remove the async gc worker and perform all garbage collection from the commit path. Fixes: c9e6978 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Signed-off-by: Florian Westphal <[email protected]>
1 parent 0d880dc commit 0873882

File tree

1 file changed

+29
-17
lines changed

1 file changed

+29
-17
lines changed

net/netfilter/nft_set_rbtree.c

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -233,10 +233,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
233233
rb_erase(&rbe->node, &priv->root);
234234
}
235235

236-
static int nft_rbtree_gc_elem(const struct nft_set *__set,
237-
struct nft_rbtree *priv,
238-
struct nft_rbtree_elem *rbe,
239-
u8 genmask)
236+
static const struct nft_rbtree_elem *
237+
nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
238+
struct nft_rbtree_elem *rbe, u8 genmask)
240239
{
241240
struct nft_set *set = (struct nft_set *)__set;
242241
struct rb_node *prev = rb_prev(&rbe->node);
@@ -246,7 +245,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
246245

247246
gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
248247
if (!gc)
249-
return -ENOMEM;
248+
return ERR_PTR(-ENOMEM);
250249

251250
/* search for end interval coming before this element.
252251
* end intervals don't carry a timeout extension, they
@@ -261,6 +260,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
261260
prev = rb_prev(prev);
262261
}
263262

263+
rbe_prev = NULL;
264264
if (prev) {
265265
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
266266
nft_rbtree_gc_remove(net, set, priv, rbe_prev);
@@ -272,21 +272,21 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
272272
*/
273273
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
274274
if (WARN_ON_ONCE(!gc))
275-
return -ENOMEM;
275+
return ERR_PTR(-ENOMEM);
276276

277277
nft_trans_gc_elem_add(gc, rbe_prev);
278278
}
279279

280280
nft_rbtree_gc_remove(net, set, priv, rbe);
281281
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
282282
if (WARN_ON_ONCE(!gc))
283-
return -ENOMEM;
283+
return ERR_PTR(-ENOMEM);
284284

285285
nft_trans_gc_elem_add(gc, rbe);
286286

287287
nft_trans_gc_queue_sync_done(gc);
288288

289-
return 0;
289+
return rbe_prev;
290290
}
291291

292292
static bool nft_rbtree_update_first(const struct nft_set *set,
@@ -314,7 +314,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
314314
struct nft_rbtree *priv = nft_set_priv(set);
315315
u8 cur_genmask = nft_genmask_cur(net);
316316
u8 genmask = nft_genmask_next(net);
317-
int d, err;
317+
int d;
318318

319319
/* Descend the tree to search for an existing element greater than the
320320
* key value to insert that is greater than the new element. This is the
@@ -363,9 +363,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
363363
*/
364364
if (nft_set_elem_expired(&rbe->ext) &&
365365
nft_set_elem_active(&rbe->ext, cur_genmask)) {
366-
err = nft_rbtree_gc_elem(set, priv, rbe, genmask);
367-
if (err < 0)
368-
return err;
366+
const struct nft_rbtree_elem *removed_end;
367+
368+
removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask);
369+
if (IS_ERR(removed_end))
370+
return PTR_ERR(removed_end);
371+
372+
if (removed_end == rbe_le || removed_end == rbe_ge)
373+
return -EAGAIN;
369374

370375
continue;
371376
}
@@ -486,11 +491,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
486491
struct nft_rbtree_elem *rbe = elem->priv;
487492
int err;
488493

489-
write_lock_bh(&priv->lock);
490-
write_seqcount_begin(&priv->count);
491-
err = __nft_rbtree_insert(net, set, rbe, ext);
492-
write_seqcount_end(&priv->count);
493-
write_unlock_bh(&priv->lock);
494+
do {
495+
if (fatal_signal_pending(current))
496+
return -EINTR;
497+
498+
cond_resched();
499+
500+
write_lock_bh(&priv->lock);
501+
write_seqcount_begin(&priv->count);
502+
err = __nft_rbtree_insert(net, set, rbe, ext);
503+
write_seqcount_end(&priv->count);
504+
write_unlock_bh(&priv->lock);
505+
} while (err == -EAGAIN);
494506

495507
return err;
496508
}

0 commit comments

Comments
 (0)