Skip to content

Commit f634f41

Browse files
zx2c4davem330
authored andcommitted
wireguard: allowedips: remove nodes in O(1)
Previously, deleting peers would require traversing the entire trie in order to rebalance nodes and safely free them. This meant that removing 1000 peers from a trie with a half million nodes would take an extremely long time, during which we're holding the rtnl lock. Large-scale users were reporting 200ms latencies added to the networking stack as a whole every time their userspace software would queue up significant removals. That's a serious situation. This commit fixes that by maintaining a double pointer to the parent's bit pointer for each node, and then using the already existing node list belonging to each peer to go directly to the node, fix up its pointers, and free it with RCU. This means removal is O(1) instead of O(n), and we don't use gobs of stack. The removal algorithm has the same downside as the code that it fixes: it won't collapse needlessly long runs of fillers. We can enhance that in the future if it ever becomes a problem. This commit documents that limitation with a TODO comment in code, a small but meaningful improvement over the prior situation. Currently the biggest flaw, which the next commit addresses, is that because this increases the node size on 64-bit machines from 60 bytes to 68 bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up using twice as much memory per node, because of power-of-two allocations, which is a big bummer. We'll need to figure something out there. Fixes: e7096c1 ("net: WireGuard secure network tunnel") Cc: [email protected] Signed-off-by: Jason A. Donenfeld <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 46cfe8e commit f634f41

File tree

2 files changed

+57
-84
lines changed

2 files changed

+57
-84
lines changed

drivers/net/wireguard/allowedips.c

Lines changed: 54 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struct allowedips_node *root)
6666
}
6767
}
6868

69-
static void walk_remove_by_peer(struct allowedips_node __rcu **top,
70-
struct wg_peer *peer, struct mutex *lock)
71-
{
72-
#define REF(p) rcu_access_pointer(p)
73-
#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
74-
#define PUSH(p) ({ \
75-
WARN_ON(IS_ENABLED(DEBUG) && len >= 128); \
76-
stack[len++] = p; \
77-
})
78-
79-
struct allowedips_node __rcu **stack[128], **nptr;
80-
struct allowedips_node *node, *prev;
81-
unsigned int len;
82-
83-
if (unlikely(!peer || !REF(*top)))
84-
return;
85-
86-
for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
87-
nptr = stack[len - 1];
88-
node = DEREF(nptr);
89-
if (!node) {
90-
--len;
91-
continue;
92-
}
93-
if (!prev || REF(prev->bit[0]) == node ||
94-
REF(prev->bit[1]) == node) {
95-
if (REF(node->bit[0]))
96-
PUSH(&node->bit[0]);
97-
else if (REF(node->bit[1]))
98-
PUSH(&node->bit[1]);
99-
} else if (REF(node->bit[0]) == prev) {
100-
if (REF(node->bit[1]))
101-
PUSH(&node->bit[1]);
102-
} else {
103-
if (rcu_dereference_protected(node->peer,
104-
lockdep_is_held(lock)) == peer) {
105-
RCU_INIT_POINTER(node->peer, NULL);
106-
list_del_init(&node->peer_list);
107-
if (!node->bit[0] || !node->bit[1]) {
108-
rcu_assign_pointer(*nptr, DEREF(
109-
&node->bit[!REF(node->bit[0])]));
110-
kfree_rcu(node, rcu);
111-
node = DEREF(nptr);
112-
}
113-
}
114-
--len;
115-
}
116-
}
117-
118-
#undef REF
119-
#undef DEREF
120-
#undef PUSH
121-
}
122-
12369
static unsigned int fls128(u64 a, u64 b)
12470
{
12571
return a ? fls64(a) + 64U : fls64(b);
@@ -224,6 +170,7 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
224170
RCU_INIT_POINTER(node->peer, peer);
225171
list_add_tail(&node->peer_list, &peer->allowedips_list);
226172
copy_and_assign_cidr(node, key, cidr, bits);
173+
rcu_assign_pointer(node->parent_bit, trie);
227174
rcu_assign_pointer(*trie, node);
228175
return 0;
229176
}
@@ -243,9 +190,9 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
243190
if (!node) {
244191
down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
245192
} else {
246-
down = rcu_dereference_protected(CHOOSE_NODE(node, key),
247-
lockdep_is_held(lock));
193+
down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
248194
if (!down) {
195+
rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
249196
rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
250197
return 0;
251198
}
@@ -254,29 +201,37 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
254201
parent = node;
255202

256203
if (newnode->cidr == cidr) {
204+
rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
257205
rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
258-
if (!parent)
206+
if (!parent) {
207+
rcu_assign_pointer(newnode->parent_bit, trie);
259208
rcu_assign_pointer(*trie, newnode);
260-
else
261-
rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
262-
newnode);
263-
} else {
264-
node = kzalloc(sizeof(*node), GFP_KERNEL);
265-
if (unlikely(!node)) {
266-
list_del(&newnode->peer_list);
267-
kfree(newnode);
268-
return -ENOMEM;
209+
} else {
210+
rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
211+
rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
269212
}
270-
INIT_LIST_HEAD(&node->peer_list);
271-
copy_and_assign_cidr(node, newnode->bits, cidr, bits);
272-
273-
rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
274-
rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
275-
if (!parent)
276-
rcu_assign_pointer(*trie, node);
277-
else
278-
rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
279-
node);
213+
return 0;
214+
}
215+
216+
node = kzalloc(sizeof(*node), GFP_KERNEL);
217+
if (unlikely(!node)) {
218+
list_del(&newnode->peer_list);
219+
kfree(newnode);
220+
return -ENOMEM;
221+
}
222+
INIT_LIST_HEAD(&node->peer_list);
223+
copy_and_assign_cidr(node, newnode->bits, cidr, bits);
224+
225+
rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
226+
rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
227+
rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
228+
rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
229+
if (!parent) {
230+
rcu_assign_pointer(node->parent_bit, trie);
231+
rcu_assign_pointer(*trie, node);
232+
} else {
233+
rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
234+
rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
280235
}
281236
return 0;
282237
}
@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allowedips *table, const struct in6_addr *ip,
335290
void wg_allowedips_remove_by_peer(struct allowedips *table,
336291
struct wg_peer *peer, struct mutex *lock)
337292
{
293+
struct allowedips_node *node, *child, *tmp;
294+
295+
if (list_empty(&peer->allowedips_list))
296+
return;
338297
++table->seq;
339-
walk_remove_by_peer(&table->root4, peer, lock);
340-
walk_remove_by_peer(&table->root6, peer, lock);
298+
list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
299+
list_del_init(&node->peer_list);
300+
RCU_INIT_POINTER(node->peer, NULL);
301+
if (node->bit[0] && node->bit[1])
302+
continue;
303+
child = rcu_dereference_protected(
304+
node->bit[!rcu_access_pointer(node->bit[0])],
305+
lockdep_is_held(lock));
306+
if (child)
307+
child->parent_bit = node->parent_bit;
308+
*rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
309+
kfree_rcu(node, rcu);
310+
311+
/* TODO: Note that we currently don't walk up and down in order to
312+
* free any potential filler nodes. This means that this function
313+
* doesn't free up as much as it could, which could be revisited
314+
* at some point.
315+
*/
316+
}
341317
}
342318

343319
int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)

drivers/net/wireguard/allowedips.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,11 @@ struct wg_peer;
1515
struct allowedips_node {
1616
struct wg_peer __rcu *peer;
1717
struct allowedips_node __rcu *bit[2];
18-
/* While it may seem scandalous that we waste space for v4,
19-
* we're alloc'ing to the nearest power of 2 anyway, so this
20-
* doesn't actually make a difference.
21-
*/
22-
u8 bits[16] __aligned(__alignof(u64));
2318
u8 cidr, bit_at_a, bit_at_b, bitlen;
19+
u8 bits[16] __aligned(__alignof(u64));
2420

25-
/* Keep rarely used list at bottom to be beyond cache line. */
21+
/* Keep rarely used members at bottom to be beyond cache line. */
22+
struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
2623
union {
2724
struct list_head peer_list;
2825
struct rcu_head rcu;

0 commit comments

Comments
 (0)