Skip to content

Commit cec199c

Browse files
author
Peter Zijlstra
committed
futex: Implement FUTEX2_NUMA
Extend the futex2 interface to be NUMA-aware. When FUTEX2_NUMA is specified for a futex, the user value is extended to two words (of the same size). The first is the familiar user value; the second is the node on which to place this futex. For example, for a 32-bit futex: struct futex_numa_32 { u32 val; u32 node; }; When node is set to ~0 (FUTEX_NO_NODE, i.e. -1), WAIT will set it to the current node_id such that WAKE knows where to find it. If userspace corrupts the node value between WAIT and WAKE, the futex will not be found and no wakeup will happen. When FUTEX2_NUMA is not set, the node is simply an extension of the hash, such that traditional futexes are still interleaved over the nodes. This is done to avoid needing a separate !numa hash-table. [bigeasy: ensure a hashsize of at least 4 in futex_init(), add pr_info() for size and allocation information. Cast the naddr math to void*] Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Signed-off-by: Sebastian Andrzej Siewior <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 63e8595 commit cec199c

File tree

4 files changed

+123
-20
lines changed

4 files changed

+123
-20
lines changed

include/linux/futex.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ union futex_key {
3434
u64 i_seq;
3535
unsigned long pgoff;
3636
unsigned int offset;
37+
/* unsigned int node; */
3738
} shared;
3839
struct {
3940
union {
@@ -42,11 +43,13 @@ union futex_key {
4243
};
4344
unsigned long address;
4445
unsigned int offset;
46+
/* unsigned int node; */
4547
} private;
4648
struct {
4749
u64 ptr;
4850
unsigned long word;
4951
unsigned int offset;
52+
unsigned int node; /* NOT hashed! */
5053
} both;
5154
};
5255

include/uapi/linux/futex.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,13 @@
7474
/* do not use */
7575
#define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */
7676

77+
/*
78+
* When FUTEX2_NUMA doubles the futex word, the second word is a node value.
79+
* The special value -1 indicates no-node. This is the same value as
80+
* NUMA_NO_NODE, except that value is not ABI, this is.
81+
*/
82+
#define FUTEX_NO_NODE (-1)
83+
7784
/*
7885
* Max numbers of elements in a futex_waitv array
7986
*/

kernel/futex/core.c

Lines changed: 83 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
#include <linux/pagemap.h>
3737
#include <linux/debugfs.h>
3838
#include <linux/plist.h>
39+
#include <linux/gfp.h>
40+
#include <linux/vmalloc.h>
3941
#include <linux/memblock.h>
4042
#include <linux/fault-inject.h>
4143
#include <linux/slab.h>
@@ -51,11 +53,14 @@
5153
* reside in the same cacheline.
5254
*/
5355
static struct {
54-
struct futex_hash_bucket *queues;
5556
unsigned long hashmask;
57+
unsigned int hashshift;
58+
struct futex_hash_bucket *queues[MAX_NUMNODES];
5659
} __futex_data __read_mostly __aligned(2*sizeof(long));
57-
#define futex_queues (__futex_data.queues)
58-
#define futex_hashmask (__futex_data.hashmask)
60+
61+
#define futex_hashmask (__futex_data.hashmask)
62+
#define futex_hashshift (__futex_data.hashshift)
63+
#define futex_queues (__futex_data.queues)
5964

6065
struct futex_private_hash {
6166
rcuref_t users;
@@ -339,15 +344,35 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
339344
{
340345
struct futex_hash_bucket *hb;
341346
u32 hash;
347+
int node;
342348

343349
hb = __futex_hash_private(key, fph);
344350
if (hb)
345351
return hb;
346352

347353
hash = jhash2((u32 *)key,
348-
offsetof(typeof(*key), both.offset) / 4,
354+
offsetof(typeof(*key), both.offset) / sizeof(u32),
349355
key->both.offset);
350-
return &futex_queues[hash & futex_hashmask];
356+
node = key->both.node;
357+
358+
if (node == FUTEX_NO_NODE) {
359+
/*
360+
* In case of !FLAGS_NUMA, use some unused hash bits to pick a
361+
* node -- this ensures regular futexes are interleaved across
362+
* the nodes and avoids having to allocate multiple
363+
* hash-tables.
364+
*
365+
* NOTE: this isn't perfectly uniform, but it is fast and
366+
* handles sparse node masks.
367+
*/
368+
node = (hash >> futex_hashshift) % nr_node_ids;
369+
if (!node_possible(node)) {
370+
node = find_next_bit_wrap(node_possible_map.bits,
371+
nr_node_ids, node);
372+
}
373+
}
374+
375+
return &futex_queues[node][hash & futex_hashmask];
351376
}
352377

353378
/**
@@ -454,25 +479,49 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
454479
struct page *page;
455480
struct folio *folio;
456481
struct address_space *mapping;
457-
int err, ro = 0;
482+
int node, err, size, ro = 0;
458483
bool fshared;
459484

460485
fshared = flags & FLAGS_SHARED;
486+
size = futex_size(flags);
487+
if (flags & FLAGS_NUMA)
488+
size *= 2;
461489

462490
/*
463491
* The futex address must be "naturally" aligned.
464492
*/
465493
key->both.offset = address % PAGE_SIZE;
466-
if (unlikely((address % sizeof(u32)) != 0))
494+
if (unlikely((address % size) != 0))
467495
return -EINVAL;
468496
address -= key->both.offset;
469497

470-
if (unlikely(!access_ok(uaddr, sizeof(u32))))
498+
if (unlikely(!access_ok(uaddr, size)))
471499
return -EFAULT;
472500

473501
if (unlikely(should_fail_futex(fshared)))
474502
return -EFAULT;
475503

504+
if (flags & FLAGS_NUMA) {
505+
u32 __user *naddr = (void *)uaddr + size / 2;
506+
507+
if (futex_get_value(&node, naddr))
508+
return -EFAULT;
509+
510+
if (node == FUTEX_NO_NODE) {
511+
node = numa_node_id();
512+
if (futex_put_value(node, naddr))
513+
return -EFAULT;
514+
515+
} else if (node >= MAX_NUMNODES || !node_possible(node)) {
516+
return -EINVAL;
517+
}
518+
519+
key->both.node = node;
520+
521+
} else {
522+
key->both.node = FUTEX_NO_NODE;
523+
}
524+
476525
/*
477526
* PROCESS_PRIVATE futexes are fast.
478527
* As the mm cannot disappear under us and the 'key' only needs
@@ -1642,24 +1691,41 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
16421691
static int __init futex_init(void)
16431692
{
16441693
unsigned long hashsize, i;
1645-
unsigned int futex_shift;
1694+
unsigned int order, n;
1695+
unsigned long size;
16461696

16471697
#ifdef CONFIG_BASE_SMALL
16481698
hashsize = 16;
16491699
#else
1650-
hashsize = roundup_pow_of_two(256 * num_possible_cpus());
1700+
hashsize = 256 * num_possible_cpus();
1701+
hashsize /= num_possible_nodes();
1702+
hashsize = max(4, hashsize);
1703+
hashsize = roundup_pow_of_two(hashsize);
16511704
#endif
1705+
futex_hashshift = ilog2(hashsize);
1706+
size = sizeof(struct futex_hash_bucket) * hashsize;
1707+
order = get_order(size);
16521708

1653-
futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
1654-
hashsize, 0, 0,
1655-
&futex_shift, NULL,
1656-
hashsize, hashsize);
1657-
hashsize = 1UL << futex_shift;
1709+
for_each_node(n) {
1710+
struct futex_hash_bucket *table;
16581711

1659-
for (i = 0; i < hashsize; i++)
1660-
futex_hash_bucket_init(&futex_queues[i], NULL);
1712+
if (order > MAX_PAGE_ORDER)
1713+
table = vmalloc_huge_node(size, GFP_KERNEL, n);
1714+
else
1715+
table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
1716+
1717+
BUG_ON(!table);
1718+
1719+
for (i = 0; i < hashsize; i++)
1720+
futex_hash_bucket_init(&table[i], NULL);
1721+
1722+
futex_queues[n] = table;
1723+
}
16611724

16621725
futex_hashmask = hashsize - 1;
1726+
pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
1727+
hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
1728+
order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
16631729
return 0;
16641730
}
16651731
core_initcall(futex_init);

kernel/futex/futex.h

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static inline unsigned int futex_to_flags(unsigned int op)
5454
return flags;
5555
}
5656

57-
#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
57+
#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE)
5858

5959
/* FUTEX2_ to FLAGS_ */
6060
static inline unsigned int futex2_to_flags(unsigned int flags2)
@@ -87,6 +87,19 @@ static inline bool futex_flags_valid(unsigned int flags)
8787
if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
8888
return false;
8989

90+
/*
91+
* Must be able to represent both FUTEX_NO_NODE and every valid nodeid
92+
* in a futex word.
93+
*/
94+
if (flags & FLAGS_NUMA) {
95+
int bits = 8 * futex_size(flags);
96+
u64 max = ~0ULL;
97+
98+
max >>= 64 - bits;
99+
if (nr_node_ids >= max)
100+
return false;
101+
}
102+
90103
return true;
91104
}
92105

@@ -282,7 +295,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32
282295
* This looks a bit overkill, but generally just results in a couple
283296
* of instructions.
284297
*/
285-
static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
298+
static __always_inline int futex_get_value(u32 *dest, u32 __user *from)
286299
{
287300
u32 val;
288301

@@ -299,12 +312,26 @@ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
299312
return -EFAULT;
300313
}
301314

315+
/*
 * futex_put_value() - store a 32-bit value to a user-space futex word.
 * @val: value to store
 * @to:  user-space destination address
 *
 * Counterpart of futex_get_value() for the store direction.
 *
 * The store is done with unsafe_put_user(), so the user access window must
 * be opened and closed with the *write* variants. Using the read variants
 * here would leave write access disabled on architectures that distinguish
 * read and write user access windows, making the store fault.
 *
 * Return: 0 on success, -EFAULT if the access could not be opened or the
 * store faulted.
 */
static __always_inline int futex_put_value(u32 val, u32 __user *to)
{
	if (can_do_masked_user_access())
		to = masked_user_access_begin(to);
	else if (!user_write_access_begin(to, sizeof(*to)))
		return -EFAULT;

	/* Jumps to Efault if the store faults. */
	unsafe_put_user(val, to, Efault);
	user_write_access_end();
	return 0;

Efault:
	/* The access window is still open on the fault path; close it. */
	user_write_access_end();
	return -EFAULT;
}
328+
302329
static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
303330
{
304331
int ret;
305332

306333
pagefault_disable();
307-
ret = futex_read_inatomic(dest, from);
334+
ret = futex_get_value(dest, from);
308335
pagefault_enable();
309336

310337
return ret;

0 commit comments

Comments
 (0)