|
36 | 36 | #include <linux/pagemap.h>
|
37 | 37 | #include <linux/debugfs.h>
|
38 | 38 | #include <linux/plist.h>
|
| 39 | +#include <linux/gfp.h> |
| 40 | +#include <linux/vmalloc.h> |
39 | 41 | #include <linux/memblock.h>
|
40 | 42 | #include <linux/fault-inject.h>
|
41 | 43 | #include <linux/slab.h>
|
|
51 | 53 | * reside in the same cacheline.
|
52 | 54 | */
|
53 | 55 | static struct {
|
54 |
| - struct futex_hash_bucket *queues; |
55 | 56 | unsigned long hashmask;
|
| 57 | + unsigned int hashshift; |
| 58 | + struct futex_hash_bucket *queues[MAX_NUMNODES]; |
56 | 59 | } __futex_data __read_mostly __aligned(2*sizeof(long));
|
57 |
| -#define futex_queues (__futex_data.queues) |
58 |
| -#define futex_hashmask (__futex_data.hashmask) |
| 60 | + |
| 61 | +#define futex_hashmask (__futex_data.hashmask) |
| 62 | +#define futex_hashshift (__futex_data.hashshift) |
| 63 | +#define futex_queues (__futex_data.queues) |
59 | 64 |
|
60 | 65 | struct futex_private_hash {
|
61 | 66 | rcuref_t users;
|
@@ -339,15 +344,35 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph)
|
339 | 344 | {
|
340 | 345 | struct futex_hash_bucket *hb;
|
341 | 346 | u32 hash;
|
| 347 | + int node; |
342 | 348 |
|
343 | 349 | hb = __futex_hash_private(key, fph);
|
344 | 350 | if (hb)
|
345 | 351 | return hb;
|
346 | 352 |
|
347 | 353 | hash = jhash2((u32 *)key,
|
348 |
| - offsetof(typeof(*key), both.offset) / 4, |
| 354 | + offsetof(typeof(*key), both.offset) / sizeof(u32), |
349 | 355 | key->both.offset);
|
350 |
| - return &futex_queues[hash & futex_hashmask]; |
| 356 | + node = key->both.node; |
| 357 | + |
| 358 | + if (node == FUTEX_NO_NODE) { |
| 359 | + /* |
| 360 | + * In case of !FLAGS_NUMA, use some unused hash bits to pick a |
| 361 | + * node -- this ensures regular futexes are interleaved across |
| 362 | + * the nodes and avoids having to allocate multiple |
| 363 | + * hash-tables. |
| 364 | + * |
| 365 | + * NOTE: this isn't perfectly uniform, but it is fast and |
| 366 | + * handles sparse node masks. |
| 367 | + */ |
| 368 | + node = (hash >> futex_hashshift) % nr_node_ids; |
| 369 | + if (!node_possible(node)) { |
| 370 | + node = find_next_bit_wrap(node_possible_map.bits, |
| 371 | + nr_node_ids, node); |
| 372 | + } |
| 373 | + } |
| 374 | + |
| 375 | + return &futex_queues[node][hash & futex_hashmask]; |
351 | 376 | }
|
352 | 377 |
|
353 | 378 | /**
|
@@ -454,25 +479,49 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
|
454 | 479 | struct page *page;
|
455 | 480 | struct folio *folio;
|
456 | 481 | struct address_space *mapping;
|
457 |
| - int err, ro = 0; |
| 482 | + int node, err, size, ro = 0; |
458 | 483 | bool fshared;
|
459 | 484 |
|
460 | 485 | fshared = flags & FLAGS_SHARED;
|
| 486 | + size = futex_size(flags); |
| 487 | + if (flags & FLAGS_NUMA) |
| 488 | + size *= 2; |
461 | 489 |
|
462 | 490 | /*
|
463 | 491 | * The futex address must be "naturally" aligned.
|
464 | 492 | */
|
465 | 493 | key->both.offset = address % PAGE_SIZE;
|
466 |
| - if (unlikely((address % sizeof(u32)) != 0)) |
| 494 | + if (unlikely((address % size) != 0)) |
467 | 495 | return -EINVAL;
|
468 | 496 | address -= key->both.offset;
|
469 | 497 |
|
470 |
| - if (unlikely(!access_ok(uaddr, sizeof(u32)))) |
| 498 | + if (unlikely(!access_ok(uaddr, size))) |
471 | 499 | return -EFAULT;
|
472 | 500 |
|
473 | 501 | if (unlikely(should_fail_futex(fshared)))
|
474 | 502 | return -EFAULT;
|
475 | 503 |
|
| 504 | + if (flags & FLAGS_NUMA) { |
| 505 | + u32 __user *naddr = (void *)uaddr + size / 2; |
| 506 | + |
| 507 | + if (futex_get_value(&node, naddr)) |
| 508 | + return -EFAULT; |
| 509 | + |
| 510 | + if (node == FUTEX_NO_NODE) { |
| 511 | + node = numa_node_id(); |
| 512 | + if (futex_put_value(node, naddr)) |
| 513 | + return -EFAULT; |
| 514 | + |
| 515 | + } else if (node >= MAX_NUMNODES || !node_possible(node)) { |
| 516 | + return -EINVAL; |
| 517 | + } |
| 518 | + |
| 519 | + key->both.node = node; |
| 520 | + |
| 521 | + } else { |
| 522 | + key->both.node = FUTEX_NO_NODE; |
| 523 | + } |
| 524 | + |
476 | 525 | /*
|
477 | 526 | * PROCESS_PRIVATE futexes are fast.
|
478 | 527 | * As the mm cannot disappear under us and the 'key' only needs
|
@@ -1642,24 +1691,41 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
|
1642 | 1691 | static int __init futex_init(void)
|
1643 | 1692 | {
|
1644 | 1693 | unsigned long hashsize, i;
|
1645 |
| - unsigned int futex_shift; |
| 1694 | + unsigned int order, n; |
| 1695 | + unsigned long size; |
1646 | 1696 |
|
1647 | 1697 | #ifdef CONFIG_BASE_SMALL
|
1648 | 1698 | hashsize = 16;
|
1649 | 1699 | #else
|
1650 |
| - hashsize = roundup_pow_of_two(256 * num_possible_cpus()); |
| 1700 | + hashsize = 256 * num_possible_cpus(); |
| 1701 | + hashsize /= num_possible_nodes(); |
| 1702 | + hashsize = max(4, hashsize); |
| 1703 | + hashsize = roundup_pow_of_two(hashsize); |
1651 | 1704 | #endif
|
| 1705 | + futex_hashshift = ilog2(hashsize); |
| 1706 | + size = sizeof(struct futex_hash_bucket) * hashsize; |
| 1707 | + order = get_order(size); |
1652 | 1708 |
|
1653 |
| - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), |
1654 |
| - hashsize, 0, 0, |
1655 |
| - &futex_shift, NULL, |
1656 |
| - hashsize, hashsize); |
1657 |
| - hashsize = 1UL << futex_shift; |
| 1709 | + for_each_node(n) { |
| 1710 | + struct futex_hash_bucket *table; |
1658 | 1711 |
|
1659 |
| - for (i = 0; i < hashsize; i++) |
1660 |
| - futex_hash_bucket_init(&futex_queues[i], NULL); |
| 1712 | + if (order > MAX_PAGE_ORDER) |
| 1713 | + table = vmalloc_huge_node(size, GFP_KERNEL, n); |
| 1714 | + else |
| 1715 | + table = alloc_pages_exact_nid(n, size, GFP_KERNEL); |
| 1716 | + |
| 1717 | + BUG_ON(!table); |
| 1718 | + |
| 1719 | + for (i = 0; i < hashsize; i++) |
| 1720 | + futex_hash_bucket_init(&table[i], NULL); |
| 1721 | + |
| 1722 | + futex_queues[n] = table; |
| 1723 | + } |
1661 | 1724 |
|
1662 | 1725 | futex_hashmask = hashsize - 1;
|
| 1726 | + pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", |
| 1727 | + hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, |
| 1728 | + order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); |
1663 | 1729 | return 0;
|
1664 | 1730 | }
|
1665 | 1731 | core_initcall(futex_init);
|
0 commit comments