Skip to content

Commit 1c134b1

Browse files
committed
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar: "The main changes in this cycle were: - A PAT series from Davidlohr Bueso, which simplifies the memtype rbtree by using the interval tree helpers. (There's more cleanups in this area queued up, but they didn't make the merge window.) - Also flip over CONFIG_X86_5LEVEL to default-y. This might draw in a few more testers, as all the major distros are going to have 5-level paging enabled by default in their next iterations. - Misc cleanups" * 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mm/pat: Rename pat_rbtree.c to pat_interval.c x86/mm/pat: Drop the rbt_ prefix from external memtype calls x86/mm/pat: Do not pass 'rb_root' down the memtype tree helper functions x86/mm/pat: Convert the PAT tree to a generic interval tree x86/mm: Clean up the pmd_read_atomic() comments x86/mm: Fix function name typo in pmd_read_atomic() comment x86/cpu: Clean up intel_tlb_table[] x86/mm: Enable 5-level paging support by default
2 parents 24ee25a + 7f264da commit 1c134b1

File tree

8 files changed

+229
-309
lines changed

8 files changed

+229
-309
lines changed

arch/x86/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1462,6 +1462,7 @@ config X86_PAE
14621462

14631463
config X86_5LEVEL
14641464
bool "Enable 5-level page tables support"
1465+
default y
14651466
select DYNAMIC_MEMORY_LAYOUT
14661467
select SPARSEMEM_VMEMMAP
14671468
depends on X86_64

arch/x86/include/asm/pgtable-3level.h

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -36,39 +36,41 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
3636

3737
#define pmd_read_atomic pmd_read_atomic
3838
/*
39-
* pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
40-
* a "*pmdp" dereference done by gcc. Problem is, in certain places
41-
* where pte_offset_map_lock is called, concurrent page faults are
39+
* pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
40+
* a "*pmdp" dereference done by GCC. Problem is, in certain places
41+
* where pte_offset_map_lock() is called, concurrent page faults are
4242
* allowed, if the mmap_sem is hold for reading. An example is mincore
4343
* vs page faults vs MADV_DONTNEED. On the page fault side
44-
* pmd_populate rightfully does a set_64bit, but if we're reading the
44+
* pmd_populate() rightfully does a set_64bit(), but if we're reading the
4545
* pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
46-
* because gcc will not read the 64bit of the pmd atomically. To fix
47-
* this all places running pmd_offset_map_lock() while holding the
46+
* because GCC will not read the 64-bit value of the pmd atomically.
47+
*
48+
* To fix this all places running pte_offset_map_lock() while holding the
4849
* mmap_sem in read mode, shall read the pmdp pointer using this
49-
* function to know if the pmd is null nor not, and in turn to know if
50-
* they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
50+
* function to know if the pmd is null or not, and in turn to know if
51+
* they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
5152
* operations.
5253
*
53-
* Without THP if the mmap_sem is hold for reading, the pmd can only
54-
* transition from null to not null while pmd_read_atomic runs. So
54+
* Without THP if the mmap_sem is held for reading, the pmd can only
55+
* transition from null to not null while pmd_read_atomic() runs. So
5556
* we can always return atomic pmd values with this function.
5657
*
57-
* With THP if the mmap_sem is hold for reading, the pmd can become
58+
* With THP if the mmap_sem is held for reading, the pmd can become
5859
* trans_huge or none or point to a pte (and in turn become "stable")
59-
* at any time under pmd_read_atomic. We could read it really
60-
* atomically here with a atomic64_read for the THP enabled case (and
60+
* at any time under pmd_read_atomic(). We could read it truly
61+
* atomically here with an atomic64_read() for the THP enabled case (and
6162
* it would be a whole lot simpler), but to avoid using cmpxchg8b we
6263
* only return an atomic pmdval if the low part of the pmdval is later
63-
* found stable (i.e. pointing to a pte). And we're returning a none
64-
* pmdval if the low part of the pmd is none. In some cases the high
65-
* and low part of the pmdval returned may not be consistent if THP is
66-
* enabled (the low part may point to previously mapped hugepage,
67-
* while the high part may point to a more recently mapped hugepage),
68-
* but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
69-
* of the pmd to be read atomically to decide if the pmd is unstable
70-
* or not, with the only exception of when the low part of the pmd is
71-
* zero in which case we return a none pmd.
64+
* found to be stable (i.e. pointing to a pte). We are also returning a
65+
* 'none' (zero) pmdval if the low part of the pmd is zero.
66+
*
67+
* In some cases the high and low part of the pmdval returned may not be
68+
* consistent if THP is enabled (the low part may point to previously
69+
* mapped hugepage, while the high part may point to a more recently
70+
* mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
71+
* needs the low part of the pmd to be read atomically to decide if the
72+
* pmd is unstable or not, with the only exception when the low part
73+
* of the pmd is zero, in which case we return a 'none' pmd.
7274
*/
7375
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
7476
{

arch/x86/kernel/cpu/intel.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -819,7 +819,7 @@ static const struct _tlb_table intel_tlb_table[] = {
819819
{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
820820
{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
821821
{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
822-
{ 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
822+
{ 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
823823
{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
824824
{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
825825
{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
@@ -847,7 +847,7 @@ static const struct _tlb_table intel_tlb_table[] = {
847847
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
848848
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
849849
{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
850-
{ 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
850+
{ 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
851851
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
852852
{ 0x00, 0, 0 }
853853
};
@@ -859,8 +859,8 @@ static void intel_tlb_lookup(const unsigned char desc)
859859
return;
860860

861861
/* look up this descriptor in the table */
862-
for (k = 0; intel_tlb_table[k].descriptor != desc && \
863-
intel_tlb_table[k].descriptor != 0; k++)
862+
for (k = 0; intel_tlb_table[k].descriptor != desc &&
863+
intel_tlb_table[k].descriptor != 0; k++)
864864
;
865865

866866
if (intel_tlb_table[k].tlb_type == 0)

arch/x86/mm/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ CFLAGS_mem_encrypt_identity.o := $(nostackp)
2323

2424
CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
2525

26-
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
26+
obj-$(CONFIG_X86_PAT) += pat_interval.o
2727

2828
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
2929

arch/x86/mm/pat.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -603,7 +603,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
603603

604604
spin_lock(&memtype_lock);
605605

606-
err = rbt_memtype_check_insert(new, new_type);
606+
err = memtype_check_insert(new, new_type);
607607
if (err) {
608608
pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
609609
start, end - 1,
@@ -650,7 +650,7 @@ int free_memtype(u64 start, u64 end)
650650
}
651651

652652
spin_lock(&memtype_lock);
653-
entry = rbt_memtype_erase(start, end);
653+
entry = memtype_erase(start, end);
654654
spin_unlock(&memtype_lock);
655655

656656
if (IS_ERR(entry)) {
@@ -693,7 +693,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr)
693693

694694
spin_lock(&memtype_lock);
695695

696-
entry = rbt_memtype_lookup(paddr);
696+
entry = memtype_lookup(paddr);
697697
if (entry != NULL)
698698
rettype = entry->type;
699699
else
@@ -1109,7 +1109,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
11091109
return NULL;
11101110

11111111
spin_lock(&memtype_lock);
1112-
ret = rbt_memtype_copy_nth_element(print_entry, pos);
1112+
ret = memtype_copy_nth_element(print_entry, pos);
11131113
spin_unlock(&memtype_lock);
11141114

11151115
if (!ret) {

arch/x86/mm/pat_internal.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,20 @@ static inline char *cattr_name(enum page_cache_mode pcm)
2929
}
3030

3131
#ifdef CONFIG_X86_PAT
32-
extern int rbt_memtype_check_insert(struct memtype *new,
33-
enum page_cache_mode *new_type);
34-
extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
35-
extern struct memtype *rbt_memtype_lookup(u64 addr);
36-
extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
32+
extern int memtype_check_insert(struct memtype *new,
33+
enum page_cache_mode *new_type);
34+
extern struct memtype *memtype_erase(u64 start, u64 end);
35+
extern struct memtype *memtype_lookup(u64 addr);
36+
extern int memtype_copy_nth_element(struct memtype *out, loff_t pos);
3737
#else
38-
static inline int rbt_memtype_check_insert(struct memtype *new,
39-
enum page_cache_mode *new_type)
38+
static inline int memtype_check_insert(struct memtype *new,
39+
enum page_cache_mode *new_type)
4040
{ return 0; }
41-
static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
41+
static inline struct memtype *memtype_erase(u64 start, u64 end)
4242
{ return NULL; }
43-
static inline struct memtype *rbt_memtype_lookup(u64 addr)
43+
static inline struct memtype *memtype_lookup(u64 addr)
4444
{ return NULL; }
45-
static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
45+
static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos)
4646
{ return 0; }
4747
#endif
4848

arch/x86/mm/pat_interval.c

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Handle caching attributes in page tables (PAT)
4+
*
5+
* Authors: Venkatesh Pallipadi <[email protected]>
6+
* Suresh B Siddha <[email protected]>
7+
*
8+
* Interval tree used to store the PAT memory type reservations.
9+
*/
10+
11+
#include <linux/seq_file.h>
12+
#include <linux/debugfs.h>
13+
#include <linux/kernel.h>
14+
#include <linux/interval_tree_generic.h>
15+
#include <linux/sched.h>
16+
#include <linux/gfp.h>
17+
18+
#include <asm/pgtable.h>
19+
#include <asm/pat.h>
20+
21+
#include "pat_internal.h"
22+
23+
/*
24+
* The memtype tree keeps track of memory type for specific
25+
* physical memory areas. Without proper tracking, conflicting memory
26+
* types in different mappings can cause CPU cache corruption.
27+
*
28+
* The tree is an interval tree (augmented rbtree) with tree ordered
29+
* on starting address. Tree can contain multiple entries for
30+
* different regions which overlap. All the aliases have the same
31+
* cache attributes of course.
32+
*
33+
* memtype_lock protects the rbtree.
34+
*/
35+
static inline u64 memtype_interval_start(struct memtype *memtype)
36+
{
37+
return memtype->start;
38+
}
39+
40+
static inline u64 memtype_interval_end(struct memtype *memtype)
41+
{
42+
return memtype->end - 1;
43+
}
44+
INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
45+
memtype_interval_start, memtype_interval_end,
46+
static, memtype_interval)
47+
48+
static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
49+
50+
enum {
51+
MEMTYPE_EXACT_MATCH = 0,
52+
MEMTYPE_END_MATCH = 1
53+
};
54+
55+
static struct memtype *memtype_match(u64 start, u64 end, int match_type)
56+
{
57+
struct memtype *match;
58+
59+
match = memtype_interval_iter_first(&memtype_rbroot, start, end);
60+
while (match != NULL && match->start < end) {
61+
if ((match_type == MEMTYPE_EXACT_MATCH) &&
62+
(match->start == start) && (match->end == end))
63+
return match;
64+
65+
if ((match_type == MEMTYPE_END_MATCH) &&
66+
(match->start < start) && (match->end == end))
67+
return match;
68+
69+
match = memtype_interval_iter_next(match, start, end);
70+
}
71+
72+
return NULL; /* Returns NULL if there is no match */
73+
}
74+
75+
static int memtype_check_conflict(u64 start, u64 end,
76+
enum page_cache_mode reqtype,
77+
enum page_cache_mode *newtype)
78+
{
79+
struct memtype *match;
80+
enum page_cache_mode found_type = reqtype;
81+
82+
match = memtype_interval_iter_first(&memtype_rbroot, start, end);
83+
if (match == NULL)
84+
goto success;
85+
86+
if (match->type != found_type && newtype == NULL)
87+
goto failure;
88+
89+
dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
90+
found_type = match->type;
91+
92+
match = memtype_interval_iter_next(match, start, end);
93+
while (match) {
94+
if (match->type != found_type)
95+
goto failure;
96+
97+
match = memtype_interval_iter_next(match, start, end);
98+
}
99+
success:
100+
if (newtype)
101+
*newtype = found_type;
102+
103+
return 0;
104+
105+
failure:
106+
pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
107+
current->comm, current->pid, start, end,
108+
cattr_name(found_type), cattr_name(match->type));
109+
return -EBUSY;
110+
}
111+
112+
int memtype_check_insert(struct memtype *new,
113+
enum page_cache_mode *ret_type)
114+
{
115+
int err = 0;
116+
117+
err = memtype_check_conflict(new->start, new->end, new->type, ret_type);
118+
if (err)
119+
return err;
120+
121+
if (ret_type)
122+
new->type = *ret_type;
123+
124+
memtype_interval_insert(new, &memtype_rbroot);
125+
return 0;
126+
}
127+
128+
struct memtype *memtype_erase(u64 start, u64 end)
129+
{
130+
struct memtype *data;
131+
132+
/*
133+
* Since the memtype_rbroot tree allows overlapping ranges,
134+
* memtype_erase() checks with EXACT_MATCH first, i.e. free
135+
* a whole node for the munmap case. If no such entry is found,
136+
* it then checks with END_MATCH, i.e. shrink the size of a node
137+
* from the end for the mremap case.
138+
*/
139+
data = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
140+
if (!data) {
141+
data = memtype_match(start, end, MEMTYPE_END_MATCH);
142+
if (!data)
143+
return ERR_PTR(-EINVAL);
144+
}
145+
146+
if (data->start == start) {
147+
/* munmap: erase this node */
148+
memtype_interval_remove(data, &memtype_rbroot);
149+
} else {
150+
/* mremap: update the end value of this node */
151+
memtype_interval_remove(data, &memtype_rbroot);
152+
data->end = start;
153+
memtype_interval_insert(data, &memtype_rbroot);
154+
return NULL;
155+
}
156+
157+
return data;
158+
}
159+
160+
struct memtype *memtype_lookup(u64 addr)
161+
{
162+
return memtype_interval_iter_first(&memtype_rbroot, addr,
163+
addr + PAGE_SIZE);
164+
}
165+
166+
#if defined(CONFIG_DEBUG_FS)
167+
int memtype_copy_nth_element(struct memtype *out, loff_t pos)
168+
{
169+
struct memtype *match;
170+
int i = 1;
171+
172+
match = memtype_interval_iter_first(&memtype_rbroot, 0, ULONG_MAX);
173+
while (match && pos != i) {
174+
match = memtype_interval_iter_next(match, 0, ULONG_MAX);
175+
i++;
176+
}
177+
178+
if (match) { /* pos == i */
179+
*out = *match;
180+
return 0;
181+
} else {
182+
return 1;
183+
}
184+
}
185+
#endif

0 commit comments

Comments (0)