Skip to content

Commit 311cd2f

Browse files
Alexandre Ghiti authored and Palmer Dabbelt committed
riscv: Fix set_memory_XX() and set_direct_map_XX() by splitting huge linear mappings
When STRICT_KERNEL_RWX is set, any change of permissions on any kernel mapping (vmalloc/modules/kernel text...etc) should be applied on its linear mapping alias. The problem is that the riscv kernel uses huge mappings for the linear mapping and walk_page_range_novma() does not split those huge mappings. So this patchset implements such split in order to apply fine-grained permissions on the linear mapping. Below is the difference before and after (the first PUD mapping is split into PTE/PMD mappings): Before: ---[ Linear mapping ]--- 0xffffaf8000080000-0xffffaf8000200000 0x0000000080080000 1536K PTE D A G . . W R V 0xffffaf8000200000-0xffffaf8077c00000 0x0000000080200000 1914M PMD D A G . . W R V 0xffffaf8077c00000-0xffffaf8078800000 0x00000000f7c00000 12M PMD D A G . . . R V 0xffffaf8078800000-0xffffaf8078c00000 0x00000000f8800000 4M PMD D A G . . W R V 0xffffaf8078c00000-0xffffaf8079200000 0x00000000f8c00000 6M PMD D A G . . . R V 0xffffaf8079200000-0xffffaf807e600000 0x00000000f9200000 84M PMD D A G . . W R V 0xffffaf807e600000-0xffffaf807e716000 0x00000000fe600000 1112K PTE D A G . . W R V 0xffffaf807e717000-0xffffaf807e71a000 0x00000000fe717000 12K PTE D A G . . W R V 0xffffaf807e71d000-0xffffaf807e71e000 0x00000000fe71d000 4K PTE D A G . . W R V 0xffffaf807e722000-0xffffaf807e800000 0x00000000fe722000 888K PTE D A G . . W R V 0xffffaf807e800000-0xffffaf807fe00000 0x00000000fe800000 22M PMD D A G . . W R V 0xffffaf807fe00000-0xffffaf807ff54000 0x00000000ffe00000 1360K PTE D A G . . W R V 0xffffaf807ff55000-0xffffaf8080000000 0x00000000fff55000 684K PTE D A G . . W R V 0xffffaf8080000000-0xffffaf8400000000 0x0000000100000000 14G PUD D A G . . W R V After: ---[ Linear mapping ]--- 0xffffaf8000080000-0xffffaf8000200000 0x0000000080080000 1536K PTE D A G . . W R V 0xffffaf8000200000-0xffffaf8077c00000 0x0000000080200000 1914M PMD D A G . . W R V 0xffffaf8077c00000-0xffffaf8078800000 0x00000000f7c00000 12M PMD D A G . . . 
R V 0xffffaf8078800000-0xffffaf8078a00000 0x00000000f8800000 2M PMD D A G . . W R V 0xffffaf8078a00000-0xffffaf8078c00000 0x00000000f8a00000 2M PTE D A G . . W R V 0xffffaf8078c00000-0xffffaf8079200000 0x00000000f8c00000 6M PMD D A G . . . R V 0xffffaf8079200000-0xffffaf807e600000 0x00000000f9200000 84M PMD D A G . . W R V 0xffffaf807e600000-0xffffaf807e716000 0x00000000fe600000 1112K PTE D A G . . W R V 0xffffaf807e717000-0xffffaf807e71a000 0x00000000fe717000 12K PTE D A G . . W R V 0xffffaf807e71d000-0xffffaf807e71e000 0x00000000fe71d000 4K PTE D A G . . W R V 0xffffaf807e722000-0xffffaf807e800000 0x00000000fe722000 888K PTE D A G . . W R V 0xffffaf807e800000-0xffffaf807fe00000 0x00000000fe800000 22M PMD D A G . . W R V 0xffffaf807fe00000-0xffffaf807ff54000 0x00000000ffe00000 1360K PTE D A G . . W R V 0xffffaf807ff55000-0xffffaf8080000000 0x00000000fff55000 684K PTE D A G . . W R V 0xffffaf8080000000-0xffffaf8080800000 0x0000000100000000 8M PMD D A G . . W R V 0xffffaf8080800000-0xffffaf8080af6000 0x0000000100800000 3032K PTE D A G . . W R V 0xffffaf8080af6000-0xffffaf8080af8000 0x0000000100af6000 8K PTE D A G . X . R V 0xffffaf8080af8000-0xffffaf8080c00000 0x0000000100af8000 1056K PTE D A G . . W R V 0xffffaf8080c00000-0xffffaf8081a00000 0x0000000100c00000 14M PMD D A G . . W R V 0xffffaf8081a00000-0xffffaf8081a40000 0x0000000101a00000 256K PTE D A G . . W R V 0xffffaf8081a40000-0xffffaf8081a44000 0x0000000101a40000 16K PTE D A G . X . R V 0xffffaf8081a44000-0xffffaf8081a52000 0x0000000101a44000 56K PTE D A G . . W R V 0xffffaf8081a52000-0xffffaf8081a54000 0x0000000101a52000 8K PTE D A G . X . R V ... 0xffffaf809e800000-0xffffaf80c0000000 0x000000011e800000 536M PMD D A G . . W R V 0xffffaf80c0000000-0xffffaf8400000000 0x0000000140000000 13G PUD D A G . . W R V Note that this also fixes memfd_secret() syscall which uses set_direct_map_invalid_noflush() and set_direct_map_default_noflush() to remove the pages from the linear mapping. 
Below is the kernel page table while a memfd_secret() syscall is running, you can see all the !valid page table entries in the linear mapping: ... 0xffffaf8082240000-0xffffaf8082241000 0x0000000102240000 4K PTE D A G . . W R . 0xffffaf8082241000-0xffffaf8082250000 0x0000000102241000 60K PTE D A G . . W R V 0xffffaf8082250000-0xffffaf8082252000 0x0000000102250000 8K PTE D A G . . W R . 0xffffaf8082252000-0xffffaf8082256000 0x0000000102252000 16K PTE D A G . . W R V 0xffffaf8082256000-0xffffaf8082257000 0x0000000102256000 4K PTE D A G . . W R . 0xffffaf8082257000-0xffffaf8082258000 0x0000000102257000 4K PTE D A G . . W R V 0xffffaf8082258000-0xffffaf8082259000 0x0000000102258000 4K PTE D A G . . W R . 0xffffaf8082259000-0xffffaf808225a000 0x0000000102259000 4K PTE D A G . . W R V 0xffffaf808225a000-0xffffaf808225c000 0x000000010225a000 8K PTE D A G . . W R . 0xffffaf808225c000-0xffffaf8082266000 0x000000010225c000 40K PTE D A G . . W R V 0xffffaf8082266000-0xffffaf8082268000 0x0000000102266000 8K PTE D A G . . W R . 0xffffaf8082268000-0xffffaf8082284000 0x0000000102268000 112K PTE D A G . . W R V 0xffffaf8082284000-0xffffaf8082288000 0x0000000102284000 16K PTE D A G . . W R . 0xffffaf8082288000-0xffffaf808229c000 0x0000000102288000 80K PTE D A G . . W R V 0xffffaf808229c000-0xffffaf80822a0000 0x000000010229c000 16K PTE D A G . . W R . 0xffffaf80822a0000-0xffffaf80822a5000 0x00000001022a0000 20K PTE D A G . . W R V 0xffffaf80822a5000-0xffffaf80822a6000 0x00000001022a5000 4K PTE D A G . . . R V 0xffffaf80822a6000-0xffffaf80822ab000 0x00000001022a6000 20K PTE D A G . . W R V ... And when the memfd_secret() fd is released, the linear mapping is correctly reset: ... 0xffffaf8082240000-0xffffaf80822a5000 0x0000000102240000 404K PTE D A G . . W R V 0xffffaf80822a5000-0xffffaf80822a6000 0x00000001022a5000 4K PTE D A G . . . R V 0xffffaf80822a6000-0xffffaf80822af000 0x00000001022a6000 36K PTE D A G . . W R V ... 
Signed-off-by: Alexandre Ghiti <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Palmer Dabbelt <[email protected]>
1 parent 629db01 commit 311cd2f

File tree

1 file changed

+230
-40
lines changed

1 file changed

+230
-40
lines changed

arch/riscv/mm/pageattr.c

Lines changed: 230 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include <linux/pagewalk.h>
77
#include <linux/pgtable.h>
8+
#include <linux/vmalloc.h>
89
#include <asm/tlbflush.h>
910
#include <asm/bitops.h>
1011
#include <asm/set_memory.h>
@@ -25,19 +26,6 @@ static unsigned long set_pageattr_masks(unsigned long val, struct mm_walk *walk)
2526
return new_val;
2627
}
2728

28-
static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr,
29-
unsigned long next, struct mm_walk *walk)
30-
{
31-
pgd_t val = READ_ONCE(*pgd);
32-
33-
if (pgd_leaf(val)) {
34-
val = __pgd(set_pageattr_masks(pgd_val(val), walk));
35-
set_pgd(pgd, val);
36-
}
37-
38-
return 0;
39-
}
40-
4129
static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr,
4230
unsigned long next, struct mm_walk *walk)
4331
{
@@ -96,7 +84,6 @@ static int pageattr_pte_hole(unsigned long addr, unsigned long next,
9684
}
9785

9886
static const struct mm_walk_ops pageattr_ops = {
99-
.pgd_entry = pageattr_pgd_entry,
10087
.p4d_entry = pageattr_p4d_entry,
10188
.pud_entry = pageattr_pud_entry,
10289
.pmd_entry = pageattr_pmd_entry,
@@ -105,12 +92,181 @@ static const struct mm_walk_ops pageattr_ops = {
10592
.walk_lock = PGWALK_RDLOCK,
10693
};
10794

95+
#ifdef CONFIG_64BIT
96+
/*
 * Split any leaf PMD that only partially overlaps [vaddr; end) into a
 * freshly allocated PTE level carrying the same protections.
 * Returns 0 on success, -ENOMEM if a page-table page cannot be allocated.
 */
static int __split_linear_mapping_pmd(pud_t *pudp,
				      unsigned long vaddr, unsigned long end)
{
	unsigned long next;
	pmd_t *pmdp = pmd_offset(pudp, vaddr);

	do {
		next = pmd_addr_end(vaddr, end);

		/*
		 * [vaddr; end] fully covers this PMD region: no need to
		 * split, the permission change applies to the whole leaf.
		 */
		if (next - vaddr >= PMD_SIZE &&
		    vaddr == (vaddr & PMD_MASK) && end >= next)
			continue;

		if (pmd_leaf(*pmdp)) {
			struct page *page;
			unsigned long pfn = _pmd_pfn(*pmdp);
			pgprot_t prot = __pgprot(pmd_val(*pmdp) & ~_PAGE_PFN_MASK);
			pte_t *ptep;
			int i;

			page = alloc_page(GFP_KERNEL);
			if (!page)
				return -ENOMEM;

			/*
			 * Fill the new pte level with leaf ptes that have the
			 * same protections as the leaf pmd.
			 */
			ptep = (pte_t *)page_address(page);
			for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep)
				set_pte(ptep, pfn_pte(pfn + i, prot));

			/*
			 * Make sure the pte stores are visible before the pmd
			 * store, so no one sees a partially filled pte level.
			 */
			smp_wmb();

			set_pmd(pmdp, pfn_pmd(page_to_pfn(page), PAGE_TABLE));
		}
	} while (pmdp++, vaddr = next, vaddr != end);

	return 0;
}
134+
135+
/*
 * Split any leaf PUD that only partially overlaps [vaddr; end) into a
 * freshly allocated PMD level carrying the same protections, then recurse
 * into the PMD level. Returns 0 on success or a negative errno.
 */
static int __split_linear_mapping_pud(p4d_t *p4dp,
				      unsigned long vaddr, unsigned long end)
{
	unsigned long next;
	int err;
	pud_t *pudp = pud_offset(p4dp, vaddr);

	do {
		next = pud_addr_end(vaddr, end);

		/*
		 * The whole PUD region is covered by [vaddr; end]: keep the
		 * huge mapping, the protections will be changed in one go.
		 */
		if (next - vaddr >= PUD_SIZE &&
		    vaddr == (vaddr & PUD_MASK) && end >= next)
			continue;

		if (pud_leaf(*pudp)) {
			struct page *page;
			unsigned long pfn = _pud_pfn(*pudp);
			pgprot_t prot = __pgprot(pud_val(*pudp) & ~_PAGE_PFN_MASK);
			pmd_t *pmdp;
			int i;

			page = alloc_page(GFP_KERNEL);
			if (!page)
				return -ENOMEM;

			/*
			 * Fill the new pmd level with leaf pmds that have the
			 * same protections as the leaf pud.
			 */
			pmdp = (pmd_t *)page_address(page);
			for (i = 0; i < PTRS_PER_PMD; ++i, ++pmdp)
				set_pmd(pmdp,
					pfn_pmd(pfn + ((i * PMD_SIZE) >> PAGE_SHIFT), prot));

			/*
			 * Make sure the pmd stores are visible before the pud
			 * store, so no one sees a partially filled pmd level.
			 */
			smp_wmb();

			set_pud(pudp, pfn_pud(page_to_pfn(page), PAGE_TABLE));
		}

		err = __split_linear_mapping_pmd(pudp, vaddr, next);
		if (err)
			return err;
	} while (pudp++, vaddr = next, vaddr != end);

	return 0;
}
179+
180+
/*
 * Split any leaf P4D that only partially overlaps [vaddr; end) into a
 * freshly allocated PUD level carrying the same protections, then recurse
 * into the PUD level. Returns 0 on success or a negative errno.
 */
static int __split_linear_mapping_p4d(pgd_t *pgdp,
				      unsigned long vaddr, unsigned long end)
{
	unsigned long next;
	int err;
	p4d_t *p4dp = p4d_offset(pgdp, vaddr);

	do {
		next = p4d_addr_end(vaddr, end);

		/*
		 * If [vaddr; end] contains [vaddr & P4D_MASK; next], we don't
		 * need to split: we'll change the protections on the whole
		 * P4D.
		 */
		if (next - vaddr >= P4D_SIZE &&
		    vaddr == (vaddr & P4D_MASK) && end >= next)
			continue;

		if (p4d_leaf(*p4dp)) {
			struct page *page;
			unsigned long pfn = _p4d_pfn(*p4dp);
			pgprot_t prot = __pgprot(p4d_val(*p4dp) & ~_PAGE_PFN_MASK);
			pud_t *pudp;
			int i;

			page = alloc_page(GFP_KERNEL);
			if (!page)
				return -ENOMEM;

			/*
			 * Fill the pud level with leaf puds that have the same
			 * protections as the leaf p4d.
			 */
			pudp = (pud_t *)page_address(page);
			for (i = 0; i < PTRS_PER_PUD; ++i, ++pudp)
				set_pud(pudp,
					pfn_pud(pfn + ((i * PUD_SIZE) >> PAGE_SHIFT), prot));

			/*
			 * Make sure the pud filling is not reordered with the
			 * p4d store which could result in seeing a partially
			 * filled pud level.
			 */
			smp_wmb();

			set_p4d(p4dp, pfn_p4d(page_to_pfn(page), PAGE_TABLE));
		}

		err = __split_linear_mapping_pud(p4dp, vaddr, next);
		if (err)
			return err;
	} while (p4dp++, vaddr = next, vaddr != end);

	return 0;
}
237+
238+
/*
 * Walk the PGD entries covering [vaddr; end) and descend into the P4D
 * level. Returns 0 on success or a negative errno from a lower level.
 */
static int __split_linear_mapping_pgd(pgd_t *pgdp,
				      unsigned long vaddr,
				      unsigned long end)
{
	unsigned long next;
	int err;

	do {
		next = pgd_addr_end(vaddr, end);
		/* We never use PGD mappings for the linear mapping */
		err = __split_linear_mapping_p4d(pgdp, vaddr, next);
		if (err)
			return err;
	} while (pgdp++, vaddr = next, vaddr != end);

	return 0;
}
255+
256+
static int split_linear_mapping(unsigned long start, unsigned long end)
257+
{
258+
return __split_linear_mapping_pgd(pgd_offset_k(start), start, end);
259+
}
260+
#endif /* CONFIG_64BIT */
261+
108262
static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
109263
pgprot_t clear_mask)
110264
{
111265
int ret;
112266
unsigned long start = addr;
113267
unsigned long end = start + PAGE_SIZE * numpages;
268+
unsigned long __maybe_unused lm_start;
269+
unsigned long __maybe_unused lm_end;
114270
struct pageattr_masks masks = {
115271
.set_mask = set_mask,
116272
.clear_mask = clear_mask
@@ -120,11 +276,67 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
120276
return 0;
121277

122278
mmap_write_lock(&init_mm);
279+
280+
#ifdef CONFIG_64BIT
281+
/*
282+
* We are about to change the permissions of a kernel mapping, we must
283+
* apply the same changes to its linear mapping alias, which may imply
284+
* splitting a huge mapping.
285+
*/
286+
287+
if (is_vmalloc_or_module_addr((void *)start)) {
288+
struct vm_struct *area = NULL;
289+
int i, page_start;
290+
291+
area = find_vm_area((void *)start);
292+
page_start = (start - (unsigned long)area->addr) >> PAGE_SHIFT;
293+
294+
for (i = page_start; i < page_start + numpages; ++i) {
295+
lm_start = (unsigned long)page_address(area->pages[i]);
296+
lm_end = lm_start + PAGE_SIZE;
297+
298+
ret = split_linear_mapping(lm_start, lm_end);
299+
if (ret)
300+
goto unlock;
301+
302+
ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
303+
&pageattr_ops, NULL, &masks);
304+
if (ret)
305+
goto unlock;
306+
}
307+
} else if (is_kernel_mapping(start) || is_linear_mapping(start)) {
308+
lm_start = (unsigned long)lm_alias(start);
309+
lm_end = (unsigned long)lm_alias(end);
310+
311+
ret = split_linear_mapping(lm_start, lm_end);
312+
if (ret)
313+
goto unlock;
314+
315+
ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
316+
&pageattr_ops, NULL, &masks);
317+
if (ret)
318+
goto unlock;
319+
}
320+
123321
ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
124322
&masks);
323+
324+
unlock:
325+
mmap_write_unlock(&init_mm);
326+
327+
/*
328+
* We can't use flush_tlb_kernel_range() here as we may have split a
329+
* hugepage that is larger than that, so let's flush everything.
330+
*/
331+
flush_tlb_all();
332+
#else
333+
ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
334+
&masks);
335+
125336
mmap_write_unlock(&init_mm);
126337

127338
flush_tlb_kernel_range(start, end);
339+
#endif
128340

129341
return ret;
130342
}
@@ -159,36 +371,14 @@ int set_memory_nx(unsigned long addr, int numpages)
159371

160372
int set_direct_map_invalid_noflush(struct page *page)
161373
{
162-
int ret;
163-
unsigned long start = (unsigned long)page_address(page);
164-
unsigned long end = start + PAGE_SIZE;
165-
struct pageattr_masks masks = {
166-
.set_mask = __pgprot(0),
167-
.clear_mask = __pgprot(_PAGE_PRESENT)
168-
};
169-
170-
mmap_read_lock(&init_mm);
171-
ret = walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
172-
mmap_read_unlock(&init_mm);
173-
174-
return ret;
374+
return __set_memory((unsigned long)page_address(page), 1,
375+
__pgprot(0), __pgprot(_PAGE_PRESENT));
175376
}
176377

177378
int set_direct_map_default_noflush(struct page *page)
178379
{
179-
int ret;
180-
unsigned long start = (unsigned long)page_address(page);
181-
unsigned long end = start + PAGE_SIZE;
182-
struct pageattr_masks masks = {
183-
.set_mask = PAGE_KERNEL,
184-
.clear_mask = __pgprot(0)
185-
};
186-
187-
mmap_read_lock(&init_mm);
188-
ret = walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
189-
mmap_read_unlock(&init_mm);
190-
191-
return ret;
380+
return __set_memory((unsigned long)page_address(page), 1,
381+
PAGE_KERNEL, __pgprot(0));
192382
}
193383

194384
#ifdef CONFIG_DEBUG_PAGEALLOC

0 commit comments

Comments
 (0)