Skip to content

Commit 859e63b

Browse files
kiryl authored and bp3tk0v committed
x86/tdx: Convert shared memory back to private on kexec
TDX guests allocate shared buffers to perform I/O. It is done by allocating pages normally from the buddy allocator and converting them to shared with set_memory_decrypted(). The second, kexec-ed kernel has no idea what memory is converted this way. It only sees E820_TYPE_RAM. Accessing shared memory via private mapping is fatal. It leads to unrecoverable TD exit. On kexec, walk direct mapping and convert all shared memory back to private. It makes all RAM private again and second kernel may use it normally. The conversion occurs in two steps: stopping new conversions and unsharing all memory. In the case of normal kexec, the stopping of conversions takes place while scheduling is still functioning. This allows for waiting until any ongoing conversions are finished. The second step is carried out when all CPUs except one are inactive and interrupts are disabled. This prevents any conflicts with code that may access shared memory. Signed-off-by: Kirill A. Shutemov <[email protected]> Signed-off-by: Borislav Petkov (AMD) <[email protected]> Reviewed-by: Rick Edgecombe <[email protected]> Reviewed-by: Kai Huang <[email protected]> Tested-by: Tao Liu <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 22daa42 commit 859e63b

File tree

4 files changed

+141
-3
lines changed

4 files changed

+141
-3
lines changed

arch/x86/coco/tdx/tdx.c

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,15 @@
77
#include <linux/cpufeature.h>
88
#include <linux/export.h>
99
#include <linux/io.h>
10+
#include <linux/kexec.h>
1011
#include <asm/coco.h>
1112
#include <asm/tdx.h>
1213
#include <asm/vmx.h>
1314
#include <asm/ia32.h>
1415
#include <asm/insn.h>
1516
#include <asm/insn-eval.h>
1617
#include <asm/pgtable.h>
18+
#include <asm/set_memory.h>
1719

1820
/* MMIO direction */
1921
#define EPT_READ 0
@@ -831,6 +833,95 @@ static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
831833
return 0;
832834
}
833835

836+
/* Stop new private<->shared conversions */
837+
static void tdx_kexec_begin(void)
838+
{
839+
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
840+
return;
841+
842+
/*
843+
* Crash kernel reaches here with interrupts disabled: can't wait for
844+
* conversions to finish.
845+
*
846+
* If race happened, just report and proceed.
847+
*/
848+
if (!set_memory_enc_stop_conversion())
849+
pr_warn("Failed to stop shared<->private conversions\n");
850+
}
851+
852+
/* Walk direct mapping and convert all shared memory back to private */
853+
static void tdx_kexec_finish(void)
854+
{
855+
unsigned long addr, end;
856+
long found = 0, shared;
857+
858+
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
859+
return;
860+
861+
lockdep_assert_irqs_disabled();
862+
863+
addr = PAGE_OFFSET;
864+
end = PAGE_OFFSET + get_max_mapped();
865+
866+
while (addr < end) {
867+
unsigned long size;
868+
unsigned int level;
869+
pte_t *pte;
870+
871+
pte = lookup_address(addr, &level);
872+
size = page_level_size(level);
873+
874+
if (pte && pte_decrypted(*pte)) {
875+
int pages = size / PAGE_SIZE;
876+
877+
/*
878+
* Touching memory with shared bit set triggers implicit
879+
* conversion to shared.
880+
*
881+
* Make sure nobody touches the shared range from
882+
* now on.
883+
*/
884+
set_pte(pte, __pte(0));
885+
886+
/*
887+
* Memory encryption state persists across kexec.
888+
* If tdx_enc_status_changed() fails in the first
889+
* kernel, it leaves memory in an unknown state.
890+
*
891+
* If that memory remains shared, accessing it in the
892+
* *next* kernel through a private mapping will result
893+
* in an unrecoverable guest shutdown.
894+
*
895+
* The kdump kernel boot is not impacted as it uses
896+
* a pre-reserved memory range that is always private.
897+
* However, gathering crash information could lead to
898+
* a crash if it accesses unconverted memory through
899+
* a private mapping which is possible when accessing
900+
* that memory through /proc/vmcore, for example.
901+
*
902+
* In all cases, print error info in order to leave
903+
* enough bread crumbs for debugging.
904+
*/
905+
if (!tdx_enc_status_changed(addr, pages, true)) {
906+
pr_err("Failed to unshare range %#lx-%#lx\n",
907+
addr, addr + size);
908+
}
909+
910+
found += pages;
911+
}
912+
913+
addr += size;
914+
}
915+
916+
__flush_tlb_all();
917+
918+
shared = atomic_long_read(&nr_shared);
919+
if (shared != found) {
920+
pr_err("shared page accounting is off\n");
921+
pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
922+
}
923+
}
924+
834925
void __init tdx_early_init(void)
835926
{
836927
struct tdx_module_args args = {
@@ -890,6 +981,9 @@ void __init tdx_early_init(void)
890981
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
891982
x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
892983

984+
x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
985+
x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
986+
893987
/*
894988
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
895989
* bringup low level code. That raises #VE which cannot be handled

arch/x86/include/asm/pgtable.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ static inline int pte_young(pte_t pte)
140140
return pte_flags(pte) & _PAGE_ACCESSED;
141141
}
142142

143+
/*
 * Report whether @pte already carries the decrypted (shared) attribute:
 * true when applying cc_mkdec() to its value leaves the value unchanged.
 */
static inline bool pte_decrypted(pte_t pte)
{
	if (cc_mkdec(pte_val(pte)) != pte_val(pte))
		return false;

	return true;
}
147+
143148
#define pmd_dirty pmd_dirty
144149
static inline bool pmd_dirty(pmd_t pmd)
145150
{

arch/x86/include/asm/set_memory.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,11 @@ int set_memory_wb(unsigned long addr, int numpages);
4949
int set_memory_np(unsigned long addr, int numpages);
5050
int set_memory_p(unsigned long addr, int numpages);
5151
int set_memory_4k(unsigned long addr, int numpages);
52+
53+
bool set_memory_enc_stop_conversion(void);
5254
int set_memory_encrypted(unsigned long addr, int numpages);
5355
int set_memory_decrypted(unsigned long addr, int numpages);
56+
5457
int set_memory_np_noalias(unsigned long addr, int numpages);
5558
int set_memory_nonglobal(unsigned long addr, int numpages);
5659
int set_memory_global(unsigned long addr, int numpages);

arch/x86/mm/pat/set_memory.c

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2227,12 +2227,48 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
22272227
return ret;
22282228
}
22292229

2230+
/*
2231+
* The lock serializes conversions between private and shared memory.
2232+
*
2233+
* It is taken for read on conversion. A write lock guarantees that no
2234+
* concurrent conversions are in progress.
2235+
*/
2236+
static DECLARE_RWSEM(mem_enc_lock);
2237+
2238+
/*
2239+
* Stop new private<->shared conversions.
2240+
*
2241+
* Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
2242+
* The lock is not released to prevent new conversions from being started.
2243+
*/
2244+
bool set_memory_enc_stop_conversion(void)
2245+
{
2246+
/*
2247+
* In a crash scenario, sleep is not allowed. Try to take the lock.
2248+
* Failure indicates that there is a race with the conversion.
2249+
*/
2250+
if (oops_in_progress)
2251+
return down_write_trylock(&mem_enc_lock);
2252+
2253+
down_write(&mem_enc_lock);
2254+
2255+
return true;
2256+
}
2257+
22302258
/*
 * Diff view of __set_memory_enc_dec(): lines following a "NNNN-" marker are
 * the removed body, lines following a "NNNN+" marker are its replacement.
 *
 * New behaviour: when cc_platform_has(CC_ATTR_MEM_ENCRYPT) is false, return 0
 * without doing anything.  Otherwise take mem_enc_lock for read without
 * blocking, run __set_memory_enc_pgtable(addr, numpages, enc), drop the lock
 * and return that call's result.  Returns -EBUSY if the read lock cannot be
 * taken.
 */
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
22312259
{
2232-
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
2233-
return __set_memory_enc_pgtable(addr, numpages, enc);
2260+
int ret = 0;
22342261

2235-
return 0;
2262+
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
2263+
/*
 * The trylock fails while the write side is held, e.g. once
 * set_memory_enc_stop_conversion() has taken mem_enc_lock for good.
 */
if (!down_read_trylock(&mem_enc_lock))
2264+
return -EBUSY;
2265+
2266+
ret = __set_memory_enc_pgtable(addr, numpages, enc);
2267+
2268+
up_read(&mem_enc_lock);
2269+
}
2270+
2271+
return ret;
22362272
}
22372273

22382274
int set_memory_encrypted(unsigned long addr, int numpages)

0 commit comments

Comments
 (0)