Skip to content

Commit 7ad4d1f

Browse files
davidhildenbrand authored and mstsirkin committed
fs/proc/vmcore: introduce PROC_VMCORE_DEVICE_RAM to detect device RAM ranges in 2nd kernel
s390 allocates+prepares the elfcore hdr in the dump (2nd) kernel, not in the crashed kernel. RAM provided by memory devices such as virtio-mem can only be detected using the device driver; when vmcore_init() is called, these device drivers are usually not loaded yet, or the devices did not get probed yet. Consequently, on s390 these RAM ranges will not be included in the crash dump, which makes the dump partially corrupt and is unfortunate. Instead of deferring the vmcore_init() call, to an (unclear?) later point, let's reuse the vmcore_cb infrastructure to obtain device RAM ranges as the device drivers probe the device and get access to this information. Then, we'll add these ranges to the vmcore, adding more PT_LOAD entries and updating the offsets+vmcore size. Use a separate Kconfig option to be set by an architecture to include this code only if the arch really needs it. Further, we'll make the config depend on the relevant drivers (i.e., virtio_mem) once they implement support (next). The alternative of having a PROVIDE_PROC_VMCORE_DEVICE_RAM config option was dropped for now for simplicity. The current target use case is s390, which only creates an elf64 elfcore, so focusing on elf64 is sufficient. Signed-off-by: David Hildenbrand <[email protected]> Message-Id: <[email protected]> Acked-by: Andrew Morton <[email protected]> Signed-off-by: Michael S. Tsirkin <[email protected]>
1 parent e29e9ac commit 7ad4d1f

File tree

3 files changed

+183
-0
lines changed

3 files changed

+183
-0
lines changed

fs/proc/Kconfig

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,24 @@ config PROC_VMCORE_DEVICE_DUMP
6161
as ELF notes to /proc/vmcore. You can still disable device
6262
dump using the kernel command line option 'novmcoredd'.
6363

64+
config NEED_PROC_VMCORE_DEVICE_RAM
65+
bool
66+
67+
config PROC_VMCORE_DEVICE_RAM
68+
def_bool y
69+
depends on PROC_VMCORE && NEED_PROC_VMCORE_DEVICE_RAM
70+
help
71+
If the elfcore hdr is allocated and prepared by the dump kernel
72+
("2nd kernel") instead of the crashed kernel, RAM provided by memory
73+
devices such as virtio-mem will not be included in the dump
74+
image, because only the device driver can properly detect them.
75+
76+
With this config enabled, these RAM ranges will be queried from the
77+
device drivers once the device gets probed, so they can be included
78+
in the crash dump.
79+
80+
Relevant architectures should select NEED_PROC_VMCORE_DEVICE_RAM.
81+
6482
config PROC_SYSCTL
6583
bool "Sysctl support (/proc/sys)" if EXPERT
6684
depends on PROC_FS

fs/proc/vmcore.c

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ static bool vmcore_opened;
7979
/* Whether the vmcore is currently open. */
8080
static unsigned int vmcore_open;
8181

82+
static void vmcore_process_device_ram(struct vmcore_cb *cb);
83+
8284
void register_vmcore_cb(struct vmcore_cb *cb)
8385
{
8486
INIT_LIST_HEAD(&cb->next);
@@ -90,6 +92,8 @@ void register_vmcore_cb(struct vmcore_cb *cb)
9092
*/
9193
if (vmcore_opened)
9294
pr_warn_once("Unexpected vmcore callback registration\n");
95+
if (!vmcore_open && cb->get_device_ram)
96+
vmcore_process_device_ram(cb);
9397
mutex_unlock(&vmcore_mutex);
9498
}
9599
EXPORT_SYMBOL_GPL(register_vmcore_cb);
@@ -1535,6 +1539,158 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
15351539
EXPORT_SYMBOL(vmcore_add_device_dump);
15361540
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
15371541

1542+
#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM
1543+
/*
 * Grow the buffer holding the elfcore header (ehdr + phdrs) to @new_size.
 *
 * If the currently allocated page order already covers @new_size, only the
 * bookkeeping is updated; otherwise a larger zeroed allocation is made, the
 * existing header copied over, and the old pages released.
 *
 * Returns 0 on success, -EINVAL on an attempt to shrink, -ENOMEM on
 * allocation failure. Updates the globals elfcorebuf / elfcorebuf_sz_orig.
 */
static int vmcore_realloc_elfcore_buffer_elf64(size_t new_size)
{
	char *newbuf;

	/* Shrinking the header buffer is not supported. */
	if (WARN_ON_ONCE(new_size < elfcorebuf_sz))
		return -EINVAL;

	/* The existing page allocation may already be large enough. */
	if (get_order(new_size) == get_order(elfcorebuf_sz_orig)) {
		elfcorebuf_sz_orig = new_size;
		return 0;
	}

	newbuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					  get_order(new_size));
	if (!newbuf)
		return -ENOMEM;

	memcpy(newbuf, elfcorebuf, elfcorebuf_sz);
	free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
	elfcorebuf = newbuf;
	elfcorebuf_sz_orig = new_size;
	return 0;
}
1564+
1565+
static void vmcore_reset_offsets_elf64(void)
1566+
{
1567+
Elf64_Phdr *phdr_start = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr));
1568+
loff_t vmcore_off = elfcorebuf_sz + elfnotes_sz;
1569+
Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
1570+
Elf64_Phdr *phdr;
1571+
int i;
1572+
1573+
for (i = 0, phdr = phdr_start; i < ehdr->e_phnum; i++, phdr++) {
1574+
u64 start, end;
1575+
1576+
/*
1577+
* After merge_note_headers_elf64() we should only have a single
1578+
* PT_NOTE entry that starts immediately after elfcorebuf_sz.
1579+
*/
1580+
if (phdr->p_type == PT_NOTE) {
1581+
phdr->p_offset = elfcorebuf_sz;
1582+
continue;
1583+
}
1584+
1585+
start = rounddown(phdr->p_offset, PAGE_SIZE);
1586+
end = roundup(phdr->p_offset + phdr->p_memsz, PAGE_SIZE);
1587+
phdr->p_offset = vmcore_off + (phdr->p_offset - start);
1588+
vmcore_off = vmcore_off + end - start;
1589+
}
1590+
set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
1591+
}
1592+
1593+
/*
 * Append @count device RAM ranges from @list as PT_LOAD entries to the
 * elf64 elfcore header, splice the ranges onto vmcore_list, and refresh
 * all offsets plus the total vmcore size.
 *
 * On success the ranges are owned by vmcore_list and must not be freed by
 * the caller. Returns 0 on success, -ENOSPC if e_phnum would overflow, or
 * a negative error from reallocating the header buffer.
 */
static int vmcore_add_device_ram_elf64(struct list_head *list, size_t count)
{
	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfcorebuf;
	Elf64_Phdr *phdr = (Elf64_Phdr *)(elfcorebuf + sizeof(Elf64_Ehdr));
	struct vmcore_range *range;
	size_t new_size;
	int rc;

	/* e_phnum is only 16 bit wide; detect overflow of the count. */
	if ((Elf32_Half)(ehdr->e_phnum + count) != ehdr->e_phnum + count) {
		pr_err("too many device ram ranges\n");
		return -ENOSPC;
	}

	/* elfcorebuf_sz must always cover full pages. */
	new_size = roundup(sizeof(Elf64_Ehdr) +
			   (ehdr->e_phnum + count) * sizeof(Elf64_Phdr),
			   PAGE_SIZE);

	/* Grow the header buffer so the new PT_LOAD entries fit. */
	rc = vmcore_realloc_elfcore_buffer_elf64(new_size);
	if (rc) {
		pr_err("resizing elfcore failed\n");
		return rc;
	}

	/* Account the new entries in the used header size. */
	elfcorebuf_sz = new_size;

	/* Fill one PT_LOAD entry per device RAM range. */
	phdr += ehdr->e_phnum;
	list_for_each_entry(range, list, list) {
		WARN_ON_ONCE(!IS_ALIGNED(range->paddr | range->size,
					 PAGE_SIZE));
		elfcorehdr_fill_device_ram_ptload_elf64(phdr, range->paddr,
							range->size);

		/* p_offset will be adjusted later. */
		phdr++;
		ehdr->e_phnum++;
	}
	list_splice_tail(list, &vmcore_list);

	/* elfcorebuf_sz and the entry count changed; redo all offsets. */
	vmcore_reset_offsets_elf64();

	/* Finally, recalculate the total vmcore size. */
	vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
				      &vmcore_list);
	proc_vmcore->size = vmcore_size;
	return 0;
}
1646+
1647+
static void vmcore_process_device_ram(struct vmcore_cb *cb)
1648+
{
1649+
unsigned char *e_ident = (unsigned char *)elfcorebuf;
1650+
struct vmcore_range *first, *m;
1651+
LIST_HEAD(list);
1652+
int count;
1653+
1654+
/* We only support Elf64 dumps for now. */
1655+
if (WARN_ON_ONCE(e_ident[EI_CLASS] != ELFCLASS64)) {
1656+
pr_err("device ram ranges only support Elf64\n");
1657+
return;
1658+
}
1659+
1660+
if (cb->get_device_ram(cb, &list)) {
1661+
pr_err("obtaining device ram ranges failed\n");
1662+
return;
1663+
}
1664+
count = list_count_nodes(&list);
1665+
if (!count)
1666+
return;
1667+
1668+
/*
1669+
* For some reason these ranges are already know? Might happen
1670+
* with unusual register->unregister->register sequences; we'll simply
1671+
* sanity check using the first range.
1672+
*/
1673+
first = list_first_entry(&list, struct vmcore_range, list);
1674+
list_for_each_entry(m, &vmcore_list, list) {
1675+
unsigned long long m_end = m->paddr + m->size;
1676+
unsigned long long first_end = first->paddr + first->size;
1677+
1678+
if (first->paddr < m_end && m->paddr < first_end)
1679+
goto out_free;
1680+
}
1681+
1682+
/* If adding the mem nodes succeeds, they must not be freed. */
1683+
if (!vmcore_add_device_ram_elf64(&list, count))
1684+
return;
1685+
out_free:
1686+
vmcore_free_ranges(&list);
1687+
}
1688+
#else /* !CONFIG_PROC_VMCORE_DEVICE_RAM */
1689+
/* No-op stub: device RAM handling is compiled out without
 * CONFIG_PROC_VMCORE_DEVICE_RAM.
 */
static void vmcore_process_device_ram(struct vmcore_cb *cb)
{
}
1692+
#endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */
1693+
15381694
/* Free all dumps in vmcore device dump list */
15391695
static void vmcore_free_device_dumps(void)
15401696
{

include/linux/crash_dump.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size);
2020
extern void elfcorehdr_free(unsigned long long addr);
2121
extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos);
2222
extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos);
23+
void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr,
24+
unsigned long long paddr, unsigned long long size);
2325
extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
2426
unsigned long from, unsigned long pfn,
2527
unsigned long size, pgprot_t prot);
@@ -99,6 +101,12 @@ static inline void vmcore_unusable(void)
99101
* indicated in the vmcore instead. For example, a ballooned page
100102
* contains no data and reading from such a page will cause high
101103
* load in the hypervisor.
104+
* @get_device_ram: query RAM ranges that can only be detected by device
105+
* drivers, such as the virtio-mem driver, so they can be included in
106+
* the crash dump on architectures that allocate the elfcore hdr in the dump
107+
* ("2nd") kernel. Indicated RAM ranges may contain holes to reduce the
108+
* total number of ranges; such holes can be detected using the pfn_is_ram
109+
* callback just like for other RAM.
102110
* @next: List head to manage registered callbacks internally; initialized by
103111
* register_vmcore_cb().
104112
*
@@ -109,6 +117,7 @@ static inline void vmcore_unusable(void)
109117
*/
110118
struct vmcore_cb {
111119
bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn);
120+
int (*get_device_ram)(struct vmcore_cb *cb, struct list_head *list);
112121
struct list_head next;
113122
};
114123
extern void register_vmcore_cb(struct vmcore_cb *cb);

0 commit comments

Comments
 (0)