diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64..97d9595a5ee86 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1608,6 +1608,7 @@ config ARCH_SELECTS_KEXEC_FILE def_bool y depends on KEXEC_FILE select HAVE_IMA_KEXEC if IMA + select KEXEC_PE_IMAGE config ARCH_SUPPORTS_KEXEC_SIG def_bool y diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 4d9cc7a76d9ca..d50796bd2f1e6 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -120,6 +120,7 @@ struct kimage_arch { #ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops kexec_image_ops; +extern const struct kexec_file_ops kexec_pe_image_ops; int arch_kimage_file_post_load_cleanup(struct kimage *image); #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index af1ca875c52ce..7c544c385a9ab 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -24,6 +24,9 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_image_ops, +#ifdef CONFIG_KEXEC_PE_IMAGE + &kexec_pe_image_ops, +#endif NULL }; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8f6e87f0f3a89..8ce93d0d862b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3709,4 +3709,46 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); +enum alloc_type { + TYPE_KALLOC, + TYPE_VMALLOC, + TYPE_VMAP, +}; + +struct mem_range_result { + struct kref ref; + char *buf; + uint32_t buf_sz; + uint32_t data_sz; + /* kmalloc-ed, vmalloc-ed, or vmap-ed */ + enum alloc_type alloc_type; + /* Valid if vmap-ed */ + struct page **pages; + unsigned int pg_cnt; + int status; + struct mem_cgroup *memcg; +}; + +struct mem_range_result *mem_range_result_alloc(void); +void 
mem_range_result_get(struct mem_range_result *r); +void mem_range_result_put(struct mem_range_result *r); + +__bpf_kfunc int bpf_mem_range_result_put(struct mem_range_result *result); +__bpf_kfunc int bpf_copy_to_kernel(const char *name, char *buf, int size); + +typedef int (*resource_handler)(const char *name, struct mem_range_result *r); + +struct carrier_listener { + struct hlist_node node; + char *name; + resource_handler handler; + /* + * bpf_copy_to_kernel() knows the size in advance, so vmap-ed is not + * supported. + */ + enum alloc_type alloc_type; +}; + +int register_carrier_listener(struct carrier_listener *listener); +int unregister_carrier_listener(char *str); #endif /* _LINUX_BPF_H */ diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h index ac862422df158..aa35b9ea96f10 100644 --- a/include/linux/decompress/mm.h +++ b/include/linux/decompress/mm.h @@ -92,7 +92,14 @@ MALLOC_VISIBLE void free(void *where) #define large_malloc(a) vmalloc(a) #define large_free(a) vfree(a) +#ifdef CONFIG_KEEP_DECOMPRESSOR +#define INIT +#define INITCONST +#else #define INIT __init +#define INITCONST __initconst +#endif + #define STATIC #include diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1b10a5d84b68c..2998d8da09d86 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -23,6 +23,10 @@ #include #include +#if defined(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) || defined(CONFIG_KEXEC_PE_IMAGE) +#include +#endif + extern note_buf_t __percpu *crash_notes; #ifdef CONFIG_CRASH_DUMP @@ -158,6 +162,7 @@ extern const struct kexec_file_ops * const kexec_file_loaders[]; int kexec_image_probe_default(struct kimage *image, void *buf, unsigned long buf_len); +void *kexec_image_load_default(struct kimage *image); int kexec_image_post_load_cleanup_default(struct kimage *image); /* @@ -443,6 +448,7 @@ static inline int machine_kexec_post_load(struct kimage *image) { return 0; } extern struct kimage *kexec_image; extern struct kimage 
*kexec_crash_image; +extern const struct kexec_file_ops pe_image_ops; bool kexec_load_permitted(int kexec_image_type); @@ -548,6 +554,10 @@ void set_kexec_sig_enforced(void); static inline void set_kexec_sig_enforced(void) {} #endif +#if defined(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) || defined(CONFIG_KEXEC_PE_IMAGE) +const Elf_Sym *elf_find_symbol(const Elf_Ehdr *ehdr, const char *name); +#endif + #endif /* !defined(__ASSEBMLY__) */ #endif /* LINUX_KEXEC_H */ diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 2ee603a98813e..ee87241c944e0 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -46,6 +46,15 @@ config KEXEC_FILE for kernel and initramfs as opposed to list of segments as accepted by kexec system call. +config KEXEC_PE_IMAGE + bool "Enable parsing UEFI PE file through kexec file based system call" + select KEEP_DECOMPRESSOR + depends on KEXEC_FILE + depends on DEBUG_INFO_BTF && BPF_SYSCALL + help + This option makes the kexec_file_load() syscall cooperate with a bpf-prog + to parse PE format files + config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on ARCH_SUPPORTS_KEXEC_SIG diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235f..04490182f653c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o +obj-$(CONFIG_KEXEC_PE_IMAGE) += kexec_pe_image.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o @@ -141,6 +142,7 @@ obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE) CFLAGS_kstack_erase.o += $(call cc-option,-mgeneral-regs-only) +CFLAGS_kexec_pe_image.o += -I$(srctree)/tools/lib obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o KASAN_SANITIZE_kstack_erase.o := n
KCSAN_SANITIZE_kstack_erase.o := n diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 269c04a246640..3912ed4300472 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -56,6 +56,9 @@ obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o ifeq ($(CONFIG_DMA_SHARED_BUFFER),y) obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o endif +ifeq ($(CONFIG_KEXEC_PE_IMAGE),y) +obj-$(CONFIG_BPF_SYSCALL) += helpers_carrier.o +endif CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cdffd74ddbe65..2659f09225324 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -3714,12 +3715,241 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); } +#ifdef CONFIG_KEXEC_PE_IMAGE + +#define MAX_UNCOMPRESSED_BUF_SIZE (1 << 28) +/* a chunk should be large enough to contain a decompressing */ +#define CHUNK_SIZE (1 << 23) + +/* + * At present, one global allocator for decompression. Later if needed, changing the + * prototype of decompress_fn to introduce each task's allocator. + */ +static DEFINE_MUTEX(output_buf_mutex); + +struct decompress_mem_allocator { + struct page **pages; + unsigned int pg_idx; + void *chunk_start; + unsigned int chunk_size; + void *chunk_cur; +}; + +static struct decompress_mem_allocator dcmpr_allocator; + +/* + * Set up an active chunk to hold partial decompressed data. 
+ */ +static void *vmap_decompressed_chunk(void) +{ + struct decompress_mem_allocator *a = &dcmpr_allocator; + unsigned int i, pg_cnt = a->chunk_size >> PAGE_SHIFT; + struct page **pg_start = &a->pages[a->pg_idx]; + + for (i = 0; i < pg_cnt; i++) + a->pages[a->pg_idx++] = alloc_page(GFP_KERNEL | __GFP_ACCOUNT); + + return vmap(pg_start, pg_cnt, VM_MAP, PAGE_KERNEL); +} + +/* + * Present the scattered pages containing decompressed data at a unified virtual + * address. + */ +static int decompress_mem_allocator_handover(struct decompress_mem_allocator *a, + struct mem_range_result *range) +{ + unsigned long pg_array_sz = a->pg_idx * sizeof(struct page *); + + range->pages = vmalloc(pg_array_sz); + if (!range->pages) + return -ENOMEM; + + range->pg_cnt = a->pg_idx; + memcpy(range->pages, a->pages, pg_array_sz); + range->buf = vmap(range->pages, range->pg_cnt, VM_MAP, PAGE_KERNEL); + if (!range->buf) { + vfree(range->pages); + return -1; + } + /* + * Free the tracking pointer; the pages are freed when mem_range_result + * is released.
+ */ + vfree(a->pages); + a->pages = NULL; + + /* vmap-ed */ + range->alloc_type = TYPE_VMAP; + range->buf_sz = a->pg_idx << PAGE_SHIFT; + range->data_sz = range->buf_sz - a->chunk_size; + range->data_sz += a->chunk_cur - a->chunk_start; + + return 0; +} + +static int decompress_mem_allocator_init( + struct decompress_mem_allocator *allocator, + unsigned int chunk_size) +{ + unsigned long sz = (MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT) * sizeof(struct page *); + + allocator->pages = __vmalloc(sz, GFP_KERNEL | __GFP_ACCOUNT); + if (!allocator->pages) + return -ENOMEM; + + allocator->pg_idx = 0; + allocator->chunk_start = NULL; + allocator->chunk_size = chunk_size; + allocator->chunk_cur = NULL; + return 0; +} + +static void decompress_mem_allocator_fini(struct decompress_mem_allocator *allocator) +{ + unsigned int i; + + /* unmap the active chunk */ + if (!!allocator->chunk_start) + vunmap(allocator->chunk_start); + if (!!allocator->pages) { + for (i = 0; i < allocator->pg_idx; i++) + __free_pages(allocator->pages[i], 0); + vfree(allocator->pages); + } +} + +/* + * This is a callback for decompress_fn. + * + * It copies the partial decompressed content in [buf, buf + len) to dst. If the + * active chunk is not large enough, retire it and activate a new chunk to hold + * the remaining data. 
+ */ +static long flush(void *buf, unsigned long len) +{ + struct decompress_mem_allocator *a = &dcmpr_allocator; + long free, copied = 0; + + /* The first time allocation */ + if (unlikely(!a->chunk_start)) { + a->chunk_start = a->chunk_cur = vmap_decompressed_chunk(); + if (unlikely(!a->chunk_start)) + return -1; + } + + free = a->chunk_start + a->chunk_size - a->chunk_cur; + BUG_ON(free < 0); + if (free < len) { + /* + * If the total size exceeds MAX_UNCOMPRESSED_BUF_SIZE, + * return -1 to indicate to the decompress method that something + * is wrong + */ + if (unlikely((a->pg_idx >= MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT))) + return -1; + memcpy(a->chunk_cur, buf, free); + copied += free; + a->chunk_cur += free; + len -= free; + /* + * When retiring the active chunk, release its virtual address + * but do not release the contents in the pages. + */ + vunmap(a->chunk_start); + a->chunk_start = a->chunk_cur = vmap_decompressed_chunk(); + if (unlikely(!a->chunk_start)) + return -1; + } + memcpy(a->chunk_cur, buf, len); + copied += len; + a->chunk_cur += len; + return copied; +} + +__bpf_kfunc struct mem_range_result *bpf_decompress(char *image_gz_payload, int image_gz_sz) +{ + struct decompress_mem_allocator *a = &dcmpr_allocator; + decompress_fn decompressor; + struct mem_cgroup *memcg, *old_memcg; + struct mem_range_result *range; + const char *name; + char *input_buf; + int ret; + + memcg = get_mem_cgroup_from_current(); + old_memcg = set_active_memcg(memcg); + range = mem_range_result_alloc(); + if (!range) { + pr_err("fail to allocate mem_range_result\n"); + goto error; + } + + input_buf = __vmalloc(image_gz_sz, GFP_KERNEL | __GFP_ACCOUNT); + if (!input_buf) { + kfree(range); + pr_err("fail to allocate input buffer\n"); + goto error; + } + + ret = copy_from_kernel_nofault(input_buf, image_gz_payload, image_gz_sz); + if (ret < 0) { + kfree(range); + vfree(input_buf); + pr_err("Error when copying from 0x%p, size:0x%x\n", + image_gz_payload, image_gz_sz); + goto
error; + } + + mutex_lock(&output_buf_mutex); + decompress_mem_allocator_init(a, CHUNK_SIZE); + decompressor = decompress_method(input_buf, image_gz_sz, &name); + if (!decompressor) { + kfree(range); + vfree(input_buf); + pr_err("Can not find decompress method\n"); + goto error; + } + ret = decompressor(input_buf, image_gz_sz, NULL, flush, + NULL, NULL, NULL); + + vfree(input_buf); + if (ret == 0) { + ret = decompress_mem_allocator_handover(a, range); + if (!!ret) + goto fail; + range->status = 0; + mem_cgroup_tryget(memcg); + range->memcg = memcg; + set_active_memcg(old_memcg); + } +fail: + decompress_mem_allocator_fini(a); + mutex_unlock(&output_buf_mutex); + if (!!ret) { + kfree(range); + range = NULL; + pr_err("Decompress error\n"); + } + +error: + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + return range; +} +#endif + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif +#ifdef CONFIG_KEXEC_PE_IMAGE +BTF_ID_FLAGS(func, bpf_decompress, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_mem_range_result_put, KF_RELEASE | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_to_kernel, KF_TRUSTED_ARGS | KF_SLEEPABLE) +#endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) diff --git a/kernel/bpf/helpers_carrier.c b/kernel/bpf/helpers_carrier.c new file mode 100644 index 0000000000000..7af4ef07ce750 --- /dev/null +++ b/kernel/bpf/helpers_carrier.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_SRCU(srcu); +static DEFINE_MUTEX(carrier_listeners_mutex); +static DEFINE_HASHTABLE(carrier_listeners, 8); + +static struct carrier_listener *find_listener(const char *str) +{ + struct carrier_listener *item; + 
unsigned int hash = jhash(str, strlen(str), 0); + + hash_for_each_possible_rcu(carrier_listeners, item, node, hash) { + if (strcmp(item->name, str) == 0) + return item; + } + return NULL; +} + +static void __mem_range_result_free(struct kref *kref) +{ + struct mem_range_result *result = container_of(kref, struct mem_range_result, ref); + struct mem_cgroup *memcg, *old_memcg; + + /* vunmap() is blocking */ + might_sleep(); + memcg = result->memcg; + old_memcg = set_active_memcg(memcg); + if (likely(!!result->buf)) { + switch (result->alloc_type) { + case TYPE_KALLOC: + kfree(result->buf); + break; + case TYPE_VMALLOC: + vfree(result->buf); + break; + case TYPE_VMAP: + vunmap(result->buf); + for (unsigned int i = 0; i < result->pg_cnt; i++) + __free_pages(result->pages[i], 0); + vfree(result->pages); + } + } + kfree(result); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +} + +struct mem_range_result *mem_range_result_alloc(void) +{ + struct mem_range_result *range; + + range = kmalloc(sizeof(struct mem_range_result), GFP_KERNEL); + if (!range) + return NULL; + kref_init(&range->ref); + return range; +} + +void mem_range_result_get(struct mem_range_result *r) +{ + if (!r) + return; + kref_get(&r->ref); +} + +void mem_range_result_put(struct mem_range_result *r) +{ + might_sleep(); + if (!r) + return; + kref_put(&r->ref, __mem_range_result_free); +} + +__bpf_kfunc int bpf_mem_range_result_put(struct mem_range_result *result) +{ + mem_range_result_put(result); + return 0; +} + +/* + * Cache the content in @buf into kernel + */ +__bpf_kfunc int bpf_copy_to_kernel(const char *name, char *buf, int size) +{ + struct mem_range_result *range; + struct mem_cgroup *memcg, *old_memcg; + struct carrier_listener *item; + resource_handler handler; + enum alloc_type alloc_type; + char *kbuf; + int id, ret = 0; + + /* + * This lock ensures no use of item after free and there is no in-flight + * handler + */ + id = srcu_read_lock(&srcu); + item = find_listener(name); + if 
(!item) { + srcu_read_unlock(&srcu, id); + return -EINVAL; + } + alloc_type = item->alloc_type; + handler = item->handler; + memcg = get_mem_cgroup_from_current(); + old_memcg = set_active_memcg(memcg); + range = mem_range_result_alloc(); + if (!range) { + pr_err("fail to allocate mem_range_result\n"); + ret = -ENOMEM; + goto err; + } + + switch (alloc_type) { + case TYPE_KALLOC: + kbuf = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT); + break; + case TYPE_VMALLOC: + kbuf = __vmalloc(size, GFP_KERNEL | __GFP_ACCOUNT); + break; + default: + kfree(range); + ret = -EINVAL; + goto err; + } + if (!kbuf) { + kfree(range); + ret = -ENOMEM; + goto err; + } + ret = copy_from_kernel_nofault(kbuf, buf, size); + if (unlikely(ret < 0)) { + if (range->alloc_type == TYPE_KALLOC) + kfree(kbuf); + else + vfree(kbuf); + kfree(range); + ret = -EINVAL; + goto err; + } + range->buf = kbuf; + range->buf_sz = size; + range->data_sz = size; + range->memcg = memcg; + mem_cgroup_tryget(memcg); + range->status = 0; + range->alloc_type = alloc_type; + /* We exit the lock after the handler finishes */ + ret = handler(name, range); + srcu_read_unlock(&srcu, id); + mem_range_result_put(range); +err: + if (ret != 0) + srcu_read_unlock(&srcu, id); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + return ret; +} + +int register_carrier_listener(struct carrier_listener *listener) +{ + unsigned int hash; + int ret = 0; + char *str = listener->name; + + /* Not support vmap-ed */ + if (listener->alloc_type > TYPE_VMALLOC) + return -EINVAL; + if (!str) + return -EINVAL; + hash = jhash(str, strlen(str), 0); + mutex_lock(&carrier_listeners_mutex); + if (!find_listener(str)) + hash_add_rcu(carrier_listeners, &listener->node, hash); + else + ret = -EBUSY; + mutex_unlock(&carrier_listeners_mutex); + + return ret; +} +EXPORT_SYMBOL(register_carrier_listener); + +int unregister_carrier_listener(char *str) +{ + struct carrier_listener *item; + int ret = 0; + + mutex_lock(&carrier_listeners_mutex); + item = 
find_listener(str); + if (!!item) { + hash_del_rcu(&item->node); + /* + * It also waits on in-flight handler. Refer to note on the read + * side + */ + synchronize_srcu(&srcu); + } else { + ret = -EINVAL; + } + mutex_unlock(&carrier_listeners_mutex); + + return ret; +} +EXPORT_SYMBOL(unregister_carrier_listener); + diff --git a/kernel/kexec_bpf/Makefile b/kernel/kexec_bpf/Makefile new file mode 100644 index 0000000000000..20448bae233a0 --- /dev/null +++ b/kernel/kexec_bpf/Makefile @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: GPL-2.0 +OUTPUT := .output +CLANG ?= clang +LLC ?= llc +LLVM_STRIP ?= llvm-strip +DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +BPFTOOL ?= $(DEFAULT_BPFTOOL) +LIBBPF_SRC := $(abspath ../../tools/lib/bpf) +BPFOBJ := $(OUTPUT)/libbpf.a +BPF_INCLUDE := $(OUTPUT) +INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../tools/lib) \ + -I$(abspath ../../tools/include/uapi) +CFLAGS := -g -Wall + +srctree := $(patsubst %/kernel/kexec_bpf,%,$(CURDIR)) +VMLINUX = $(srctree)/vmlinux + +abs_out := $(abspath $(OUTPUT)) +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; +MAKEFLAGS += --no-print-directory +submake_extras := feature_display=0 +endif + +.DELETE_ON_ERROR: + +.PHONY: all clean + +all: kexec_pe_parser_bpf.lskel.h + +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) kexec_pe_parser_bpf.lskel.h + +kexec_pe_parser_bpf.lskel.h: $(OUTPUT)/kexec_pe_parser_bpf.o | $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton -L $< > $@ + @# The following sed commands make opts_data[] and opts_insn[] visible in a file instead of only in a function. 
+ @# And it removes the bytecode + $(Q) sed -i '/static const char opts_data\[\].*=/,/";$$/d' $@ + $(Q) sed -i '/static const char opts_insn\[\].*=/,/";$$/d' $@ + $(Q) sed -i \ + -e 's/opts\.data_sz = sizeof(opts_data) - 1;/opts.data_sz = opts_data_sz;/' \ + -e 's/opts\.insns_sz = sizeof(opts_insn) - 1;/opts.insns_sz = opts_insn_sz;/' $@ + $(Q) sed -i '7i static char *opts_data, *opts_insn;\nstatic unsigned int opts_data_sz, opts_insn_sz;' $@ + +$(OUTPUT)/vmlinux.h: $(VMLINUX) $(BPFOBJ) | $(OUTPUT) + @$(BPFTOOL) btf dump file $(VMLINUX) format c > $(OUTPUT)/vmlinux.h + + +$(OUTPUT)/kexec_pe_parser_bpf.o: kexec_pe_parser_bpf.c $(BPFOBJ) | $(OUTPUT) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ + -c $(filter %.c,$^) -o $@ && \ + $(LLVM_STRIP) -g $@ + +$(OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $(OUTPUT) + +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ + OUTPUT=$(abspath $(dir $@))/ $(abspath $@) + +$(DEFAULT_BPFTOOL): + $(Q)$(MAKE) $(submake_extras) -C ../../tools/bpf/bpftool \ + prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install diff --git a/kernel/kexec_bpf/kexec_pe_parser_bpf.c b/kernel/kexec_bpf/kexec_pe_parser_bpf.c new file mode 100644 index 0000000000000..7d524459806e2 --- /dev/null +++ b/kernel/kexec_bpf/kexec_pe_parser_bpf.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include +#include +#include + +/* + * The ringbufs can have different capacity. But only four ringbuf are provided. 
+ */ +#define RINGBUF1_SIZE 4 +#define RINGBUF2_SIZE 4 +#define RINGBUF3_SIZE 4 +#define RINGBUF4_SIZE 4 + +#define KEXEC_RES_KERNEL_NAME "kexec:kernel" +#define KEXEC_RES_INITRD_NAME "kexec:initrd" +#define KEXEC_RES_CMDLINE_NAME "kexec:cmdline" + +/* ringbuf is safe since the user space has no write access to them */ +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, RINGBUF1_SIZE); +} ringbuf_1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, RINGBUF2_SIZE); +} ringbuf_2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, RINGBUF3_SIZE); +} ringbuf_3 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, RINGBUF4_SIZE); +} ringbuf_4 SEC(".maps"); + +char LICENSE[] SEC("license") = "GPL"; + +/* + * This function ensures that the sections .rodata, .data .bss and .rodata.str1.1 + * are created for a bpf prog. + */ +__attribute__((used)) static int dummy(void) +{ + static const char res_kernel[16] __attribute__((used, section(".rodata"))) = KEXEC_RES_KERNEL_NAME; + static char local_name[16] __attribute__((used, section(".data"))) = KEXEC_RES_CMDLINE_NAME; + static char res_cmdline[16] __attribute__((used, section(".bss"))); + + __builtin_memcpy(local_name, KEXEC_RES_INITRD_NAME, 16); + return __builtin_memcmp(local_name, res_kernel, 4); +} + +SEC("fentry.s/bpf_handle_pefile") +__attribute__((used)) int BPF_PROG(parse_pe, struct kexec_context *context) +{ + return 0; +} + +SEC("fentry.s/bpf_post_handle_pefile") +__attribute__((used)) int BPF_PROG(post_parse_pe, struct kexec_context *context) +{ + return 0; +} diff --git a/kernel/kexec_bpf/kexec_pe_parser_bpf.lskel.h b/kernel/kexec_bpf/kexec_pe_parser_bpf.lskel.h new file mode 100644 index 0000000000000..88a3aa90d5e04 --- /dev/null +++ b/kernel/kexec_bpf/kexec_pe_parser_bpf.lskel.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* THIS FILE IS AUTOGENERATED BY 
BPFTOOL! */ +#ifndef __KEXEC_PE_PARSER_BPF_SKEL_H__ +#define __KEXEC_PE_PARSER_BPF_SKEL_H__ + +#include +static char *opts_data, *opts_insn; +static unsigned int opts_data_sz, opts_insn_sz; + +struct kexec_pe_parser_bpf { + struct bpf_loader_ctx ctx; + struct { + struct bpf_map_desc ringbuf_1; + struct bpf_map_desc ringbuf_2; + struct bpf_map_desc ringbuf_3; + struct bpf_map_desc ringbuf_4; + struct bpf_map_desc rodata; + struct bpf_map_desc data; + struct bpf_map_desc bss; + struct bpf_map_desc rodata_str1_1; + } maps; + struct { + struct bpf_prog_desc parse_pe; + struct bpf_prog_desc post_parse_pe; + } progs; + struct { + int parse_pe_fd; + int post_parse_pe_fd; + } links; +}; + +static inline int +kexec_pe_parser_bpf__parse_pe__attach(struct kexec_pe_parser_bpf *skel) +{ + int prog_fd = skel->progs.parse_pe.prog_fd; + int fd = skel_raw_tracepoint_open(NULL, prog_fd); + + if (fd > 0) + skel->links.parse_pe_fd = fd; + return fd; +} + +static inline int +kexec_pe_parser_bpf__post_parse_pe__attach(struct kexec_pe_parser_bpf *skel) +{ + int prog_fd = skel->progs.post_parse_pe.prog_fd; + int fd = skel_raw_tracepoint_open(NULL, prog_fd); + + if (fd > 0) + skel->links.post_parse_pe_fd = fd; + return fd; +} + +static inline int +kexec_pe_parser_bpf__attach(struct kexec_pe_parser_bpf *skel) +{ + int ret = 0; + + ret = ret < 0 ? ret : kexec_pe_parser_bpf__parse_pe__attach(skel); + ret = ret < 0 ? ret : kexec_pe_parser_bpf__post_parse_pe__attach(skel); + return ret < 0 ? 
ret : 0; +} + +static inline void +kexec_pe_parser_bpf__detach(struct kexec_pe_parser_bpf *skel) +{ + skel_closenz(skel->links.parse_pe_fd); + skel_closenz(skel->links.post_parse_pe_fd); +} +static void +kexec_pe_parser_bpf__destroy(struct kexec_pe_parser_bpf *skel) +{ + if (!skel) + return; + kexec_pe_parser_bpf__detach(skel); + skel_closenz(skel->progs.parse_pe.prog_fd); + skel_closenz(skel->progs.post_parse_pe.prog_fd); + skel_closenz(skel->maps.ringbuf_1.map_fd); + skel_closenz(skel->maps.ringbuf_2.map_fd); + skel_closenz(skel->maps.ringbuf_3.map_fd); + skel_closenz(skel->maps.ringbuf_4.map_fd); + skel_closenz(skel->maps.rodata.map_fd); + skel_closenz(skel->maps.data.map_fd); + skel_closenz(skel->maps.bss.map_fd); + skel_closenz(skel->maps.rodata_str1_1.map_fd); + skel_free(skel); +} +static inline struct kexec_pe_parser_bpf * +kexec_pe_parser_bpf__open(void) +{ + struct kexec_pe_parser_bpf *skel; + + skel = skel_alloc(sizeof(*skel)); + if (!skel) + goto cleanup; + skel->ctx.sz = (void *)&skel->links - (void *)skel; + return skel; +cleanup: + kexec_pe_parser_bpf__destroy(skel); + return NULL; +} + +static inline int +kexec_pe_parser_bpf__load(struct kexec_pe_parser_bpf *skel) +{ + struct bpf_load_and_run_opts opts = {}; + int err; + + opts.ctx = (struct bpf_loader_ctx *)skel; + opts.data_sz = opts_data_sz; + opts.data = (void *)opts_data; + opts.insns_sz = opts_insn_sz; + opts.insns = (void *)opts_insn; + + err = bpf_load_and_run(&opts); + if (err < 0) + return err; + return 0; +} + +static inline struct kexec_pe_parser_bpf * +kexec_pe_parser_bpf__open_and_load(void) +{ + struct kexec_pe_parser_bpf *skel; + + skel = kexec_pe_parser_bpf__open(); + if (!skel) + return NULL; + if (kexec_pe_parser_bpf__load(skel)) { + kexec_pe_parser_bpf__destroy(skel); + return NULL; + } + return skel; +} + +__attribute__((unused)) static void +kexec_pe_parser_bpf__assert(struct kexec_pe_parser_bpf *s __attribute__((unused))) +{ +#ifdef __cplusplus +#define _Static_assert 
static_assert +#endif +#ifdef __cplusplus +#undef _Static_assert +#endif +} + +#endif /* __KEXEC_PE_PARSER_BPF_SKEL_H__ */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a8174..137049e7e2410 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -80,7 +80,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf, return ret; } -static void *kexec_image_load_default(struct kimage *image) +void *kexec_image_load_default(struct kimage *image) { if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -880,6 +880,51 @@ static int kexec_calculate_store_digests(struct kimage *image) return ret; } +#if defined(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY) || defined(CONFIG_KEXEC_PE_IMAGE) +const Elf_Sym *elf_find_symbol(const Elf_Ehdr *ehdr, const char *name) +{ + const Elf_Shdr *sechdrs; + const Elf_Sym *syms; + const char *strtab; + int i, k; + + sechdrs = (void *)ehdr + ehdr->e_shoff; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +#endif + #ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY /* * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. 
@@ -1137,49 +1182,10 @@ int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf) static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, const char *name) { - const Elf_Shdr *sechdrs; - const Elf_Ehdr *ehdr; - const Elf_Sym *syms; - const char *strtab; - int i, k; - if (!pi->ehdr) return NULL; - ehdr = pi->ehdr; - sechdrs = (void *)ehdr + ehdr->e_shoff; - - for (i = 0; i < ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_SYMTAB) - continue; - - if (sechdrs[i].sh_link >= ehdr->e_shnum) - /* Invalid strtab section number */ - continue; - strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (void *)ehdr + sechdrs[i].sh_offset; - - /* Go through symbols for a match */ - for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { - if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) - continue; - - if (strcmp(strtab + syms[k].st_name, name) != 0) - continue; - - if (syms[k].st_shndx == SHN_UNDEF || - syms[k].st_shndx >= ehdr->e_shnum) { - pr_debug("Symbol: %s has bad section index %d.\n", - name, syms[k].st_shndx); - return NULL; - } - - /* Found the symbol we are looking for */ - return &syms[k]; - } - } - - return NULL; + return elf_find_symbol(pi->ehdr, name); } void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) diff --git a/kernel/kexec_pe_image.c b/kernel/kexec_pe_image.c new file mode 100644 index 0000000000000..0e9cd09782463 --- /dev/null +++ b/kernel/kexec_pe_image.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kexec PE image loader + + * Copyright (C) 2025 Red Hat, Inc + */ + +#define pr_fmt(fmt) "kexec_file(Image): " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kexec_bpf/kexec_pe_parser_bpf.lskel.h" + +#define KEXEC_RES_KERNEL_NAME "kexec:kernel" +#define KEXEC_RES_INITRD_NAME "kexec:initrd" +#define KEXEC_RES_CMDLINE_NAME "kexec:cmdline" + +struct 
kexec_res { + char *name; + /* The free of buffer is deferred to kimage_file_post_load_cleanup */ + struct mem_range_result *r; +}; + +static struct kexec_res parsed_resource[3] = { + { KEXEC_RES_KERNEL_NAME, }, + { KEXEC_RES_INITRD_NAME, }, + { KEXEC_RES_CMDLINE_NAME, }, +}; + +/* + * @name should be one of : kernel, initrd, cmdline + */ +static int bpf_kexec_carrier(const char *name, struct mem_range_result *r) +{ + struct kexec_res *res; + int i; + + if (!r || !name) + return -EINVAL; + + for (i = 0; i < 3; i++) { + if (!strcmp(parsed_resource[i].name, name)) + break; + } + if (i >= 3) + return -EINVAL; + + res = &parsed_resource[i]; + /* + * Replace the intermediate resource generated by the previous step. + */ + if (!!res->r) + mem_range_result_put(res->r); + mem_range_result_get(r); + res->r = r; + return 0; +} + +static struct carrier_listener kexec_res_listener[3] = { + { .name = KEXEC_RES_KERNEL_NAME, + .alloc_type = TYPE_VMALLOC, + .handler = bpf_kexec_carrier, + }, + { .name = KEXEC_RES_INITRD_NAME, + .alloc_type = TYPE_VMALLOC, + .handler = bpf_kexec_carrier, + }, + { .name = KEXEC_RES_CMDLINE_NAME, + /* kmalloc-ed */ + .alloc_type = TYPE_KALLOC, + .handler = bpf_kexec_carrier, + }, +}; + +static bool pe_has_bpf_section(const char *file_buf, unsigned long pe_sz); + +static bool is_valid_pe(const char *kernel_buf, unsigned long kernel_len) +{ + struct mz_hdr *mz; + struct pe_hdr *pe; + + if (!kernel_buf) + return false; + mz = (struct mz_hdr *)kernel_buf; + if (mz->magic != IMAGE_DOS_SIGNATURE) + return false; + pe = (struct pe_hdr *)(kernel_buf + mz->peaddr); + if (pe->magic != IMAGE_NT_SIGNATURE) + return false; + if (pe->opt_hdr_size == 0) { + pr_err("optional header is missing\n"); + return false; + } + + return pe_has_bpf_section(kernel_buf, kernel_len); +} + +static bool is_valid_format(const char *kernel_buf, unsigned long kernel_len) +{ + return is_valid_pe(kernel_buf, kernel_len); +} + +/* + * The UEFI Terse Executable (TE) image has MZ header. 
+ */ +static int pe_image_probe(const char *kernel_buf, unsigned long kernel_len) +{ + return is_valid_pe(kernel_buf, kernel_len) ? 0 : -1; +} + +static int pe_get_section(const char *file_buf, const char *sect_name, + char **sect_start, unsigned long *sect_sz) +{ + struct pe_hdr *pe_hdr; + struct pe32plus_opt_hdr *opt_hdr; + struct section_header *sect_hdr; + int section_nr, i; + struct mz_hdr *mz = (struct mz_hdr *)file_buf; + + *sect_start = NULL; + *sect_sz = 0; + pe_hdr = (struct pe_hdr *)(file_buf + mz->peaddr); + section_nr = pe_hdr->sections; + opt_hdr = (struct pe32plus_opt_hdr *)(file_buf + mz->peaddr + sizeof(struct pe_hdr)); + sect_hdr = (struct section_header *)((char *)opt_hdr + pe_hdr->opt_hdr_size); + + for (i = 0; i < section_nr; i++) { + if (strcmp(sect_hdr->name, sect_name) == 0) { + *sect_start = (char *)file_buf + sect_hdr->data_addr; + *sect_sz = sect_hdr->raw_data_size; + return 0; + } + sect_hdr++; + } + + return -1; +} + +static bool pe_has_bpf_section(const char *file_buf, unsigned long pe_sz) +{ + char *sect_start = NULL; + unsigned long sect_sz = 0; + int ret; + + ret = pe_get_section(file_buf, ".bpf", &sect_start, &sect_sz); + if (ret < 0) + return false; + return true; +} + +static struct kexec_pe_parser_bpf *pe_parser; + +static void *get_symbol_from_elf(const char *elf_data, size_t elf_size, + const char *symbol_name, unsigned int *symbol_size) +{ + Elf_Ehdr *ehdr = (Elf_Ehdr *)elf_data; + Elf_Shdr *shdr, *dst_shdr; + const Elf_Sym *sym; + void *symbol_data; + + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) { + pr_err("Not a valid ELF file\n"); + return NULL; + } + + sym = elf_find_symbol(ehdr, symbol_name); + if (!sym) + return NULL; + shdr = (struct elf_shdr *)(elf_data + ehdr->e_shoff); + dst_shdr = &shdr[sym->st_shndx]; + symbol_data = (void *)(elf_data + dst_shdr->sh_offset + sym->st_value); + *symbol_size = sym->st_size; + + return symbol_data; +} + +/* Load a ELF */ +static int arm_bpf_prog(char *bpf_elf, unsigned long sz) +{ + 
opts_data = get_symbol_from_elf(bpf_elf, sz, "opts_data", &opts_data_sz); + opts_insn = get_symbol_from_elf(bpf_elf, sz, "opts_insn", &opts_insn_sz); + if (!opts_data || !opts_insn) + return -1; + /* + * When light skeleton generates opts_data[] and opts_insn[], it appends a + * NULL terminator at the end of string + */ + opts_data_sz = opts_data_sz - 1; + opts_insn_sz = opts_insn_sz - 1; + + pe_parser = kexec_pe_parser_bpf__open_and_load(); + if (!pe_parser) + return -1; + kexec_pe_parser_bpf__attach(pe_parser); + + return 0; +} + +static void disarm_bpf_prog(void) +{ + kexec_pe_parser_bpf__destroy(pe_parser); + pe_parser = NULL; + opts_data = NULL; + opts_insn = NULL; +} + +struct kexec_context { + bool kdump; + char *image; + int image_sz; + char *initrd; + int initrd_sz; + char *cmdline; + int cmdline_sz; +}; + +void bpf_handle_pefile(struct kexec_context *context); +void bpf_post_handle_pefile(struct kexec_context *context); + + +/* + * optimize("O0") prevents inline, compiler constant propagation + */ +__attribute__((used, optimize("O0"))) void bpf_handle_pefile(struct kexec_context *context) +{ + /* + * To prevent linker from Identical Code Folding (ICF) with bpf_handle_pefile, + * making them have different code. + */ + volatile int dummy = 0; + + dummy += 1; +} + +__attribute__((used, optimize("O0"))) void bpf_post_handle_pefile(struct kexec_context *context) +{ + volatile int dummy = 0; + + dummy += 2; +} + +BTF_KFUNCS_START(kexec_modify_return_ids) +BTF_ID_FLAGS(func, bpf_handle_pefile, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_post_handle_pefile, KF_SLEEPABLE) +BTF_KFUNCS_END(kexec_modify_return_ids) + +static const struct btf_kfunc_id_set kexec_modify_return_set = { + .owner = THIS_MODULE, + .set = &kexec_modify_return_ids, +}; + +static int __init kexec_bpf_prog_run_init(void) +{ + return register_btf_fmodret_id_set(&kexec_modify_return_set); +} +late_initcall(kexec_bpf_prog_run_init); + +/* + * PE file may be nested and should be unfold one by one. 
+ * Query 'kernel', 'initrd', 'cmdline' in cur_phase, as they are inputs for the + * next phase. + */ +static int prepare_nested_pe(char **kernel, unsigned long *kernel_len, char **initrd, + unsigned long *initrd_len, char **cmdline) +{ + struct kexec_res *res; + int ret = -1; + + *kernel = NULL; + *kernel_len = 0; + + res = &parsed_resource[0]; + if (!!res->r) { + *kernel = res->r->buf; + *kernel_len = res->r->data_sz; + ret = 0; + } + + res = &parsed_resource[1]; + if (!!res->r) { + *initrd = res->r->buf; + *initrd_len = res->r->data_sz; + } + + res = &parsed_resource[2]; + if (!!res->r) { + *cmdline = res->r->buf; + } + + return ret; +} + +static void *pe_image_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + char *linux_start, *initrd_start, *cmdline_start, *bpf_start; + unsigned long linux_sz, initrd_sz, cmdline_sz, bpf_sz; + struct kexec_res *res; + struct mem_range_result *r; + void *ldata; + int ret; + + linux_start = kernel; + linux_sz = kernel_len; + initrd_start = initrd; + initrd_sz = initrd_len; + cmdline_start = cmdline; + cmdline_sz = cmdline_len; + + for (int i = 0; i < ARRAY_SIZE(kexec_res_listener); i++) + register_carrier_listener(&kexec_res_listener[i]); + + while (is_valid_format(linux_start, linux_sz) && + pe_has_bpf_section(linux_start, linux_sz)) { + struct kexec_context context; + + pe_get_section((const char *)linux_start, ".bpf", &bpf_start, &bpf_sz); + if (!!bpf_sz) { + /* load and attach bpf-prog */ + ret = arm_bpf_prog(bpf_start, bpf_sz); + if (ret) { + pr_err("Fail to load .bpf section\n"); + ldata = ERR_PTR(ret); + goto err; + } + } + if (image->type != KEXEC_TYPE_CRASH) + context.kdump = false; + else + context.kdump = true; + context.image = linux_start; + context.image_sz = linux_sz; + context.initrd = initrd_start; + context.initrd_sz = initrd_sz; + context.cmdline = cmdline_start; + context.cmdline_sz = 
cmdline_start ? strlen(cmdline_start) : 0; + /* bpf-prog fentry, which handle above buffers. */ + bpf_handle_pefile(&context); + + prepare_nested_pe(&linux_start, &linux_sz, &initrd_start, + &initrd_sz, &cmdline_start); + /* bpf-prog fentry */ + bpf_post_handle_pefile(&context); + /* + * detach the current bpf-prog from their attachment points. + */ + disarm_bpf_prog(); + } + + for (int i = 0; i < ARRAY_SIZE(kexec_res_listener); i++) + unregister_carrier_listener(kexec_res_listener[i].name); + + /* + * image's kernel_buf, initrd_buf, cmdline_buf are set. Now they should + * be updated to the new content. + */ + + res = &parsed_resource[0]; + /* Kernel part should always be parsed */ + if (!res->r) { + pr_err("Can not parse kernel\n"); + ldata = ERR_PTR(-EINVAL); + goto err; + } + kernel = res->r->buf; + kernel_len = res->r->data_sz; + vfree(image->kernel_buf); + image->kernel_buf = kernel; + image->kernel_buf_len = kernel_len; + + res = &parsed_resource[1]; + if (!!res->r) { + initrd = res->r->buf; + initrd_len = res->r->data_sz; + vfree(image->initrd_buf); + image->initrd_buf = initrd; + image->initrd_buf_len = initrd_len; + } + res = &parsed_resource[2]; + if (!!res->r) { + cmdline = res->r->buf; + cmdline_len = res->r->data_sz; + kfree(image->cmdline_buf); + image->cmdline_buf = cmdline; + image->cmdline_buf_len = cmdline_len; + } + + if (kernel == NULL || initrd == NULL || cmdline == NULL) { + char *c, buf[64]; + + c = buf; + if (kernel == NULL) { + strcpy(c, "kernel "); + c += strlen("kernel "); + } + if (initrd == NULL) { + strcpy(c, "initrd "); + c += strlen("initrd "); + } + if (cmdline == NULL) { + strcpy(c, "cmdline "); + c += strlen("cmdline "); + } + *c = '\0'; + pr_err("Can not extract data for %s", buf); + ldata = ERR_PTR(-EINVAL); + goto err; + } + + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_err("Fail to find suitable image loader\n"); + ldata = ERR_PTR(ret); + goto err; + } + ldata = 
kexec_image_load_default(image); + if (IS_ERR(ldata)) { + pr_err("architecture code fails to load image\n"); + goto err; + } + image->image_loader_data = ldata; + +err: + for (int i = 0; i < 3; i++) { + r = parsed_resource[i].r; + if (!r) + continue; + parsed_resource[i].r = NULL; + /* + * The release of buffer defers to + * kimage_file_post_load_cleanup() + */ + r->buf = NULL; + r->buf_sz = 0; + mem_range_result_put(r); + } + + return ldata; +} + +const struct kexec_file_ops kexec_pe_image_ops = { + .probe = pe_image_probe, + .load = pe_image_load, +#ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG + .verify_sig = kexec_kernel_verify_pe_sig, +#endif +}; diff --git a/lib/Kconfig b/lib/Kconfig index c483951b624ff..05b1e353dcd2a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -165,6 +165,9 @@ config RANDOM32_SELFTEST # # compression support is select'ed if needed # +config KEEP_DECOMPRESSOR + bool + config 842_COMPRESS select CRC32 tristate diff --git a/lib/decompress.c b/lib/decompress.c index ab3fc90ffc646..3d5b6304bb0f1 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -6,7 +6,7 @@ */ #include - +#include #include #include #include @@ -48,7 +48,7 @@ struct compress_format { decompress_fn decompressor; }; -static const struct compress_format compressed_formats[] __initconst = { +static const struct compress_format compressed_formats[] INITCONST = { { {0x1f, 0x8b}, "gzip", gunzip }, { {0x1f, 0x9e}, "gzip", gunzip }, { {0x42, 0x5a}, "bzip2", bunzip2 }, @@ -60,7 +60,7 @@ static const struct compress_format compressed_formats[] __initconst = { { {0, 0}, NULL, NULL } }; -decompress_fn __init decompress_method(const unsigned char *inbuf, long len, +decompress_fn INIT decompress_method(const unsigned char *inbuf, long len, const char **name) { const struct compress_format *cf; diff --git a/tools/kexec/Makefile b/tools/kexec/Makefile new file mode 100644 index 0000000000000..5cc4b6088b3f8 --- /dev/null +++ b/tools/kexec/Makefile @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: GPL-2.0 
+ +# Ensure Kbuild variables are available +include ../scripts/Makefile.include + +srctree := $(patsubst %/tools/kexec,%,$(CURDIR)) +VMLINUX = $(srctree)/vmlinux +TOOLSDIR := $(srctree)/tools +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +ARCH ?= $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ -e s/aarch64.*/arm64/ -e s/riscv64/riscv/ -e s/loongarch.*/loongarch/) +# At present, zboot image format is used by arm64, riscv, loongarch +# And arch/$(ARCH)/boot/vmlinux.bin is the uncompressed file instead of arch/$(ARCH)/boot/Image +ifeq ($(ARCH),$(filter $(ARCH),arm64 riscv loongarch)) + EFI_IMAGE := $(srctree)/arch/$(ARCH)/boot/vmlinuz.efi + KERNEL_IMAGE := $(srctree)/arch/$(ARCH)/boot/vmlinux.bin +else + $(error Unsupported architecture: $(ARCH)) +endif + + +CC = clang +CFLAGS = -O2 +BPF_PROG_CFLAGS = -g -O2 -target bpf -Wall -I $(BPFDIR) -I . +BPFTOOL = bpftool + +# List of generated target files +HEADERS = vmlinux.h bpf_helper_defs.h image_size.h +ZBOOT_TARGETS = bytecode.c zboot_parser_bpf.o bytecode.o zboot_image_builder zboot.efi + + +# Targets +zboot: $(HEADERS) $(ZBOOT_TARGETS) + +# Rule to generate vmlinux.h from vmlinux +vmlinux.h: $(VMLINUX) + @command -v $(BPFTOOL) >/dev/null 2>&1 || { echo >&2 "$(BPFTOOL) is required but not found. Please install it."; exit 1; } + @$(BPFTOOL) btf dump file $(VMLINUX) format c > vmlinux.h + +bpf_helper_defs.h: $(srctree)/tools/include/uapi/linux/bpf.h + @$(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \ + --file $(srctree)/tools/include/uapi/linux/bpf.h > bpf_helper_defs.h + +image_size.h: $(KERNEL_IMAGE) + @{ \ + if [ ! 
-f "$(KERNEL_IMAGE)" ]; then \ + echo "Error: File '$(KERNEL_IMAGE)' does not exist"; \ + exit 1; \ + fi; \ + FILE_SIZE=$$(stat -c '%s' "$(KERNEL_IMAGE)" 2>/dev/null); \ + POWER=4096; \ + while [ $$POWER -le $$FILE_SIZE ]; do \ + POWER=$$((POWER * 2)); \ + done; \ + RINGBUF_SIZE=$$POWER; \ + echo "#define RINGBUF1_SIZE $$RINGBUF_SIZE" > $@; \ + echo "#define IMAGE_SIZE $$FILE_SIZE" >> $@; \ + } + + +# Rule to generate zboot_parser_bpf.o, depends on vmlinux.h +zboot_parser_bpf.o: zboot_parser_bpf.c vmlinux.h bpf_helper_defs.h + @$(CC) $(BPF_PROG_CFLAGS) -c zboot_parser_bpf.c -o zboot_parser_bpf.o + +# Generate zboot_parser_bpf.lskel.h using bpftool +# Then, extract the opts_data[] and opts_insn[] arrays and remove 'static' +# keywords to avoid being optimized away. +bytecode.c: zboot_parser_bpf.o + @$(BPFTOOL) gen skeleton -L zboot_parser_bpf.o > zboot_parser_bpf.lskel.h + @sed -n '/static const char opts_data\[\]/,/;/p' zboot_parser_bpf.lskel.h | sed 's/static const/const/' > $@ + @sed -n '/static const char opts_insn\[\]/,/;/p' zboot_parser_bpf.lskel.h | sed 's/static const/const/' >> $@ + @rm -f zboot_parser_bpf.lskel.h + +bytecode.o: bytecode.c + @$(CC) -c $< -o $@ + +# Rule to build zboot_image_builder executable +zboot_image_builder: zboot_image_builder.c + @$(CC) $(CFLAGS) $< -o $@ + +zboot.efi: zboot_image_builder bytecode.o + @chmod +x zboot_image_builder + @./zboot_image_builder $(EFI_IMAGE) bytecode.o $@ + +# Clean up generated files +clean: + @rm -f $(HEADERS) $(ZBOOT_TARGETS) + +.PHONY: all clean diff --git a/tools/kexec/pe.h b/tools/kexec/pe.h new file mode 100644 index 0000000000000..c2273d3fc3bb3 --- /dev/null +++ b/tools/kexec/pe.h @@ -0,0 +1,177 @@ +/* + * Extract from linux kernel include/linux/pe.h + */ + +#ifndef __PE_H__ +#define __PE_H__ + +#define IMAGE_DOS_SIGNATURE 0x5a4d /* "MZ" */ +#define IMAGE_NT_SIGNATURE 0x00004550 /* "PE\0\0" */ + +struct mz_hdr { + uint16_t magic; /* MZ_MAGIC */ + uint16_t lbsize; /* size of last used block */ + 
uint16_t blocks; /* pages in file, 0x3 */ + uint16_t relocs; /* relocations */ + uint16_t hdrsize; /* header size in "paragraphs" */ + uint16_t min_extra_pps; /* .bss */ + uint16_t max_extra_pps; /* runtime limit for the arena size */ + uint16_t ss; /* relative stack segment */ + uint16_t sp; /* initial %sp register */ + uint16_t checksum; /* word checksum */ + uint16_t ip; /* initial %ip register */ + uint16_t cs; /* initial %cs relative to load segment */ + uint16_t reloc_table_offset; /* offset of the first relocation */ + uint16_t overlay_num; /* overlay number. set to 0. */ + uint16_t reserved0[4]; /* reserved */ + uint16_t oem_id; /* oem identifier */ + uint16_t oem_info; /* oem specific */ + uint16_t reserved1[10]; /* reserved */ + uint32_t peaddr; /* address of pe header */ + char message[]; /* message to print */ +}; + +struct pe_hdr { + uint32_t magic; /* PE magic */ + uint16_t machine; /* machine type */ + uint16_t sections; /* number of sections */ + uint32_t timestamp; /* time_t */ + uint32_t symbol_table; /* symbol table offset */ + uint32_t symbols; /* number of symbols */ + uint16_t opt_hdr_size; /* size of optional header */ + uint16_t flags; /* flags */ +}; + +/* the fact that pe32 isn't padded where pe32+ is 64-bit means union won't + * work right. vomit. 
*/ +struct pe32_opt_hdr { + /* "standard" header */ + uint16_t magic; /* file type */ + uint8_t ld_major; /* linker major version */ + uint8_t ld_minor; /* linker minor version */ + uint32_t text_size; /* size of text section(s) */ + uint32_t data_size; /* size of data section(s) */ + uint32_t bss_size; /* size of bss section(s) */ + uint32_t entry_point; /* file offset of entry point */ + uint32_t code_base; /* relative code addr in ram */ + uint32_t data_base; /* relative data addr in ram */ + /* "windows" header */ + uint32_t image_base; /* preferred load address */ + uint32_t section_align; /* alignment in bytes */ + uint32_t file_align; /* file alignment in bytes */ + uint16_t os_major; /* major OS version */ + uint16_t os_minor; /* minor OS version */ + uint16_t image_major; /* major image version */ + uint16_t image_minor; /* minor image version */ + uint16_t subsys_major; /* major subsystem version */ + uint16_t subsys_minor; /* minor subsystem version */ + uint32_t win32_version; /* reserved, must be 0 */ + uint32_t image_size; /* image size */ + uint32_t header_size; /* header size rounded up to + file_align */ + uint32_t csum; /* checksum */ + uint16_t subsys; /* subsystem */ + uint16_t dll_flags; /* more flags! 
*/ + uint32_t stack_size_req;/* amt of stack requested */ + uint32_t stack_size; /* amt of stack required */ + uint32_t heap_size_req; /* amt of heap requested */ + uint32_t heap_size; /* amt of heap required */ + uint32_t loader_flags; /* reserved, must be 0 */ + uint32_t data_dirs; /* number of data dir entries */ +}; + +struct pe32plus_opt_hdr { + uint16_t magic; /* file type */ + uint8_t ld_major; /* linker major version */ + uint8_t ld_minor; /* linker minor version */ + uint32_t text_size; /* size of text section(s) */ + uint32_t data_size; /* size of data section(s) */ + uint32_t bss_size; /* size of bss section(s) */ + uint32_t entry_point; /* file offset of entry point */ + uint32_t code_base; /* relative code addr in ram */ + /* "windows" header */ + uint64_t image_base; /* preferred load address */ + uint32_t section_align; /* alignment in bytes */ + uint32_t file_align; /* file alignment in bytes */ + uint16_t os_major; /* major OS version */ + uint16_t os_minor; /* minor OS version */ + uint16_t image_major; /* major image version */ + uint16_t image_minor; /* minor image version */ + uint16_t subsys_major; /* major subsystem version */ + uint16_t subsys_minor; /* minor subsystem version */ + uint32_t win32_version; /* reserved, must be 0 */ + uint32_t image_size; /* image size */ + uint32_t header_size; /* header size rounded up to + file_align */ + uint32_t csum; /* checksum */ + uint16_t subsys; /* subsystem */ + uint16_t dll_flags; /* more flags! 
*/ + uint64_t stack_size_req;/* amt of stack requested */ + uint64_t stack_size; /* amt of stack required */ + uint64_t heap_size_req; /* amt of heap requested */ + uint64_t heap_size; /* amt of heap required */ + uint32_t loader_flags; /* reserved, must be 0 */ + uint32_t data_dirs; /* number of data dir entries */ +}; + +struct data_dirent { + uint32_t virtual_address; /* relative to load address */ + uint32_t size; +}; + +struct data_directory { + struct data_dirent exports; /* .edata */ + struct data_dirent imports; /* .idata */ + struct data_dirent resources; /* .rsrc */ + struct data_dirent exceptions; /* .pdata */ + struct data_dirent certs; /* certs */ + struct data_dirent base_relocations; /* .reloc */ + struct data_dirent debug; /* .debug */ + struct data_dirent arch; /* reservered */ + struct data_dirent global_ptr; /* global pointer reg. Size=0 */ + struct data_dirent tls; /* .tls */ + struct data_dirent load_config; /* load configuration structure */ + struct data_dirent bound_imports; /* no idea */ + struct data_dirent import_addrs; /* import address table */ + struct data_dirent delay_imports; /* delay-load import table */ + struct data_dirent clr_runtime_hdr; /* .cor (object only) */ + struct data_dirent reserved; +}; + +struct section_header { + char name[8]; /* name or "/12\0" string tbl offset */ + uint32_t virtual_size; /* size of loaded section in ram */ + uint32_t virtual_address; /* relative virtual address */ + uint32_t raw_data_size; /* size of the section */ + uint32_t data_addr; /* file pointer to first page of sec */ + uint32_t relocs; /* file pointer to relocation entries */ + uint32_t line_numbers; /* line numbers! */ + uint16_t num_relocs; /* number of relocations */ + uint16_t num_lin_numbers; /* srsly. 
*/ + uint32_t flags; +}; + +struct win_certificate { + uint32_t length; + uint16_t revision; + uint16_t cert_type; +}; + +/* + * Return -1 if not PE, else offset of the PE header + */ +static int get_pehdr_offset(const char *buf) +{ + int pe_hdr_offset; + + pe_hdr_offset = *((int *)(buf + 0x3c)); + buf += pe_hdr_offset; + if (!!memcmp(buf, "PE\0\0", 4)) { + printf("Not a PE file\n"); + return -1; + } + + return pe_hdr_offset; +} + +#endif diff --git a/tools/kexec/zboot_image_builder.c b/tools/kexec/zboot_image_builder.c new file mode 100644 index 0000000000000..2508cafd7c200 --- /dev/null +++ b/tools/kexec/zboot_image_builder.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Red Hat, Inc. + * The zboot format carries the compressed kernel image offset and size + * information in the DOS header. The program appends a bpf section to PE file, + * meanwhile maintains the offset and size information, which is lost when using + * objcopy to handle zboot image. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "pe.h" + +#ifdef DEBUG_DETAIL + #define dprintf(...) printf(__VA_ARGS__) +#else + #define dprintf(...) 
((void)0) +#endif + +typedef struct { + union { + struct { + unsigned int mz_magic; + char image_type[4]; + /* offset to the whole file start */ + unsigned int payload_offset; + unsigned int payload_size; + unsigned int reserved[2]; + char comp_type[4]; + }; + char raw_bytes[56]; + }; + unsigned int linux_pe_magic; + /* offset at: 0x3c or 60 */ + unsigned int pe_header_offset; +} __attribute__((packed)) pe_zboot_header; + +typedef unsigned long uintptr_t; +#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1)) + +int main(int argc, char **argv) +{ + uint32_t payload_new_offset, payload_sect_off; + uint32_t payload_size; + uint32_t payload_sect_idx; + pe_zboot_header *zheader; + struct pe_hdr *pe_hdr; + struct pe32plus_opt_hdr *opt_hdr; + int base_fd, bpf_fd, out_fd; + char *base_start_addr, *base_cur; + char *out_start_addr, *out_cur; + uint32_t out_sz, max_va_end = 0; + struct stat sb; + int i = 0, ret = 0; + + if (argc != 4) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return -1; + } + + const char *original_pe = argv[1]; + const char *binary_file = argv[2]; + const char *new_pe = argv[3]; + FILE *bin_fp = fopen(binary_file, "rb"); + if (!bin_fp) { + perror("Failed to open binary file"); + return -1; + } + fseek(bin_fp, 0, SEEK_END); + size_t bin_size = ftell(bin_fp); + fseek(bin_fp, 0, SEEK_SET); + base_fd = open(original_pe, O_RDWR); + out_fd = open(new_pe, O_RDWR | O_CREAT, 0644); + if (base_fd == -1 || out_fd == -1) { + perror("Error opening file"); + exit(1); + } + + if (fstat(base_fd, &sb) == -1) { + perror("Error getting file size"); + exit(1); + } + base_start_addr = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, base_fd, 0); + if (base_start_addr == MAP_FAILED) { + perror("Error mmapping the file"); + exit(1); + } + /* 64KB for section table extending */ + out_sz = sb.st_size + bin_size + (1 << 16); + out_start_addr = mmap(NULL, out_sz, PROT_WRITE, MAP_SHARED, out_fd, 0); + if (ftruncate(out_fd, out_sz) == -1) { + 
perror("Failed to resize output file"); + ret = -1; + goto err; + } + if (out_start_addr == MAP_FAILED) { + perror("Error mmapping the file"); + exit(1); + } + + zheader = (pe_zboot_header *)base_start_addr; + if (zheader->mz_magic != 0x5A4D) { // 'MZ' + fprintf(stderr, "Invalid DOS signature\n"); + return -1; + } + uint32_t pe_hdr_offset = get_pehdr_offset((const char *)base_start_addr); + base_cur = base_start_addr + pe_hdr_offset; + pe_hdr = (struct pe_hdr *)base_cur; + if (pe_hdr->magic!= 0x00004550) { // 'PE\0\0' + fprintf(stderr, "Invalid PE signature\n"); + return -1; + } + base_cur += sizeof(struct pe_hdr); + opt_hdr = (struct pe32plus_opt_hdr *)base_cur; + uint32_t file_align = opt_hdr->file_align; + uint32_t section_alignment = opt_hdr->section_align; + + uint16_t num_sections = pe_hdr->sections; + struct section_header *base_sections, *sect; + uint32_t section_table_offset = pe_hdr_offset + sizeof(struct pe_hdr) + pe_hdr->opt_hdr_size; + base_sections = (struct section_header *)(base_start_addr + section_table_offset); + + /* Decide the section idx and the payload offset within the section */ + for (i = 0; i < num_sections; i++) { + sect = &base_sections[i]; + if (zheader->payload_offset >= sect->data_addr && + zheader->payload_offset < (sect->data_addr + sect->raw_data_size)) { + payload_sect_idx = i; + payload_sect_off = zheader->payload_offset - sect->data_addr; + } + } + + /* Calculate the end of the last section in virtual memory */ + for (i = 0; i < num_sections; i++) { + uint32_t section_end = base_sections[i].virtual_address + base_sections[i].virtual_size; + if (section_end > max_va_end) { + max_va_end = section_end; + } + } + + /* Calculate virtual address for the new .bpf section */ + uint32_t bpf_virtual_address = ALIGN_UP(max_va_end, section_alignment); + + pe_zboot_header *new_zhdr = malloc(sizeof(pe_zboot_header)); + memcpy(new_zhdr, zheader, sizeof(pe_zboot_header)); + struct pe_hdr *new_hdr = malloc(sizeof(struct pe_hdr)); + 
memcpy(new_hdr, pe_hdr, sizeof(struct pe_hdr)); + new_hdr->sections += 1; + struct pe32plus_opt_hdr *new_opt_hdr = malloc(pe_hdr->opt_hdr_size); + memcpy(new_opt_hdr, opt_hdr, pe_hdr->opt_hdr_size); + /* Create new section headers array (original + new section) */ + struct section_header *new_sections = calloc(1, new_hdr->sections * sizeof(struct section_header)); + if (!new_sections) { + perror("Failed to allocate memory for new section headers"); + return -1; + } + memcpy(new_sections, base_sections, pe_hdr->sections * sizeof(struct section_header)); + + /* Configure the new .bpf section */ + struct section_header *bpf_section = &new_sections[new_hdr->sections - 1]; + memset(bpf_section, 0, sizeof(struct section_header)); + strncpy((char *)bpf_section->name, ".bpf", 8); + bpf_section->virtual_size = bin_size; + bpf_section->virtual_address = bpf_virtual_address; + bpf_section->raw_data_size = bin_size; + bpf_section->flags = 0x40000000; //Readable + + /* Update headers */ + uint32_t new_size_of_image = bpf_section->virtual_address + bpf_section->virtual_size; + new_size_of_image = ALIGN_UP(new_size_of_image, section_alignment); + new_opt_hdr->image_size = new_size_of_image; + + size_t section_table_size = new_hdr->sections * (sizeof(struct section_header)); + size_t headers_size = section_table_offset + section_table_size; + size_t aligned_headers_size = ALIGN_UP(headers_size, file_align); + new_opt_hdr->header_size = aligned_headers_size; + + + uint32_t current_offset = aligned_headers_size; + /* + * If the original PE data_addr is covered by enlarged header_size + * re-assign new data_addr for all sections + */ + if (base_sections[0].data_addr < aligned_headers_size) { + for (i = 0; i < new_hdr->sections; i++) { + new_sections[i].data_addr = current_offset; + current_offset += ALIGN_UP(new_sections[i].raw_data_size, file_align); + } + /* Keep unchanged, just allocating file pointer for bpf section */ + } else { + uint32_t t; + i = new_hdr->sections - 2; + t = 
new_sections[i].data_addr + new_sections[i].raw_data_size; + i++; + new_sections[i].data_addr = ALIGN_UP(t, file_align); + } + + payload_new_offset = new_sections[payload_sect_idx].data_addr + payload_sect_off; + /* Update */ + new_zhdr->payload_offset = payload_new_offset; + new_zhdr->payload_size = zheader->payload_size; + dprintf("zboot payload_offset updated from 0x%x to 0x%x, size:0x%x\n", + zheader->payload_offset, payload_new_offset, new_zhdr->payload_size); + + + /* compose the new PE file */ + + /* Write Dos header */ + memcpy(out_start_addr, new_zhdr, sizeof(pe_zboot_header)); + out_cur = out_start_addr + pe_hdr_offset; + + /* Write PE header */ + memcpy(out_cur, new_hdr, sizeof(struct pe_hdr)); + out_cur += sizeof(struct pe_hdr); + + /* Write PE optional header */ + memcpy(out_cur, new_opt_hdr, new_hdr->opt_hdr_size); + out_cur += new_hdr->opt_hdr_size; + + /* Write all section headers */ + memcpy(out_cur, new_sections, new_hdr->sections * sizeof(struct section_header)); + + /* Skip padding and copy the section data */ + for (i = 0; i < pe_hdr->sections; i++) { + base_cur = base_start_addr + base_sections[i].data_addr; + out_cur = out_start_addr + new_sections[i].data_addr; + memcpy(out_cur, base_cur, base_sections[i].raw_data_size); + } + msync(out_start_addr, new_sections[i].data_addr + new_sections[i].raw_data_size, MS_ASYNC); + /* For the bpf section */ + out_cur = out_start_addr + new_sections[i].data_addr; + + /* Write .bpf section data */ + char *bin_data = calloc(1, bin_size); + if (!bin_data) { + perror("Failed to allocate memory for binary data"); + free(new_sections); + ret = -1; + goto err; + } + if (fread(bin_data, bin_size, 1, bin_fp) != 1) { + perror("Failed to read binary data"); + free(bin_data); + free(new_sections); + ret = -1; + goto err; + } + + if (out_cur + bin_size > out_start_addr + out_sz) { + perror("out of out_fd mmap\n"); + ret = -1; + goto err; + } + memcpy(out_cur, bin_data, 
bin_size); + /* calculate the real size */ + out_sz = out_cur + bin_size - out_start_addr; + msync(out_start_addr, out_sz, MS_ASYNC); + /* truncate to the real size */ + if (ftruncate(out_fd, out_sz) == -1) { + perror("Failed to resize output file"); + ret = -1; + goto err; + } + printf("Create a new PE file with bpf section: %s\n", new_pe); +err: + munmap(out_start_addr, out_sz); + munmap(base_start_addr, sb.st_size); + close(base_fd); + close(out_fd); + close(bpf_fd); + + return ret; +} diff --git a/tools/kexec/zboot_parser_bpf.c b/tools/kexec/zboot_parser_bpf.c new file mode 100644 index 0000000000000..e60621780a1a9 --- /dev/null +++ b/tools/kexec/zboot_parser_bpf.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +// +#include "vmlinux.h" +#include +#include +#include "image_size.h" + +/* uncompressed vmlinux.bin plus 4KB */ +#define MAX_RECORD_SIZE (IMAGE_SIZE + 4096) +/* ringbuf 2,3,4 are useless */ +#define MIN_BUF_SIZE 1 + +#define KEXEC_RES_KERNEL_NAME "kexec:kernel" +#define KEXEC_RES_INITRD_NAME "kexec:initrd" +#define KEXEC_RES_CMDLINE_NAME "kexec:cmdline" + +/* ringbuf is safe since the user space has no write access to them */ +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, RINGBUF1_SIZE); +} ringbuf_1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, MIN_BUF_SIZE); +} ringbuf_2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, MIN_BUF_SIZE); +} ringbuf_3 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, MIN_BUF_SIZE); +} ringbuf_4 SEC(".maps"); + +char LICENSE[] SEC("license") = "GPL"; + +/* + * This function ensures that the sections .rodata, .data .bss and .rodata.str1.1 + * are created for a bpf prog. 
+ */ +__attribute__((used)) static int dummy(void) +{ + static const char res_kernel[16] __attribute__((used, section(".rodata"))) = KEXEC_RES_KERNEL_NAME; + static char local_name[16] __attribute__((used, section(".data"))) = KEXEC_RES_CMDLINE_NAME; + static char res_cmdline[16] __attribute__((used, section(".bss"))); + + __builtin_memcpy(local_name, KEXEC_RES_INITRD_NAME, 16); + return __builtin_memcmp(local_name, res_kernel, 4); +} + +extern int bpf_copy_to_kernel(const char *name, char *buf, int size) __weak __ksym; +extern struct mem_range_result *bpf_decompress(char *image_gz_payload, int image_gz_sz) __weak __ksym; +extern int bpf_mem_range_result_put(struct mem_range_result *result) __weak __ksym; + + + + +/* see drivers/firmware/efi/libstub/zboot-header.S */ +struct linux_pe_zboot_header { + unsigned int mz_magic; + char image_type[4]; + unsigned int payload_offset; + unsigned int payload_size; + unsigned int reserved[2]; + char comp_type[4]; + unsigned int linux_pe_magic; + unsigned int pe_header_offset; +} __attribute__((packed)); + + +SEC("fentry.s/bpf_handle_pefile") +int BPF_PROG(parse_pe, struct kexec_context *context) +{ + struct linux_pe_zboot_header *zboot_header; + unsigned int image_sz; + char *buf; + char local_name[32]; + + bpf_printk("begin parse PE\n"); + /* BPF verifier should know each variable initial state */ + if (!context->image || (context->image_sz > MAX_RECORD_SIZE)) { + bpf_printk("Err: image size is greater than 0x%lx\n", MAX_RECORD_SIZE); + return 0; + } + + /* In order to access bytes not aligned on 2 order, copy into ringbuf. + * And allocate the memory all at once, later overwriting. 
+ * + * R2 is ARG_CONST_ALLOC_SIZE_OR_ZERO, should be decided at compling time + */ + buf = (char *)bpf_ringbuf_reserve(&ringbuf_1, MAX_RECORD_SIZE, 0); + if (!buf) { + bpf_printk("Err: fail to reserve ringbuf to parse zboot header\n"); + return 0; + } + image_sz = context->image_sz; + bpf_probe_read((void *)buf, sizeof(struct linux_pe_zboot_header), context->image); + zboot_header = (struct linux_pe_zboot_header *)buf; + if (!!__builtin_memcmp(&zboot_header->image_type, "zimg", + sizeof(zboot_header->image_type))) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_printk("Err: image is not zboot image\n"); + return 0; + } + + unsigned int payload_offset = zboot_header->payload_offset; + unsigned int payload_size = zboot_header->payload_size; + bpf_printk("zboot image payload offset=0x%x, size=0x%x\n", payload_offset, payload_size); + /* sane check */ + if (payload_size > image_sz) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_printk("Invalid zboot image payload offset and size\n"); + return 0; + } + if (payload_size >= MAX_RECORD_SIZE ) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_printk("Err: payload_size > MAX_RECORD_SIZE\n"); + return 0; + } + /* Overwrite buf */ + bpf_probe_read((void *)buf, payload_size, context->image + payload_offset); + bpf_printk("Calling bpf_kexec_decompress()\n"); + struct mem_range_result *r = bpf_decompress(buf, payload_size - 4); + if (!r) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_printk("Err: fail to decompress\n"); + return 0; + } + + image_sz = r->data_sz; + if (image_sz > MAX_RECORD_SIZE) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_mem_range_result_put(r); + bpf_printk("Err: decompressed size too big\n"); + return 0; + } + + /* Since the decompressed size is bigger than original, no need to clean */ + bpf_probe_read((void *)buf, image_sz, r->buf); + bpf_printk("Calling bpf_copy_to_kernel(), image_sz=0x%x\n", image_sz); + /* Verifier is unhappy to expose .rodata.str1.1 'map' to kernel */ + 
__builtin_memcpy(local_name, KEXEC_RES_KERNEL_NAME, 32); + const char *res_name = local_name; + bpf_copy_to_kernel(res_name, buf, image_sz); + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_mem_range_result_put(r); + + return 0; +} + +SEC("fentry.s/bpf_post_handle_pefile") +int BPF_PROG(post_parse_pe, struct kexec_context *context) +{ + return 0; +}