diff --git a/.github/workflows/test-libbpf.yml b/.github/workflows/test-libbpf.yml index af7a8346..e2075375 100644 --- a/.github/workflows/test-libbpf.yml +++ b/.github/workflows/test-libbpf.yml @@ -62,6 +62,13 @@ jobs: make -C src/30-sslsniff sudo timeout -s 2 3 src/30-sslsniff/sslsniff || if [ $? = 124 ]; then exit 0; else exit $?; fi + - name: test 32 wallclock-profiler + run: | + make -C src/32-wallclock-profiler + # Test individual tools can be built and show help + src/32-wallclock-profiler/oncputime --help || true + src/32-wallclock-profiler/offcputime --help || true + - name: test 33 funclatency run: | make -C src/33-funclatency diff --git a/src/16-memleak/.gitignore b/src/16-memleak/.gitignore index 3bbbd457..23979aa4 100644 --- a/src/16-memleak/.gitignore +++ b/src/16-memleak/.gitignore @@ -6,3 +6,5 @@ package.json package.yaml ecli memleak +test_memleak +.output/ diff --git a/src/16-memleak/Makefile b/src/16-memleak/Makefile index 0c220924..0c0cd92f 100644 --- a/src/16-memleak/Makefile +++ b/src/16-memleak/Makefile @@ -25,6 +25,7 @@ CFLAGS := -g -Wall ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) APPS = # minimal minimal_legacy bootstrap uprobe kprobe fentry usdt sockfilter tc ksyscall +TEST_APPS = test_memleak CARGO ?= $(shell which cargo) ifeq ($(strip $(CARGO)),) @@ -69,12 +70,12 @@ $(call allow-override,CC,$(CROSS_COMPILE)cc) $(call allow-override,LD,$(CROSS_COMPILE)ld) .PHONY: all -all: $(APPS) +all: $(APPS) $(TEST_APPS) .PHONY: clean clean: $(call msg,CLEAN) - $(Q)rm -rf $(OUTPUT) $(APPS) + $(Q)rm -rf $(OUTPUT) $(APPS) $(TEST_APPS) $(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT): $(call msg,MKDIR,$@) @@ -94,16 +95,16 @@ $(BPFTOOL): | $(BPFTOOL_OUTPUT) $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap -$(LIBBLAZESYM_SRC)/target/release/libblazesym.a:: - $(Q)cd $(LIBBLAZESYM_SRC) && RUSTFLAGS="-A dangerous_implicit_autorefs" $(CARGO) build --features=cheader,dont-generate-test-files --release +$(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a:: + $(Q)cd $(LIBBLAZESYM_SRC)/capi && $(CARGO) build --release -$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT) +$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a | $(OUTPUT) $(call msg,LIB, $@) - $(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@ + $(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a $@ -$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT) +$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a | $(OUTPUT) $(call msg,LIB,$@) - $(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@ + $(Q)cp $(LIBBLAZESYM_SRC)/capi/include/blazesym.h $@ # Build BPF code $(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL) @@ -137,5 +138,10 @@ $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) # delete failed targets .DELETE_ON_ERROR: +# Test program (no BPF, just plain C) +test_memleak: test_memleak.c + $(call msg,TEST,$@) + $(Q)$(CC) $(CFLAGS) $< -o $@ + # keep intermediate (.skel.h, .bpf.o, etc) targets .SECONDARY: diff --git a/src/16-memleak/README.md b/src/16-memleak/README.md index 03d34941..5530e6ae 100644 --- a/src/16-memleak/README.md +++ b/src/16-memleak/README.md @@ -419,6 +419,8 @@ Reference: ## Compile and Run +This implementation uses the latest **blazesym v0.2.0** library for symbol resolution, which provides improved performance and a modern C API through the `capi` package. 
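+To make the migration concrete, here is a minimal sketch of that C API lifecycle, using only calls that appear in the updated `memleak.c` below (`blaze_symbolizer_new`, `blaze_symbolize_process_abs_addrs`, `blaze_syms_free`, `blaze_symbolizer_free`); the PID and address are placeholders and error handling is trimmed, so treat this as an illustration rather than the tool's actual code:
+
+```c
+#include <stdint.h>
+#include <stdio.h>
+#include "blazesym.h"
+
+int main(void)
+{
+	blaze_symbolizer *symbolizer = blaze_symbolizer_new();
+	const struct blaze_symbolize_src_process src = {
+		.type_size = sizeof(src),
+		.pid = 1234,                     /* hypothetical target PID */
+	};
+	uint64_t addrs[] = { 0x5a7ea0a34212 };   /* placeholder address */
+
+	/* Symbolize absolute addresses in the target process */
+	const struct blaze_syms *syms =
+		blaze_symbolize_process_abs_addrs(symbolizer, &src, addrs, 1);
+	if (syms && syms->cnt && syms->syms[0].name)
+		printf("%s+0x%zx\n", syms->syms[0].name, syms->syms[0].offset);
+
+	blaze_syms_free(syms);
+	blaze_symbolizer_free(symbolizer);
+	return 0;
+}
+```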
+ ```console $ make $ sudo ./memleak @@ -438,6 +440,35 @@ Tracing outstanding memory allocs... Hit Ctrl-C to end ... ``` +## Testing memleak + +This repository includes a test program (`test_memleak.c`) that intentionally leaks memory for testing purposes. You can build and run it to verify that memleak correctly detects memory leaks: + +```console +$ make test_memleak +$ ./test_memleak & +$ sudo ./memleak -p $(pidof test_memleak) +using default object: libc.so.6 +using page size: 4096 +tracing kernel: false +Tracing outstanding memory allocs... Hit Ctrl-C to end +[19:31:49] Top 1 stacks with outstanding allocations: +10240 bytes in 5 allocations from stack + 0 [<00005a7ea0a34212>] leak_with_loop+0x1f + 1 [<00005a7ea0a3428e>] main+0x6e + 2 [<00007b4ea482a1ca>] + 3 [<00007b4ea482a28b>] __libc_start_main+0x8b + 4 [<00005a7ea0a34105>] _start+0x25 +``` + +As shown above, memleak successfully: +- Detected **10240 bytes in 5 allocations** (5 × 2048 bytes) that were leaked +- Identified the leaking function **leak_with_loop+0x1f** with the correct offset +- Provided the complete call stack showing the leak originated from `main` function +- Used the new **blazesym v0.2.0** C API for fast and accurate symbol resolution + +The test demonstrates that memleak with the updated blazesym library can effectively trace memory allocations and pinpoint the exact functions responsible for memory leaks. + ## Summary Through this eBPF introductory tutorial, you have learned how to write a Memleak eBPF monitoring program to monitor memory leaks in real time. You have also learned about the application of eBPF in memory monitoring, how to write eBPF programs using the BPF API, create and use eBPF maps, and how to use eBPF tools to monitor and analyze memory leak issues. We have provided a detailed example to help you understand the execution flow and principles of eBPF code. diff --git a/src/16-memleak/README.zh.md b/src/16-memleak/README.zh.md index d049480e..6d781435 100644 --- a/src/16-memleak/README.zh.md +++ b/src/16-memleak/README.zh.md @@ -459,6 +459,8 @@ int attach_uprobes(struct memleak_bpf *skel) ## 编译运行 +本实现使用最新的 **blazesym v0.2.0** 库进行符号解析,该库通过 `capi` 包提供了改进的性能和现代化的 C API。 + ```console $ make $ sudo ./memleak @@ -478,6 +480,35 @@ Tracing outstanding memory allocs... Hit Ctrl-C to end ... ``` +## 测试 memleak + +本仓库包含一个测试程序(`test_memleak.c`),该程序会故意泄漏内存以供测试使用。您可以编译并运行它来验证 memleak 是否能正确检测内存泄漏: + +```console +$ make test_memleak +$ ./test_memleak & +$ sudo ./memleak -p $(pidof test_memleak) +using default object: libc.so.6 +using page size: 4096 +tracing kernel: false +Tracing outstanding memory allocs... 
Hit Ctrl-C to end +[19:31:49] Top 1 stacks with outstanding allocations: +10240 bytes in 5 allocations from stack + 0 [<00005a7ea0a34212>] leak_with_loop+0x1f + 1 [<00005a7ea0a3428e>] main+0x6e + 2 [<00007b4ea482a1ca>] + 3 [<00007b4ea482a28b>] __libc_start_main+0x8b + 4 [<00005a7ea0a34105>] _start+0x25 +``` + +如上所示,memleak 成功地: +- 检测到 **10240 字节的 5 次分配**(5 × 2048 字节)发生了泄漏 +- 识别出泄漏函数 **leak_with_loop+0x1f** 及其正确的偏移量 +- 提供了完整的调用栈,显示泄漏源自 `main` 函数 +- 使用新的 **blazesym v0.2.0** C API 进行快速准确的符号解析 + +该测试演示了更新 blazesym 库后的 memleak 可以有效地跟踪内存分配,并精确定位导致内存泄漏的函数。 + ## 总结 通过本篇 eBPF 入门实践教程,您已经学习了如何编写 Memleak eBPF 监控程序,以实时监控程序的内存泄漏。您已经了解了 eBPF 在内存监控方面的应用,学会了使用 BPF API 编写 eBPF 程序,创建和使用 eBPF maps,并且明白了如何用 eBPF 工具监测和分析内存泄漏问题。我们展示了一个详细的例子,帮助您理解 eBPF 代码的运行流程和原理。 diff --git a/src/16-memleak/memleak.c b/src/16-memleak/memleak.c index fa47b879..72e86e63 100644 --- a/src/16-memleak/memleak.c +++ b/src/16-memleak/memleak.c @@ -133,7 +133,7 @@ static int event_notify(int fd, uint64_t event); static pid_t fork_sync_exec(const char *command, int fd); -static void print_stack_frame_by_blazesym(size_t frame, uint64_t addr, const blazesym_csym *sym); +static void print_stack_frame_by_blazesym(size_t frame, uint64_t addr, const struct blaze_sym *sym); static void print_stack_frames_by_blazesym(); static int print_stack_frames(struct allocation *allocs, size_t nr_allocs, int stack_traces_fd); @@ -205,8 +205,7 @@ static struct sigaction sig_action = { static int child_exec_event_fd = -1; -static blazesym *symbolizer; -static sym_src_cfg src_cfg; +static blaze_symbolizer *symbolizer; static void (*print_stack_frames_func)(); static uint64_t *stack; @@ -302,14 +301,8 @@ int main(int argc, char *argv[]) goto cleanup; } - if (env.pid < 0) { - src_cfg.src_type = SRC_T_KERNEL; - src_cfg.params.kernel.kallsyms = NULL; - src_cfg.params.kernel.kernel_image = NULL; - } else { - src_cfg.src_type = SRC_T_PROCESS; - src_cfg.params.process.pid = env.pid; - } + // Symbolization source is now set up in print_stack_frames_by_blazesym() + // based on env.kernel_trace and env.pid // allocate space for storing "allocation" structs if (env.combined_only) @@ -395,9 +388,9 @@ int main(int argc, char *argv[]) } } - symbolizer = blazesym_new(); + symbolizer = blaze_symbolizer_new(); if (!symbolizer) { - fprintf(stderr, "Failed to load blazesym\n"); + fprintf(stderr, "Failed to create blazesym symbolizer\n"); ret = -ENOMEM; goto cleanup; @@ -439,7 +432,7 @@ int main(int argc, char *argv[]) } cleanup: - blazesym_free(symbolizer); + blaze_symbolizer_free(symbolizer); memleak_bpf__destroy(skel); free(allocs); @@ -636,54 +629,67 @@ pid_t fork_sync_exec(const char *command, int fd) return pid; } -void print_stack_frame_by_blazesym(size_t frame, uint64_t addr, const blazesym_csym *sym) +void print_stack_frame_by_blazesym(size_t frame, uint64_t addr, const struct blaze_sym *sym) { - if (!sym) + if (!sym || !sym->name) printf("\t%zu [<%016lx>] <%s>\n", frame, addr, "null sym"); - else if (sym->path && strlen(sym->path)) - printf("\t%zu [<%016lx>] %s+0x%lx %s:%ld\n", frame, addr, sym->symbol, addr - sym->start_address, sym->path, sym->line_no); + else if (sym->code_info.file && strlen(sym->code_info.file)) + printf("\t%zu [<%016lx>] %s+0x%zx %s:%u\n", frame, addr, sym->name, sym->offset, sym->code_info.file, sym->code_info.line); else - printf("\t%zu [<%016lx>] %s+0x%lx\n", frame, addr, sym->symbol, addr - sym->start_address); + printf("\t%zu [<%016lx>] %s+0x%zx\n", frame, addr, sym->name, sym->offset); } void print_stack_frames_by_blazesym() { - const 
blazesym_result *result = blazesym_symbolize(symbolizer, &src_cfg, 1, stack, env.perf_max_stack_depth);
+	const struct blaze_syms *syms;
+
+	// Choose symbolization source based on kernel_trace flag
+	if (env.kernel_trace) {
+		const struct blaze_symbolize_src_kernel src = {
+			.type_size = sizeof(src),
+			.kallsyms = NULL,
+			.vmlinux = NULL,
+			.debug_syms = false,
+			.reserved = {0},
+		};
+		syms = blaze_symbolize_kernel_abs_addrs(symbolizer, &src, stack, env.perf_max_stack_depth);
+	} else {
+		const struct blaze_symbolize_src_process src = {
+			.type_size = sizeof(src),
+			.pid = env.pid,
+		};
+		syms = blaze_symbolize_process_abs_addrs(symbolizer, &src, stack, env.perf_max_stack_depth);
+	}
+
+	if (!syms) {
+		fprintf(stderr, "Failed to symbolize addresses\n");
+		return;
+	}

-	for (size_t j = 0; j < result->size; ++j) {
+	for (size_t j = 0; j < syms->cnt; ++j) {
 		const uint64_t addr = stack[j];

 		if (addr == 0)
 			break;

-		// no symbol found
-		if (!result || j >= result->size || result->entries[j].size == 0) {
-			print_stack_frame_by_blazesym(j, addr, NULL);
+		const struct blaze_sym *sym = &syms->syms[j];

-			continue;
-		}
+		// Print main symbol
+		print_stack_frame_by_blazesym(j, addr, sym);

-		// single symbol found
-		if (result->entries[j].size == 1) {
-			const blazesym_csym *sym = &result->entries[j].syms[0];
-			print_stack_frame_by_blazesym(j, addr, sym);
-
-			continue;
-		}
-
-		// multi symbol found
-		printf("\t%zu [<%016lx>] (%lu entries)\n", j, addr, result->entries[j].size);
-
-		for (size_t k = 0; k < result->entries[j].size; ++k) {
-			const blazesym_csym *sym = &result->entries[j].syms[k];
-			if (sym->path && strlen(sym->path))
-				printf("\t\t%s@0x%lx %s:%ld\n", sym->symbol, sym->start_address, sym->path, sym->line_no);
-			else
-				printf("\t\t%s@0x%lx\n", sym->symbol, sym->start_address);
+		// Print inlined function calls if any
+		if (sym->inlined_cnt > 0) {
+			for (size_t k = 0; k < sym->inlined_cnt; ++k) {
+				const struct blaze_symbolize_inlined_fn *inlined = &sym->inlined[k];
+				if (inlined->code_info.file && strlen(inlined->code_info.file))
+					printf("\t\t[inlined] %s %s:%u\n", inlined->name, inlined->code_info.file, inlined->code_info.line);
+				else
+					printf("\t\t[inlined] %s\n", inlined->name);
+			}
 		}
 	}

-	blazesym_result_free(result);
+	blaze_syms_free(syms);
 }

 int print_stack_frames(struct allocation *allocs, size_t nr_allocs, int stack_traces_fd)
diff --git a/src/16-memleak/test_memleak.c b/src/16-memleak/test_memleak.c
new file mode 100644
index 00000000..2f755a0e
--- /dev/null
+++ b/src/16-memleak/test_memleak.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Test program for memleak - intentionally leaks memory for testing
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+void leak_small() {
+	malloc(1024);
+}
+
+void leak_large() {
+	malloc(8192);
+}
+
+void leak_with_loop() {
+	for (int i = 0; i < 5; i++) {
+		malloc(2048);
+	}
+}
+
+int main() {
+	printf("Memory leak test starting (PID: %d)\n", getpid());
+	printf("This program intentionally leaks memory for testing memleak\n");
+
+	// Wait a bit for memleak to attach
+	sleep(2);
+
+	// Create various leaks
+	leak_small();
+	sleep(1);
+
+	leak_large();
+	sleep(1);
+
+	leak_with_loop();
+	sleep(1);
+
+	// Keep running so memleak can observe
+	printf("Leaks created, sleeping for 30 seconds...\n");
+	sleep(30);
+
+	printf("Test complete\n");
+	return 0;
+}
diff --git a/src/32-wallclock-profiler/.config b/src/32-wallclock-profiler/.config
new file mode 100644
index 00000000..0cffb731
--- /dev/null
+++ b/src/32-wallclock-profiler/.config
@@ 
-0,0 +1 @@ +level=Depth diff --git a/src/32-wallclock-profiler/.gitignore b/src/32-wallclock-profiler/.gitignore new file mode 100644 index 00000000..3d996e88 --- /dev/null +++ b/src/32-wallclock-profiler/.gitignore @@ -0,0 +1,25 @@ +/.output +/bootstrap +/profile +/offcputime +/oncputime +res.txt +*.o +oncpu.txt +offcpu.txt +FlameGraph +pmumon +test_combined +pmumon_advanced +/demo_results.svg +demo_results.folded +test_program +combined_profile_pid*.svg +combined_profile_pid*.folded +combined_profile_pid*.txt +combined_flamegraph.pl +multithread_combined_profile_pid* +demo_results_single_thread_analysis.txt +test_multithread +perf.data* +double_bandwidth diff --git a/src/32-wallclock-profiler/Makefile b/src/32-wallclock-profiler/Makefile new file mode 100644 index 00000000..1a596574 --- /dev/null +++ b/src/32-wallclock-profiler/Makefile @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang +LIBBPF_SRC := $(abspath ../third_party/libbpf/src) +BPFTOOL_SRC := $(abspath ../third_party/bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool +LIBBLAZESYM_SRC := $(abspath ../third_party/blazesym/) +LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a) +LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h) +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../third_party/vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../third_party/libbpf/include/uapi -I$(dir $(VMLINUX)) +CFLAGS := -g -Wall -fno-omit-frame-pointer +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = offcputime oncputime + +# Test programs +TEST_PROGS = tests/test_combined tests/test_multithread tests/double_bandwidth + +# Enable blazesym with new capi +CARGO ?= $(shell which cargo) +ifeq ($(strip $(CARGO)),) +BZS_APPS := +else +BZS_APPS := offcputime oncputime +# Required by libblazesym +ALL_LDFLAGS += -lrt -ldl -lpthread -lm +endif + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+ifeq ($(V),1)
+	Q =
+	msg =
+else
+	Q = @
+	msg = @printf '  %-8s %s%s\n' \
+		"$(1)" \
+		"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
+		"$(if $(3), $(3))";
+	MAKEFLAGS += --no-print-directory
+endif
+
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+$(call allow-override,CC,$(CROSS_COMPILE)cc)
+$(call allow-override,LD,$(CROSS_COMPILE)ld)
+
+.PHONY: all
+all: $(APPS) $(TEST_PROGS)
+
+.PHONY: tests
+tests: $(TEST_PROGS)
+
+.PHONY: all-with-tests
+all-with-tests: all tests
+
+.PHONY: clean
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS) $(TEST_PROGS)
+
+$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+# Build libbpf
+$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
+	$(call msg,LIB,$@)
+	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
+		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
+		    INCLUDEDIR= LIBDIR= UAPIDIR= \
+		    install
+
+# Build bpftool
+$(BPFTOOL): | $(BPFTOOL_OUTPUT)
+	$(call msg,BPFTOOL,$@)
+	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
+
+
+$(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a::
+	$(Q)cd $(LIBBLAZESYM_SRC)/capi && $(CARGO) build --release
+
+$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a | $(OUTPUT)
+	$(call msg,LIB, $@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a $@
+
+$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a | $(OUTPUT)
+	$(call msg,LIB,$@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/capi/include/blazesym.h $@
+
+# Build BPF code
+$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
+		     $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
+		     -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+
+# Generate BPF skeletons
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build user-space code
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+
+$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
+
+$(BZS_APPS): $(LIBBLAZESYM_OBJ)
+
+# Build application binary
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
+
+# Build test programs
+tests/test_combined: tests/test_combined.c
+	$(call msg,TEST,$@)
+	$(Q)$(CC) $(CFLAGS) $< -o $@ -lpthread
+
+tests/test_multithread: tests/test_multithread.c
+	$(call msg,TEST,$@)
+	$(Q)$(CC) $(CFLAGS) $< -o $@ -lpthread -lm
+
+tests/double_bandwidth: tests/double_bandwidth.c
+	$(call msg,TEST,$@)
+	$(Q)$(CC) $(CFLAGS) -D_GNU_SOURCE -std=c11 -O3 $< -o $@ -lpthread -lm
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/src/32-wallclock-profiler/README.md b/src/32-wallclock-profiler/README.md
new file mode 100644
index 00000000..faad42d9
--- /dev/null
+++ b/src/32-wallclock-profiler/README.md
@@ -0,0 +1,364 @@
+# eBPF Tutorial by Example 32: Wall Clock Profiling with Combined On-CPU and Off-CPU Analysis
+
+Performance bottlenecks can hide in two very different places. Your code might be burning CPU cycles in hot loops, or it might be sitting idle waiting for I/O, network responses, or lock contention. Traditional profilers often focus on just one side of this story. But what if you could see both at once?
+
+This tutorial introduces a complete wall clock profiling solution that combines on-CPU and off-CPU analysis using eBPF. We'll show you how to capture the full picture of where your application spends its time, using two complementary eBPF programs that work together to account for every microsecond of execution. Whether your performance problems come from computation or waiting, you'll be able to spot them in a unified flame graph view.
+
+> The complete source code: <https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/32-wallclock-profiler>
+
+## Understanding Wall Clock Profiling
+
+Wall clock time is the actual elapsed time from start to finish, like checking a stopwatch. For any running process, this time gets divided into two categories. On-CPU time is when your code actively executes on a processor, doing real work. Off-CPU time is when your process exists but isn't running, waiting for something like disk I/O, network packets, or acquiring a lock.
+
+Traditional CPU profilers only show you the on-CPU story. They sample the stack at regular intervals when your code runs, building a picture of which functions consume CPU cycles. But these profilers are blind to off-CPU time. When your thread blocks on a system call or waits for a mutex, the profiler stops seeing it. This creates a massive blind spot for applications that spend significant time waiting.
+
+Off-CPU profilers flip the problem around. They track when threads go to sleep and wake up, measuring blocked time and capturing stack traces at blocking points. This reveals I/O bottlenecks and lock contention. But they miss pure computation problems.
+
+The tools in this tutorial solve both problems by running two eBPF programs simultaneously. The `oncputime` tool samples on-CPU execution using perf events. The `offcputime` tool hooks into the kernel scheduler to catch blocking operations. A Python script combines the results, normalizing the time scales so you can see CPU-intensive code paths (marked red) and blocking operations (marked blue) in the same flame graph. This complete view shows where every microsecond goes.
+
+Here's an example flame graph showing combined on-CPU and off-CPU profiling results:
+
+![Combined Wall Clock Flame Graph Example](https://github.com/eunomia-bpf/bpf-developer-tutorial/blob/main/src/32-wallclock-profiler/tests/example.svg)
+
+In this visualization, you can clearly see the distinction between CPU-intensive work (shown in red/warm colors marked with `_[c]`) and blocking operations (shown in blue/cool colors marked with `_[o]`). The relative widths immediately reveal where your application spends its wall clock time.
+
+## The Tools: oncputime and offcputime
+
+This tutorial provides two complementary profiling tools. The `oncputime` tool samples your process at regular intervals using perf events, capturing stack traces when code actively runs on the CPU. At a default rate of 49 Hz, it wakes up roughly every 20 milliseconds to record where your program is executing. Higher sample counts in the output indicate more CPU time spent in those code paths.
+
+The `offcputime` tool takes a different approach. It hooks into the kernel scheduler's context switch mechanism, specifically the `sched_switch` tracepoint. 
When your thread goes off-CPU, the tool records a timestamp and captures the stack trace showing why it blocked. When the thread returns to running, it calculates how long the thread was sleeping. This directly measures I/O waits, lock contention, and other blocking operations in microseconds.
+
+Both tools use BPF stack maps to efficiently capture kernel and user space call chains with minimal overhead. They aggregate results by unique stack traces, so repeated execution of the same code path gets summed together. The tools can filter by process ID, thread ID, and various other criteria to focus analysis on specific parts of your application.
+
+## Implementation: Kernel-Space eBPF Programs
+
+Let's examine how these tools work at the eBPF level. We'll start with the on-CPU profiler, then look at the off-CPU profiler, and see how they complement each other.
+
+### On-CPU Profiling with oncputime
+
+The on-CPU profiler uses perf events to sample execution at regular time intervals. Here's the complete eBPF program:
+
+```c
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+#include "oncputime.h"
+
+const volatile bool kernel_stacks_only = false;
+const volatile bool user_stacks_only = false;
+const volatile bool include_idle = false;
+const volatile bool filter_by_pid = false;
+const volatile bool filter_by_tid = false;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__type(key, u32);
+} stackmap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct key_t);
+	__type(value, u64);
+	__uint(max_entries, MAX_ENTRIES);
+} counts SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u32);
+	__type(value, u8);
+	__uint(max_entries, MAX_PID_NR);
+} pids SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u32);
+	__type(value, u8);
+	__uint(max_entries, MAX_TID_NR);
+} tids SEC(".maps");
+
+SEC("perf_event")
+int do_perf_event(struct bpf_perf_event_data *ctx)
+{
+	u64 *valp;
+	static const u64 zero;
+	struct key_t key = {};
+	u64 id;
+	u32 pid;
+	u32 tid;
+
+	id = bpf_get_current_pid_tgid();
+	pid = id >> 32;
+	tid = id;
+
+	if (!include_idle && tid == 0)
+		return 0;
+
+	if (filter_by_pid && !bpf_map_lookup_elem(&pids, &pid))
+		return 0;
+
+	if (filter_by_tid && !bpf_map_lookup_elem(&tids, &tid))
+		return 0;
+
+	key.pid = pid;
+	bpf_get_current_comm(&key.name, sizeof(key.name));
+
+	if (user_stacks_only)
+		key.kern_stack_id = -1;
+	else
+		key.kern_stack_id = bpf_get_stackid(&ctx->regs, &stackmap, 0);
+
+	if (kernel_stacks_only)
+		key.user_stack_id = -1;
+	else
+		key.user_stack_id = bpf_get_stackid(&ctx->regs, &stackmap,
+						    BPF_F_USER_STACK);
+
+	valp = bpf_map_lookup_or_try_init(&counts, &key, &zero);
+	if (valp)
+		__sync_fetch_and_add(valp, 1);
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
+```
+
+The program starts by defining several BPF maps. The `stackmap` is a special map type for storing stack traces. When you call `bpf_get_stackid()`, the kernel walks the stack and stores the instruction pointers in this map, returning an ID you can use to look it up later. The `counts` map aggregates samples by a composite key that includes both the process ID and the stack IDs. The `pids` and `tids` maps act as filters, letting you restrict profiling to specific processes or threads.
+
+The main logic lives in the `do_perf_event()` function, which runs every time a perf event fires.
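+For context, the sampling side is driven from user space: the loader opens one sampling perf event per CPU and attaches this program to each. Here is a minimal sketch using the standard `perf_event_open(2)` syscall and the libbpf `bpf_program__attach_perf_event()` API; the helper name and the stripped-down error handling are illustrative, not the tutorial's exact code:
+
+```c
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <bpf/libbpf.h>
+
+/* Hypothetical helper: one sampling event per CPU, program attached to each */
+static int attach_perf_events(struct bpf_program *prog, int freq, int nr_cpus)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_CPU_CLOCK,
+		.size = sizeof(attr),
+		.sample_freq = freq,   /* e.g. 49 Hz */
+		.freq = 1,             /* sample by frequency, not period */
+	};
+
+	for (int cpu = 0; cpu < nr_cpus; cpu++) {
+		/* pid = -1, per-CPU event: sample whatever task runs there */
+		int fd = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0);
+		if (fd < 0)
+			return -1;
+		if (!bpf_program__attach_perf_event(prog, fd))
+			return -1;
+	}
+	return 0;
+}
+```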
+The user space program sets up these perf events at a specific frequency (default 49 Hz), one per CPU core. When a CPU triggers its timer, `do_perf_event()` executes on whatever process happens to be running at that moment. It first extracts the process and thread IDs from the current task, then applies any configured filters. If the current thread should be sampled, it builds a key structure that includes the process name and stack traces.
+
+The two calls to `bpf_get_stackid()` capture different pieces of the execution context. The first call without flags gets the kernel stack, showing what kernel functions were active. The second call with `BPF_F_USER_STACK` gets the user space stack, showing your application's function calls. These stack IDs go into the key, and the program increments a counter for that unique combination. Over time, hot code paths get sampled more frequently, building up higher counts.
+
+### Off-CPU Profiling with offcputime
+
+The off-CPU profiler hooks into the scheduler to measure blocking time. Here's the core of the eBPF program (the `tgids`/`pids` filter maps and the `get_task_state()` helper it references are defined in the full source but omitted here for brevity):
+
+```c
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+#include "offcputime.h"
+
+#define PF_KTHREAD 0x00200000
+
+const volatile bool kernel_threads_only = false;
+const volatile bool user_threads_only = false;
+const volatile __u64 max_block_ns = -1;
+const volatile __u64 min_block_ns = 0;
+const volatile bool filter_by_tgid = false;
+const volatile bool filter_by_pid = false;
+const volatile long state = -1;
+
+struct internal_key {
+	u64 start_ts;
+	struct key_t key;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u32);
+	__type(value, struct internal_key);
+	__uint(max_entries, MAX_ENTRIES);
+} start SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(key_size, sizeof(u32));
+} stackmap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct key_t);
+	__type(value, struct val_t);
+	__uint(max_entries, MAX_ENTRIES);
+} info SEC(".maps");
+
+static bool allow_record(struct task_struct *t)
+{
+	u32 tgid = BPF_CORE_READ(t, tgid);
+	u32 pid = BPF_CORE_READ(t, pid);
+
+	if (filter_by_tgid && !bpf_map_lookup_elem(&tgids, &tgid))
+		return false;
+	if (filter_by_pid && !bpf_map_lookup_elem(&pids, &pid))
+		return false;
+	if (user_threads_only && (BPF_CORE_READ(t, flags) & PF_KTHREAD))
+		return false;
+	else if (kernel_threads_only && !(BPF_CORE_READ(t, flags) & PF_KTHREAD))
+		return false;
+	if (state != -1 && get_task_state(t) != state)
+		return false;
+	return true;
+}
+
+static int handle_sched_switch(void *ctx, bool preempt, struct task_struct *prev, struct task_struct *next)
+{
+	struct internal_key *i_keyp, i_key;
+	struct val_t *valp, val;
+	s64 delta;
+	u32 pid;
+
+	if (allow_record(prev)) {
+		pid = BPF_CORE_READ(prev, pid);
+		if (!pid)
+			pid = bpf_get_smp_processor_id();
+		i_key.key.pid = pid;
+		i_key.key.tgid = BPF_CORE_READ(prev, tgid);
+		i_key.start_ts = bpf_ktime_get_ns();
+
+		if (BPF_CORE_READ(prev, flags) & PF_KTHREAD)
+			i_key.key.user_stack_id = -1;
+		else
+			i_key.key.user_stack_id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
+		i_key.key.kern_stack_id = bpf_get_stackid(ctx, &stackmap, 0);
+		bpf_map_update_elem(&start, &pid, &i_key, 0);
+		bpf_probe_read_kernel_str(&val.comm, sizeof(prev->comm), BPF_CORE_READ(prev, comm));
+		val.delta = 0;
+		bpf_map_update_elem(&info, &i_key.key, &val, BPF_NOEXIST);
+	}
+
+	pid = BPF_CORE_READ(next, pid);
+	i_keyp = bpf_map_lookup_elem(&start, &pid);
+	if (!i_keyp)
+		return 0;
+	delta = 
(s64)(bpf_ktime_get_ns() - i_keyp->start_ts); + if (delta < 0) + goto cleanup; + if (delta < min_block_ns || delta > max_block_ns) + goto cleanup; + delta /= 1000U; + valp = bpf_map_lookup_elem(&info, &i_keyp->key); + if (!valp) + goto cleanup; + __sync_fetch_and_add(&valp->delta, delta); + +cleanup: + bpf_map_delete_elem(&start, &pid); + return 0; +} + +SEC("tp_btf/sched_switch") +int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next) +{ + return handle_sched_switch(ctx, preempt, prev, next); +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +The off-CPU profiler is more complex because it needs to track timing across multiple events. The `start` map stores timestamps and stack information for threads that go off-CPU. When a thread blocks, we record when it happened and why (the stack trace). When that same thread returns to running, we calculate how long it was blocked. + +The scheduler switch happens many times per second on a busy system, so performance matters. The `allow_record()` function quickly filters out threads we don't care about before doing expensive operations. If a thread passes the filter, the program captures the current timestamp using `bpf_ktime_get_ns()` and records the stack traces showing where the thread blocked. + +The key insight is in the two-stage approach. The `prev` task (the thread going off-CPU) gets its blocking point recorded with a timestamp. When the scheduler later switches to the `next` task (a thread waking up), we look up whether we previously recorded this thread going to sleep. If we find a record, we calculate the delta between now and when it went to sleep. This delta is the off-CPU time in nanoseconds, which we convert to microseconds and add to the accumulated total for that stack trace. + +### User-Space Programs: Loading and Processing + +Both tools follow a similar pattern in user space. They use libbpf to load the compiled eBPF object file and attach it to the appropriate event. For `oncputime`, this means setting up perf events at the desired sampling frequency. For `offcputime`, it means attaching to the scheduler tracepoint. The user space programs then periodically read the BPF maps, resolve the stack IDs to actual function names using symbol tables, and format the output. + +The symbol resolution is handled by the blazesym library, which parses DWARF debug information from binaries. When you see a stack trace with function names and line numbers, that's blazesym converting raw instruction pointer addresses into human-readable form. The user space programs output in "folded" format, where each line contains a semicolon-separated stack trace followed by a count or time value. This format feeds directly into flame graph generation tools. + +## Combining On-CPU and Off-CPU Profiles + +The real power comes from running both tools together and merging their results. The `wallclock_profiler.py` script orchestrates this process. It launches both profilers simultaneously on the target process, waits for them to complete, and then combines their outputs. + +The challenge is that the two tools measure different things in different units. The on-CPU profiler counts samples (49 per second by default), while the off-CPU profiler measures microseconds. To create a unified view, the script normalizes the off-CPU time to equivalent sample counts. If sampling at 49 Hz, each sample represents about 20,408 microseconds of potential execution time. 
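+A sketch of that conversion, assuming the default 49 Hz rate (the real logic lives in `wallclock_profiler.py`; the function and variable names here are illustrative):
+
+```c
+/* One on-CPU sample at freq_hz stands in for roughly 1e6/freq_hz
+ * microseconds of wall clock time, so measured off-CPU microseconds
+ * can be expressed as "equivalent samples" and merged with on-CPU
+ * sample counts on a common scale. */
+static double offcpu_us_to_samples(double offcpu_us, double freq_hz)
+{
+	double us_per_sample = 1000000.0 / freq_hz;  /* ~20408 us at 49 Hz */
+	return offcpu_us / us_per_sample;
+}
+/* Example: offcpu_us_to_samples(500000, 49) ~= 24.5 equivalent samples. */
+```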
The script divides off-CPU microseconds by this value to get equivalent samples.
+
+After normalization, the script adds annotations to distinguish the two types of time. On-CPU stack traces get a `_[c]` suffix (for compute), while off-CPU stacks get `_[o]` (for off-CPU or blocking). A custom color palette in the flame graph tool renders these in different colors: red for CPU time and blue for blocking time. The result is a single flame graph where you can see both types of activity and their relative magnitudes.
+
+The script also handles multi-threaded applications by profiling each thread separately. It detects threads at startup, launches parallel profiling sessions for each one, and generates individual flame graphs showing per-thread behavior. This helps identify which threads are busy versus idle, and whether your parallelism is effective.
+
+## Compilation and Execution
+
+Building the tools requires a standard eBPF development environment. The tutorial repository includes all dependencies in the `src/third_party/` directory. To build:
+
+```bash
+cd src/32-wallclock-profiler
+make
+```
+
+The Makefile compiles the eBPF C code with clang, generates skeletons with bpftool, builds the blazesym symbol resolver, and links everything with libbpf to create the final executables.
+
+To use the individual tools:
+
+```bash
+# Profile on-CPU execution for 30 seconds
+sudo ./oncputime -p <PID> -F 99 30
+
+# Profile off-CPU blocking for 30 seconds
+sudo ./offcputime -p <PID> -m 1000 30
+
+# Use the combined profiler (recommended)
+sudo python3 wallclock_profiler.py <PID> -d 30 -f 99
+```
+
+Let's try profiling a test program that does both CPU work and blocking I/O:
+
+```bash
+# Build and run the test program
+cd tests
+make
+./test_combined &
+TEST_PID=$!
+
+# Profile it with the combined profiler
+cd ..
+sudo python3 wallclock_profiler.py $TEST_PID -d 30
+
+# This generates:
+# - combined_profile_pid_<PID>.folded (raw data)
+# - combined_profile_pid_<PID>.svg (flame graph)
+# - combined_profile_pid_<PID>_single_thread_analysis.txt (time breakdown)
+```
+
+The output flame graph will show red frames for the `cpu_work()` function consuming CPU time, and blue frames for the `blocking_work()` function spending time in sleep. The relative widths show how much wall clock time each consumes.
+
+For multi-threaded applications, the profiler creates a directory with per-thread results:
+
+```bash
+# Profile a multi-threaded application
+sudo python3 wallclock_profiler.py <PID> -d 30
+
+# Output in multithread_combined_profile_pid_<PID>/
+# - thread_<TID>_main.svg (main thread flame graph)
+# - thread_<TID>_<NAME>.svg (worker thread flame graphs)
+# - *_thread_analysis.txt (time analysis for all threads)
+```
+
+The analysis files show time accounting, letting you verify that on-CPU plus off-CPU time adds up correctly to the wall clock profiling duration. Coverage percentages help identify if threads are mostly idle or if you're missing data.
+
+## Interpreting the Results
+
+When you open the flame graph SVG in a browser, each horizontal box represents a function in a stack trace. The width shows how much time was spent there. Boxes stacked vertically show the call chain, with lower boxes calling higher ones. Red boxes indicate on-CPU time, blue boxes show off-CPU time.
+
+Look for wide red sections to find CPU bottlenecks. These are functions burning through cycles in tight loops or expensive algorithms. Wide blue sections indicate blocking operations. Common patterns include file I/O (read/write system calls), network operations (recv/send), and lock contention (futex calls).
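+To make the annotation concrete, here are a few hypothetical lines of the merged folded output (frames separated by semicolons, normalized count last; the function names are invented for illustration, and the exact suffix placement may differ in the script's real output):
+
+```
+test_combined;main;cpu_work_[c] 980
+test_combined;main;blocking_work;nanosleep_[o] 1470
+```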
+The flame graph is interactive. Click any box to zoom in and see details about that subtree. The search function lets you highlight all frames matching a pattern, useful for finding specific functions or libraries. Hovering shows the full function name and exact sample count or time value.
+
+Pay attention to the relative proportions. An application that's 90% blue is I/O bound and probably won't benefit much from CPU optimization. One that's mostly red is CPU bound. Applications split evenly between red and blue might benefit from overlapping computation and I/O, such as using asynchronous I/O or threading.
+
+For multi-threaded profiles, compare the per-thread flame graphs. Ideally, worker threads should show similar patterns if the workload is balanced. If one thread is mostly red while others are mostly blue, you might have load imbalance. If all threads show lots of blue time in futex waits with similar stacks, that's lock contention.
+
+## Summary
+
+Wall clock profiling with eBPF gives you complete visibility into application performance by combining on-CPU and off-CPU analysis. The on-CPU profiler samples execution to find hot code paths that consume CPU cycles. The off-CPU profiler hooks into the scheduler to measure blocking time and identify I/O bottlenecks or lock contention. Together, they account for every microsecond of wall clock time, showing where your application actually spends its life.
+
+The tools use eBPF's low-overhead instrumentation to collect this data with minimal impact on the target application. Stack trace capture and aggregation happen in the kernel, avoiding expensive context switches. The user space programs only need to periodically read accumulated results and resolve symbols, making the overhead negligible even for production use.
+
+By visualizing both types of time in a single flame graph with color coding, you can quickly identify whether problems are computational or blocking in nature. This guides optimization efforts more effectively than traditional profiling approaches that only show one side of the picture. Multi-threaded profiling support reveals parallelism issues and thread-level bottlenecks.
+
+> If you'd like to dive deeper into eBPF, check out our tutorial repository at <https://github.com/eunomia-bpf/bpf-developer-tutorial> or visit our website at <https://eunomia.dev>.
+
+## Reference
+
+- BCC libbpf-tools offcputime: <https://github.com/iovisor/bcc/blob/master/libbpf-tools/offcputime.c>
+- BCC libbpf-tools profile: <https://github.com/iovisor/bcc/blob/master/libbpf-tools/profile.c>
+- Blazesym symbol resolution: <https://github.com/libbpf/blazesym>
+- FlameGraph visualization: <https://github.com/brendangregg/FlameGraph>
+- "Off-CPU Analysis" by Brendan Gregg: <https://www.brendangregg.com/offcpuanalysis.html>
+
+> The original link of this article:
diff --git a/src/32-wallclock-profiler/README.zh.md b/src/32-wallclock-profiler/README.zh.md
new file mode 100644
index 00000000..234e5ea1
--- /dev/null
+++ b/src/32-wallclock-profiler/README.zh.md
@@ -0,0 +1,364 @@
+# eBPF 开发实践教程:示例 32 - 结合 On-CPU 和 Off-CPU 分析的挂钟时间分析
+
+性能瓶颈可能隐藏在两个截然不同的地方。你的代码可能在热循环中消耗 CPU 周期,也可能在空闲地等待 I/O、网络响应或锁竞争。传统的性能分析工具通常只关注故事的一面。但如果你能同时看到两者呢?
+ +本教程介绍了一个完整的挂钟时间分析解决方案,它结合了 on-CPU 和 off-CPU 分析,使用 eBPF 技术实现。我们将展示如何捕获应用程序时间消耗的完整图景,使用两个互补的 eBPF 程序协同工作,追踪执行过程中的每一微秒。无论性能问题来自计算还是等待,你都能在统一的火焰图视图中发现它们。 + +> 完整源代码: + +## 理解挂钟时间分析 + +挂钟时间是从开始到结束的实际经过时间,就像看秒表一样。对于任何运行的进程,这段时间分为两类。On-CPU 时间是指代码在处理器上主动执行、完成实际工作的时间。Off-CPU 时间是指进程存在但未运行,等待某些事件(如磁盘 I/O、网络数据包或获取锁)的时间。 + +传统的 CPU 分析器只展示 on-CPU 的故事。它们在代码运行时以固定间隔采样调用栈,构建出哪些函数消耗 CPU 周期的图景。但这些分析器对 off-CPU 时间视而不见。当你的线程在系统调用上阻塞或等待互斥锁时,分析器就看不到它了。这为那些花费大量时间等待的应用程序创造了巨大的盲点。 + +Off-CPU 分析器则反其道而行之。它们跟踪线程何时休眠和唤醒,测量阻塞时间并在阻塞点捕获调用栈。这揭示了 I/O 瓶颈和锁竞争问题。但它们错过了纯计算问题。 + +本教程中的工具通过同时运行两个 eBPF 程序解决了这两个问题。`oncputime` 工具使用 perf 事件采样 on-CPU 执行。`offcputime` 工具挂钩到内核调度器以捕获阻塞操作。一个 Python 脚本组合结果,对时间尺度进行标准化,这样你就能在同一个火焰图中看到 CPU 密集型代码路径(标记为红色)和阻塞操作(标记为蓝色)。这个完整的视图展示了每一微秒的去向。 + +下面是一个结合 on-CPU 和 off-CPU 分析结果的示例火焰图: + +![结合挂钟时间火焰图示例](https://github.com/eunomia-bpf/bpf-developer-tutorial/blob/main/src/32-wallclock-profiler/tests/example.svg) + +在这个可视化中,你可以清楚地看到 CPU 密集型工作(以红色/暖色显示,标记为 `_[c]`)和阻塞操作(以蓝色/冷色显示,标记为 `_[o]`)之间的区别。相对宽度立即揭示了应用程序在哪里花费了挂钟时间。 + +## 工具: oncputime 和 offcputime + +本教程提供两个互补的分析工具。`oncputime` 工具使用 perf 事件以固定间隔采样你的进程,在代码主动在 CPU 上运行时捕获调用栈。默认速率为 49 Hz,大约每 20 毫秒唤醒一次以记录程序执行位置。输出中的采样计数越高,表示在这些代码路径上花费的 CPU 时间越多。 + +`offcputime` 工具采用不同的方法。它挂钩到内核调度器的上下文切换机制,具体来说是 `sched_switch` 追踪点。当线程离开 CPU 时,工具记录时间戳并捕获显示阻塞原因的调用栈。当线程恢复运行时,它计算线程休眠的时长。这直接测量 I/O 等待、锁竞争和其他以微秒为单位的阻塞操作。 + +两个工具都使用 BPF 栈映射以最小的开销高效地捕获内核和用户空间调用链。它们按唯一的调用栈聚合结果,因此同一代码路径的重复执行会被累加在一起。这些工具可以按进程 ID、线程 ID 和各种其他条件进行过滤,以便将分析集中在应用程序的特定部分。 + +## 实现: 内核空间 eBPF 程序 + +让我们在 eBPF 层面检查这些工具的工作原理。我们将从 on-CPU 分析器开始,然后看看 off-CPU 分析器,以及它们如何互补。 + +### 使用 oncputime 进行 On-CPU 分析 + +On-CPU 分析器使用 perf 事件以固定时间间隔采样执行。这是完整的 eBPF 程序: + +```c +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include +#include +#include +#include +#include "oncputime.h" + +const volatile bool kernel_stacks_only = false; +const volatile bool user_stacks_only = false; +const volatile bool include_idle = false; +const volatile bool filter_by_pid = false; +const volatile bool filter_by_tid = false; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __type(key, u32); +} stackmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, MAX_ENTRIES); +} counts SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u8); + __uint(max_entries, MAX_PID_NR); +} pids SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u8); + __uint(max_entries, MAX_TID_NR); +} tids SEC(".maps"); + +SEC("perf_event") +int do_perf_event(struct bpf_perf_event_data *ctx) +{ + u64 *valp; + static const u64 zero; + struct key_t key = {}; + u64 id; + u32 pid; + u32 tid; + + id = bpf_get_current_pid_tgid(); + pid = id >> 32; + tid = id; + + if (!include_idle && tid == 0) + return 0; + + if (filter_by_pid && !bpf_map_lookup_elem(&pids, &pid)) + return 0; + + if (filter_by_tid && !bpf_map_lookup_elem(&tids, &tid)) + return 0; + + key.pid = pid; + bpf_get_current_comm(&key.name, sizeof(key.name)); + + if (user_stacks_only) + key.kern_stack_id = -1; + else + key.kern_stack_id = bpf_get_stackid(&ctx->regs, &stackmap, 0); + + if (kernel_stacks_only) + key.user_stack_id = -1; + else + key.user_stack_id = bpf_get_stackid(&ctx->regs, &stackmap, + BPF_F_USER_STACK); + + valp = bpf_map_lookup_or_try_init(&counts, &key, &zero); + if (valp) + __sync_fetch_and_add(valp, 1); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +程序首先定义几个 BPF 映射。`stackmap` 是用于存储调用栈跟踪的特殊映射类型。当你调用 
`bpf_get_stackid()` 时,内核会遍历栈并将指令指针存储在此映射中,返回一个 ID 供你稍后查找。`counts` 映射按包含进程 ID 和栈 ID 的复合键聚合样本。`pids` 和 `tids` 映射充当过滤器,让你可以将分析限制在特定进程或线程上。 + +主要逻辑位于 `do_perf_event()` 函数中,该函数在每次 perf 事件触发时运行。用户空间程序以特定频率(默认 49 Hz)为每个 CPU 核心设置这些 perf 事件。当 CPU 触发其定时器时,此函数在当时正在运行的任何进程上执行。它首先从当前任务中提取进程和线程 ID,然后应用任何配置的过滤器。如果当前线程应该被采样,它会构建一个包含进程名称和调用栈的键结构。 + +两次对 `bpf_get_stackid()` 的调用捕获执行上下文的不同部分。第一次调用没有标志,获取内核栈,显示哪些内核函数处于活动状态。第二次调用带有 `BPF_F_USER_STACK`,获取用户空间栈,显示应用程序的函数调用。这些栈 ID 进入键中,程序为该唯一组合递增计数器。随着时间推移,热代码路径被更频繁地采样,累积更高的计数。 + +### 使用 offcputime 进行 Off-CPU 分析 + +Off-CPU 分析器挂钩到调度器以测量阻塞时间。这是完整的 eBPF 程序: + +```c +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "offcputime.h" + +#define PF_KTHREAD 0x00200000 + +const volatile bool kernel_threads_only = false; +const volatile bool user_threads_only = false; +const volatile __u64 max_block_ns = -1; +const volatile __u64 min_block_ns = 0; +const volatile bool filter_by_tgid = false; +const volatile bool filter_by_pid = false; +const volatile long state = -1; + +struct internal_key { + u64 start_ts; + struct key_t key; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, struct internal_key); + __uint(max_entries, MAX_ENTRIES); +} start SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); +} stackmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, struct val_t); + __uint(max_entries, MAX_ENTRIES); +} info SEC(".maps"); + +static bool allow_record(struct task_struct *t) +{ + u32 tgid = BPF_CORE_READ(t, tgid); + u32 pid = BPF_CORE_READ(t, pid); + + if (filter_by_tgid && !bpf_map_lookup_elem(&tgids, &tgid)) + return false; + if (filter_by_pid && !bpf_map_lookup_elem(&pids, &pid)) + return false; + if (user_threads_only && (BPF_CORE_READ(t, flags) & PF_KTHREAD)) + return false; + else if (kernel_threads_only && !(BPF_CORE_READ(t, flags) & PF_KTHREAD)) + return false; + if (state != -1 && get_task_state(t) != state) + return false; + return true; +} + +static int handle_sched_switch(void *ctx, bool preempt, struct task_struct *prev, struct task_struct *next) +{ + struct internal_key *i_keyp, i_key; + struct val_t *valp, val; + s64 delta; + u32 pid; + + if (allow_record(prev)) { + pid = BPF_CORE_READ(prev, pid); + if (!pid) + pid = bpf_get_smp_processor_id(); + i_key.key.pid = pid; + i_key.key.tgid = BPF_CORE_READ(prev, tgid); + i_key.start_ts = bpf_ktime_get_ns(); + + if (BPF_CORE_READ(prev, flags) & PF_KTHREAD) + i_key.key.user_stack_id = -1; + else + i_key.key.user_stack_id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK); + i_key.key.kern_stack_id = bpf_get_stackid(ctx, &stackmap, 0); + bpf_map_update_elem(&start, &pid, &i_key, 0); + bpf_probe_read_kernel_str(&val.comm, sizeof(prev->comm), BPF_CORE_READ(prev, comm)); + val.delta = 0; + bpf_map_update_elem(&info, &i_key.key, &val, BPF_NOEXIST); + } + + pid = BPF_CORE_READ(next, pid); + i_keyp = bpf_map_lookup_elem(&start, &pid); + if (!i_keyp) + return 0; + delta = (s64)(bpf_ktime_get_ns() - i_keyp->start_ts); + if (delta < 0) + goto cleanup; + if (delta < min_block_ns || delta > max_block_ns) + goto cleanup; + delta /= 1000U; + valp = bpf_map_lookup_elem(&info, &i_keyp->key); + if (!valp) + goto cleanup; + __sync_fetch_and_add(&valp->delta, delta); + +cleanup: + bpf_map_delete_elem(&start, &pid); + return 0; +} + +SEC("tp_btf/sched_switch") +int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next) +{ + return 
handle_sched_switch(ctx, preempt, prev, next); +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +Off-CPU 分析器更复杂,因为它需要跨多个事件跟踪时间。`start` 映射存储离开 CPU 的线程的时间戳和栈信息。当线程阻塞时,我们记录何时发生以及为什么(调用栈)。当同一线程恢复运行时,我们计算它阻塞了多长时间。 + +调度器切换在繁忙系统上每秒发生多次,因此性能很重要。`allow_record()` 函数在执行昂贵操作之前快速过滤掉我们不关心的线程。如果线程通过过滤器,程序使用 `bpf_ktime_get_ns()` 捕获当前时间戳,并记录显示线程阻塞位置的调用栈。 + +关键洞察在于两阶段方法。`prev` 任务(离开 CPU 的线程)的阻塞点被记录,带有时间戳。当调度器稍后切换到 `next` 任务(唤醒的线程)时,我们查找是否之前记录了这个线程进入睡眠。如果找到记录,我们计算现在与它进入睡眠时之间的差值。这个差值就是以纳秒为单位的 off-CPU 时间,我们将其转换为微秒并添加到该调用栈的累积总数中。 + +### 用户空间程序: 加载和处理 + +两个工具在用户空间遵循类似的模式。它们使用 libbpf 加载编译的 eBPF 目标文件并将其附加到适当的事件。对于 `oncputime`,这意味着以所需的采样频率设置 perf 事件。对于 `offcputime`,这意味着附加到调度器追踪点。然后用户空间程序定期读取 BPF 映射,使用符号表将栈 ID 解析为实际函数名,并格式化输出。 + +符号解析由 blazesym 库处理,该库解析二进制文件中的 DWARF 调试信息。当你看到带有函数名和行号的调用栈时,那就是 blazesym 将原始指令指针地址转换为人类可读形式。用户空间程序以"折叠"格式输出,其中每行包含一个分号分隔的调用栈,后跟计数或时间值。这种格式直接输入到火焰图生成工具。 + +## 组合 On-CPU 和 Off-CPU 配置文件 + +真正的力量来自于将两个工具一起运行并合并它们的结果。`wallclock_profiler.py` 脚本编排这个过程。它在目标进程上同时启动两个分析器,等待它们完成,然后组合它们的输出。 + +挑战在于这两个工具以不同的单位测量不同的东西。On-CPU 分析器计数样本(默认每秒 49 个),而 off-CPU 分析器测量微秒。为了创建统一视图,脚本将 off-CPU 时间标准化为等效的样本计数。如果以 49 Hz 采样,每个样本代表约 20,408 微秒的潜在执行时间。脚本将 off-CPU 微秒除以此值以获得等效样本。 + +标准化后,脚本添加注释以区分两种类型的时间。On-CPU 调用栈获得 `_[c]` 后缀(表示计算),而 off-CPU 栈获得 `_[o]`(表示 off-CPU 或阻塞)。火焰图工具中的自定义调色板以不同颜色渲染这些,CPU 时间为红色,阻塞时间为蓝色。结果是一个单一的火焰图,你可以看到两种类型的活动及其相对幅度。 + +脚本还通过分别分析每个线程来处理多线程应用程序。它在启动时检测线程,为每个线程启动并行分析会话,并生成显示每个线程行为的单独火焰图。这有助于识别哪些线程忙碌,哪些空闲,以及你的并行性是否有效。 + +## 编译和执行 + +构建工具需要标准的 eBPF 开发环境。教程仓库在 `src/third_party/` 目录中包含所有依赖项。要构建: + +```bash +cd src/32-wallclock-profiler +make +``` + +Makefile 使用 clang 编译 eBPF C 代码,使用 bpftool 生成骨架,构建 blazesym 符号解析器,并将所有内容与 libbpf 链接以创建最终可执行文件。 + +使用单独的工具: + +```bash +# 分析 30 秒的 on-CPU 执行 +sudo ./oncputime -p -F 99 30 + +# 分析 30 秒的 off-CPU 阻塞 +sudo ./offcputime -p -m 1000 30 + +# 使用组合分析器(推荐) +sudo python3 wallclock_profiler.py -d 30 -f 99 +``` + +让我们尝试分析一个同时执行 CPU 工作和阻塞 I/O 的测试程序: + +```bash +# 构建并运行测试程序 +cd tests +make +./test_combined & +TEST_PID=$! + +# 使用组合分析器分析它 +cd .. 
+sudo python3 wallclock_profiler.py $TEST_PID -d 30 + +# 这会生成: +# - combined_profile_pid_.folded (原始数据) +# - combined_profile_pid_.svg (火焰图) +# - combined_profile_pid__single_thread_analysis.txt (时间分解) +``` + +输出的火焰图将显示 `cpu_work()` 函数消耗 CPU 时间的红色帧,以及 `blocking_work()` 函数在睡眠中花费时间的蓝色帧。相对宽度显示每个消耗的挂钟时间量。 + +对于多线程应用程序,分析器创建一个包含每线程结果的目录: + +```bash +# 分析多线程应用程序 +sudo python3 wallclock_profiler.py -d 30 + +# 输出在 multithread_combined_profile_pid_/ 中 +# - thread__main.svg (主线程火焰图) +# - thread__.svg (工作线程火焰图) +# - *_thread_analysis.txt (所有线程的时间分析) +``` + +分析文件显示时间统计,让你验证 on-CPU 加 off-CPU 时间是否正确累加到挂钟分析持续时间。覆盖百分比有助于识别线程是否大多空闲或是否缺少数据。 + +## 解读结果 + +在浏览器中打开火焰图 SVG 时,每个水平框代表调用栈中的一个函数。宽度显示在那里花费了多少时间。垂直堆叠的框显示调用链,较低的框调用较高的框。红色框表示 on-CPU 时间,蓝色框显示 off-CPU 时间。 + +寻找宽的红色部分以找到 CPU 瓶颈。这些是在紧密循环或昂贵算法中消耗周期的函数。宽的蓝色部分表示阻塞操作。常见模式包括文件 I/O(读/写系统调用)、网络操作(recv/send)和锁竞争(futex 调用)。 + +火焰图是交互式的。单击任何框以放大并查看该子树的详细信息。搜索功能让你突出显示匹配模式的所有帧,这对查找特定函数或库很有用。悬停显示完整的函数名和确切的样本计数或时间值。 + +注意相对比例。90% 蓝色的应用程序是 I/O 绑定的,可能不会从 CPU 优化中受益太多。大部分红色的应用程序是 CPU 绑定的。红色和蓝色均分的应用程序可能受益于重叠计算和 I/O,例如使用异步 I/O 或线程。 + +对于多线程配置文件,比较每个线程的火焰图。理想情况下,如果工作负载平衡,工作线程应该显示相似的模式。如果一个线程大多是红色而其他线程大多是蓝色,你可能有负载不平衡。如果所有线程在 futex 等待中显示大量蓝色时间,具有相似的栈,那就是锁竞争。 + +## 总结 + +使用 eBPF 进行挂钟时间分析通过结合 on-CPU 和 off-CPU 分析,为你提供应用程序性能的完整可见性。On-CPU 分析器采样执行以查找消耗 CPU 周期的热代码路径。Off-CPU 分析器挂钩到调度器以测量阻塞时间并识别 I/O 瓶颈或锁竞争。它们一起统计挂钟时间的每一微秒,显示应用程序实际花费生命的地方。 + +这些工具使用 eBPF 的低开销检测来收集这些数据,对目标应用程序的影响最小。调用栈捕获和聚合在内核中进行,避免昂贵的上下文切换。用户空间程序只需要定期读取累积结果并解析符号,即使在生产环境中使用,开销也可以忽略不计。 + +通过在带有颜色编码的单个火焰图中可视化两种类型的时间,你可以快速识别问题是计算性质还是阻塞性质。这比传统的只显示图景一面的分析方法更有效地指导优化工作。多线程分析支持揭示并行性问题和线程级瓶颈。 + +> 如果你想更深入地了解 eBPF,请访问我们的教程代码仓库 或网站 + +## 参考资料 + +- BCC libbpf-tools offcputime: +- BCC libbpf-tools profile: +- Blazesym 符号解析: +- FlameGraph 可视化: +- Brendan Gregg 的 "Off-CPU Analysis": + +> 本文原文链接: diff --git a/src/32-wallclock-profiler/arg_parse.h b/src/32-wallclock-profiler/arg_parse.h new file mode 100644 index 00000000..871a0529 --- /dev/null +++ b/src/32-wallclock-profiler/arg_parse.h @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#ifndef __ARG_PARSE_H +#define __ARG_PARSE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define OPT_PERF_MAX_STACK_DEPTH 1 /* --perf-max-stack-depth */ +#define OPT_STACK_STORAGE_SIZE 2 /* --stack-storage-size */ +#define OPT_STATE 3 /* --state */ + +typedef enum { + TOOL_OFFCPUTIME, + TOOL_PROFILE +} tool_type_t; + +struct common_env { + pid_t pids[MAX_PID_NR]; + pid_t tids[MAX_TID_NR]; + bool user_threads_only; + bool kernel_threads_only; + bool user_stacks_only; + bool kernel_stacks_only; + int stack_storage_size; + int perf_max_stack_depth; + __u64 min_block_time; + __u64 max_block_time; + long state; + int duration; + bool verbose; + bool folded; + bool delimiter; + bool freq; + int sample_freq; + bool include_idle; + int cpu; + tool_type_t tool_type; +}; + +static struct common_env env = { + .stack_storage_size = 1024, + .perf_max_stack_depth = 127, + .min_block_time = 1, + .max_block_time = -1, + .state = -1, + .duration = 99999999, + .freq = 1, + .sample_freq = 49, + .cpu = -1, +}; + +static const char *get_program_doc(void) +{ + switch (env.tool_type) { + case TOOL_OFFCPUTIME: + return "Summarize off-CPU time by stack trace.\n" + "\n" + "USAGE: offcputime [--help] [-p PID | -u | -k] [-m MIN-BLOCK-TIME] " + "[-M MAX-BLOCK-TIME] [--state] [--perf-max-stack-depth] [--stack-storage-size] " + "[-f] [-d] [duration]\n" + "EXAMPLES:\n" + " offcputime # trace off-CPU stack time until Ctrl-C\n" + " offcputime 5 
# trace for 5 seconds only\n" + " offcputime -m 1000 # trace only events that last more than 1000 usec\n" + " offcputime -M 10000 # trace only events that last less than 10000 usec\n" + " offcputime -p 185,175,165 # only trace threads for PID 185,175,165\n" + " offcputime -t 188,120,134 # only trace threads 188,120,134\n" + " offcputime -u # only trace user threads (no kernel)\n" + " offcputime -k # only trace kernel threads (no user)\n" + " offcputime -f # output in folded format for flame graphs\n" + " offcputime -fd # folded format with delimiter between stacks\n"; + case TOOL_PROFILE: + return "Profile CPU usage by sampling stack traces at a timed interval.\n" + "\n" + "USAGE: profile [OPTIONS...] [duration]\n" + "EXAMPLES:\n" + " profile # profile stack traces at 49 Hertz until Ctrl-C\n" + " profile -F 99 # profile stack traces at 99 Hertz\n" + " profile -c 1000000 # profile stack traces every 1 in a million events\n" + " profile 5 # profile at 49 Hertz for 5 seconds only\n" + " profile -f # output in folded format for flame graphs\n" + " profile -p 185 # only profile process with PID 185\n" + " profile -L 185 # only profile thread with TID 185\n" + " profile -U # only show user space stacks (no kernel)\n" + " profile -K # only show kernel space stacks (no user)\n"; + default: + return "Unknown tool\n"; + } +} + +static const struct argp_option common_opts[] = { + { "pid", 'p', "PID", 0, "Trace these PIDs only, comma-separated list", 0 }, + { "tid", 't', "TID", 0, "Trace these TIDs only, comma-separated list", 0 }, + { "tid", 'L', "TID", 0, "profile threads with one or more comma-separated TIDs only", 0 }, + { "user-threads-only", 'u', NULL, 0, + "User threads only (no kernel threads)", 0 }, + { "kernel-threads-only", 'k', NULL, 0, + "Kernel threads only (no user threads)", 0 }, + { "user-stacks-only", 'U', NULL, 0, + "show stacks from user space only (no kernel space stacks)", 0 }, + { "kernel-stacks-only", 'K', NULL, 0, + "show stacks from kernel space only (no user space stacks)", 0 }, + { "perf-max-stack-depth", OPT_PERF_MAX_STACK_DEPTH, + "PERF-MAX-STACK-DEPTH", 0, "the limit for both kernel and user stack traces (default 127)", 0 }, + { "stack-storage-size", OPT_STACK_STORAGE_SIZE, "STACK-STORAGE-SIZE", 0, + "the number of unique stack traces that can be stored and displayed (default 1024)", 0 }, + { "min-block-time", 'm', "MIN-BLOCK-TIME", 0, + "the amount of time in microseconds over which we store traces (default 1)", 0 }, + { "max-block-time", 'M', "MAX-BLOCK-TIME", 0, + "the amount of time in microseconds under which we store traces (default U64_MAX)", 0 }, + { "state", OPT_STATE, "STATE", 0, "filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE) see include/linux/sched.h", 0 }, + { "frequency", 'F', "FREQUENCY", 0, "sample frequency, Hertz", 0 }, + { "delimited", 'd', NULL, 0, "insert delimiter between kernel/user stacks", 0 }, + { "include-idle ", 'I', NULL, 0, "include CPU idle stacks", 0 }, + { "cpu", 'C', "CPU", 0, "cpu number to run profile on", 0 }, + { "folded", 'f', NULL, 0, "output folded format, one line per stack (for flame graphs)", 0 }, + { "verbose", 'v', NULL, 0, "Verbose debug output", 0 }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help", 0 }, + {}, +}; + +static error_t parse_common_arg(int key, char *arg, struct argp_state *state) +{ + static int pos_args; + int ret; + char *arg_copy; + + switch (key) { + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + case 'v': + env.verbose = true; + break; + case 'f': + 
env.folded = true; + break; + case 'd': + env.delimiter = true; + break; + case 'p': + arg_copy = safe_strdup(arg); + ret = split_convert(arg_copy, ",", env.pids, sizeof(env.pids), + sizeof(pid_t), str_to_int); + free(arg_copy); + if (ret) { + if (ret == -ENOBUFS) + fprintf(stderr, "the number of pid is too big, please " + "increase MAX_PID_NR's value and recompile\n"); + else + fprintf(stderr, "invalid PID: %s\n", arg); + + argp_usage(state); + } + break; + case 't': + case 'L': + arg_copy = safe_strdup(arg); + ret = split_convert(arg_copy, ",", env.tids, sizeof(env.tids), + sizeof(pid_t), str_to_int); + free(arg_copy); + if (ret) { + if (ret == -ENOBUFS) + fprintf(stderr, "the number of tid is too big, please " + "increase MAX_TID_NR's value and recompile\n"); + else + fprintf(stderr, "invalid TID: %s\n", arg); + + argp_usage(state); + } + break; + case 'u': + if (env.tool_type == TOOL_OFFCPUTIME) { + env.user_threads_only = true; + } + /* No-op for profile */ + break; + case 'k': + if (env.tool_type == TOOL_OFFCPUTIME) { + env.kernel_threads_only = true; + } + /* No-op for profile */ + break; + case 'U': + if (env.tool_type == TOOL_PROFILE) { + env.user_stacks_only = true; + } + /* No-op for offcputime */ + break; + case 'K': + if (env.tool_type == TOOL_PROFILE) { + env.kernel_stacks_only = true; + } + /* No-op for offcputime */ + break; + case 'F': + if (env.tool_type == TOOL_PROFILE) { + errno = 0; + env.sample_freq = strtol(arg, NULL, 10); + if (errno || env.sample_freq <= 0) { + fprintf(stderr, "invalid FREQUENCY: %s\n", arg); + argp_usage(state); + } + } + /* No-op for offcputime */ + break; + case 'I': + if (env.tool_type == TOOL_PROFILE) { + env.include_idle = true; + } + /* No-op for offcputime */ + break; + case 'C': + if (env.tool_type == TOOL_PROFILE) { + errno = 0; + env.cpu = strtol(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid CPU: %s\n", arg); + argp_usage(state); + } + } + /* No-op for offcputime */ + break; + case OPT_PERF_MAX_STACK_DEPTH: + errno = 0; + env.perf_max_stack_depth = strtol(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid perf max stack depth: %s\n", arg); + argp_usage(state); + } + break; + case OPT_STACK_STORAGE_SIZE: + errno = 0; + env.stack_storage_size = strtol(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid stack storage size: %s\n", arg); + argp_usage(state); + } + break; + case 'm': + if (env.tool_type == TOOL_OFFCPUTIME) { + errno = 0; + env.min_block_time = strtoll(arg, NULL, 10); + if (errno) { + fprintf(stderr, "Invalid min block time (in us): %s\n", arg); + argp_usage(state); + } + } + /* No-op for profile */ + break; + case 'M': + if (env.tool_type == TOOL_OFFCPUTIME) { + errno = 0; + env.max_block_time = strtoll(arg, NULL, 10); + if (errno) { + fprintf(stderr, "Invalid max block time (in us): %s\n", arg); + argp_usage(state); + } + } + /* No-op for profile */ + break; + case OPT_STATE: + if (env.tool_type == TOOL_OFFCPUTIME) { + errno = 0; + env.state = strtol(arg, NULL, 10); + if (errno || env.state < 0 || env.state > 2) { + fprintf(stderr, "Invalid task state: %s\n", arg); + argp_usage(state); + } + } + /* No-op for profile */ + break; + case ARGP_KEY_ARG: + if (pos_args++) { + fprintf(stderr, + "Unrecognized positional argument: %s\n", arg); + argp_usage(state); + } + errno = 0; + if (env.tool_type == TOOL_PROFILE) { + env.duration = strtol(arg, NULL, 10); + if (errno || env.duration <= 0) { + fprintf(stderr, "Invalid duration (in s): %s\n", arg); + argp_usage(state); + } + } else { + env.duration = 
strtol(arg, NULL, 10);
+			if (errno || env.duration <= 0) {
+				fprintf(stderr, "Invalid duration (in s): %s\n", arg);
+				argp_usage(state);
+			}
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static void init_common_env(tool_type_t tool)
+{
+	env.tool_type = tool;
+
+	/* Set tool-specific defaults */
+	if (tool == TOOL_PROFILE) {
+		env.duration = INT_MAX;
+	}
+}
+
+static int parse_common_args(int argc, char **argv, tool_type_t tool)
+{
+	init_common_env(tool);
+
+	const struct argp argp = {
+		.options = common_opts,
+		.parser = parse_common_arg,
+		.doc = get_program_doc(),
+	};
+
+	return argp_parse(&argp, argc, argv, 0, NULL, NULL);
+}
+
+static int validate_common_args(void)
+{
+	if (env.tool_type == TOOL_OFFCPUTIME) {
+		if (env.user_threads_only && env.kernel_threads_only) {
+			fprintf(stderr, "user_threads_only and kernel_threads_only cannot be used together.\n");
+			return 1;
+		}
+		if (env.min_block_time >= env.max_block_time) {
+			fprintf(stderr, "min_block_time should be smaller than max_block_time\n");
+			return 1;
+		}
+	} else if (env.tool_type == TOOL_PROFILE) {
+		if (env.user_stacks_only && env.kernel_stacks_only) {
+			fprintf(stderr, "user_stacks_only and kernel_stacks_only cannot be used together.\n");
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+#endif /* __ARG_PARSE_H */
\ No newline at end of file
diff --git a/src/32-wallclock-profiler/common.h b/src/32-wallclock-profiler/common.h
new file mode 100644
index 00000000..6f0cdf4b
--- /dev/null
+++ b/src/32-wallclock-profiler/common.h
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#ifndef __COMMON_H
+#define __COMMON_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <linux/types.h>
+#include "blazesym.h"
+
+/* Types needed for both profile and offcputime */
+#define TASK_COMM_LEN 16
+#define MAX_PID_NR 30
+#define MAX_TID_NR 30
+
+
+/* Common helper functions for BPF tools */
+
+/**
+ * split_convert - Split a string by a delimiter and convert each token
+ * @s: String to split
+ * @delim: Delimiter string
+ * @elems: Array to store the converted elements
+ * @elems_size: Size of the elems array in bytes
+ * @elem_size: Size of each element in bytes
+ * @convert: Function to convert each token to the desired type
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static inline int split_convert(char *s, const char* delim, void *elems, size_t elems_size,
+				size_t elem_size, int (*convert)(const char *, void *))
+{
+	char *token;
+	int ret;
+	char *pos = (char *)elems;
+
+	if (!s || !delim || !elems)
+		return -1;
+
+	token = strtok(s, delim);
+	while (token) {
+		if (pos + elem_size > (char*)elems + elems_size)
+			return -ENOBUFS;
+
+		ret = convert(token, pos);
+		if (ret)
+			return ret;
+
+		pos += elem_size;
+		token = strtok(NULL, delim);
+	}
+
+	return 0;
+}
+
+/**
+ * str_to_int - Convert a string to an integer
+ * @src: Source string
+ * @dest: Pointer to store the converted integer
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static inline int str_to_int(const char *src, void *dest)
+{
+	*(int*)dest = strtol(src, NULL, 10);
+	return 0;
+}
+
+/**
+ * show_stack_trace - Display a stack trace with symbol resolution
+ * @symbolizer: Blazesym symbolizer instance
+ * @stack: Array of stack addresses
+ * @stack_sz: Size of the stack array
+ * @pid: Process ID (0 for kernel)
+ */
+static void show_stack_trace(blaze_symbolizer *symbolizer, __u64 *stack, int stack_sz, pid_t pid)
+{
+	const struct blaze_syms *syms;
+	int i;
+
+	// Choose symbolization 
source based on pid + if (pid) { + const struct blaze_symbolize_src_process src = { + .type_size = sizeof(src), + .pid = pid, + }; + syms = blaze_symbolize_process_abs_addrs(symbolizer, &src, (const uint64_t *)stack, stack_sz); + } else { + const struct blaze_symbolize_src_kernel src = { + .type_size = sizeof(src), + .kallsyms = NULL, + .vmlinux = NULL, + .debug_syms = false, + .reserved = {0}, + }; + syms = blaze_symbolize_kernel_abs_addrs(symbolizer, &src, (const uint64_t *)stack, stack_sz); + } + + if (!syms) { + fprintf(stderr, "Failed to symbolize stack trace\n"); + return; + } + + for (i = 0; i < stack_sz; i++) { + if (!stack[i]) + continue; + + if (i >= syms->cnt) { + printf(" [unknown]\n"); + continue; + } + + const struct blaze_sym *sym = &syms->syms[i]; + if (sym->name) { + printf(" %s\n", sym->name); + } else { + printf(" [unknown]\n"); + } + } + + blaze_syms_free(syms); +} + +/** + * show_stack_trace_folded - Display a stack trace in folded format for flamegraphs + * @symbolizer: Blazesym symbolizer instance + * @stack: Array of stack addresses + * @stack_sz: Size of the stack array + * @pid: Process ID (0 for kernel) + * @separator: Character to use as separator between frames (typically ';') + * @reverse: Whether to print the stack in reverse order (true for flamegraphs) + */ +static void show_stack_trace_folded(blaze_symbolizer *symbolizer, __u64 *stack, int stack_sz, + pid_t pid, char separator, bool reverse) +{ + const struct blaze_syms *syms; + int i; + bool first = true; + + // Choose symbolization source based on pid + if (pid) { + const struct blaze_symbolize_src_process src = { + .type_size = sizeof(src), + .pid = pid, + }; + syms = blaze_symbolize_process_abs_addrs(symbolizer, &src, (const uint64_t *)stack, stack_sz); + } else { + const struct blaze_symbolize_src_kernel src = { + .type_size = sizeof(src), + .kallsyms = NULL, + .vmlinux = NULL, + .debug_syms = false, + .reserved = {0}, + }; + syms = blaze_symbolize_kernel_abs_addrs(symbolizer, &src, (const uint64_t *)stack, stack_sz); + } + + if (!syms) { + fprintf(stderr, "Failed to symbolize stack trace\n"); + return; + } + + /* For flamegraphs, we need to print the stack in reverse order */ + if (reverse) { + for (i = stack_sz - 1; i >= 0; i--) { + if (!stack[i]) + continue; + + if (i >= syms->cnt || !syms->syms[i].name) { + if (!first) { + printf("%c", separator); + } + printf("[unknown]"); + first = false; + continue; + } + + const struct blaze_sym *sym = &syms->syms[i]; + if (!first) { + printf("%c", separator); + } + printf("%s", sym->name); + first = false; + } + } else { + /* Print stack in normal order */ + for (i = 0; i < stack_sz; i++) { + if (!stack[i]) + continue; + + if (i >= syms->cnt || !syms->syms[i].name) { + if (!first) { + printf("%c", separator); + } + printf("[unknown]"); + first = false; + continue; + } + + const struct blaze_sym *sym = &syms->syms[i]; + if (!first) { + printf("%c", separator); + } + printf("%s", sym->name); + first = false; + } + } + + blaze_syms_free(syms); +} + +/* Safe string duplication */ +static inline char *safe_strdup(const char *s) +{ + char *ret = strdup(s); + if (!ret) { + fprintf(stderr, "failed to allocate memory\n"); + exit(1); + } + return ret; +} + +#endif /* __COMMON_H */ \ No newline at end of file diff --git a/src/32-wallclock-profiler/offcputime.bpf.c b/src/32-wallclock-profiler/offcputime.bpf.c new file mode 100644 index 00000000..8ab67e56 --- /dev/null +++ b/src/32-wallclock-profiler/offcputime.bpf.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 
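+//
+// Overview (tutorial note): this is the classic two-map off-CPU accounting
+// scheme. When a task is switched out, its switch-out timestamp and stack
+// ids are stored in `start`, keyed by pid; when it is switched back in, the
+// elapsed time is folded into `info`, keyed by
+// (pid, tgid, user_stack_id, kern_stack_id). Roughly:
+//
+//   switch out prev:  start[prev->pid] = { now, key(prev) }
+//   switch in  next:  info[start[next->pid].key].delta += now - start[next->pid].start_ts
+//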
+// Copyright (c) 2021 Wenbo Zhang
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+#include "offcputime.h"
+
+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
+#define MAX_ENTRIES 10240
+
+const volatile bool kernel_threads_only = false;
+const volatile bool user_threads_only = false;
+const volatile __u64 max_block_ns = -1;
+const volatile __u64 min_block_ns = 0;
+const volatile bool filter_by_tgid = false;
+const volatile bool filter_by_pid = false;
+const volatile long state = -1;
+
+struct internal_key {
+	u64 start_ts;
+	struct key_t key;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u32);
+	__type(value, struct internal_key);
+	__uint(max_entries, MAX_ENTRIES);
+} start SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(key_size, sizeof(u32));
+} stackmap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct key_t);
+	__type(value, struct val_t);
+	__uint(max_entries, MAX_ENTRIES);
+} info SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u32);
+	__type(value, u8);
+	__uint(max_entries, MAX_PID_NR);
+} tgids SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u32);
+	__type(value, u8);
+	__uint(max_entries, MAX_TID_NR);
+} pids SEC(".maps");
+
+
+/**
+ * commit 2f064a59a1 ("sched: Change task_struct::state") changes
+ * the name of task_struct::state to task_struct::__state
+ * see:
+ *     https://github.com/torvalds/linux/commit/2f064a59a1
+ */
+struct task_struct___o {
+	volatile long int state;
+} __attribute__((preserve_access_index));
+
+struct task_struct___x {
+	unsigned int __state;
+} __attribute__((preserve_access_index));
+
+static __always_inline __s64 get_task_state(void *task)
+{
+	struct task_struct___x *t = task;
+
+	if (bpf_core_field_exists(t->__state))
+		return BPF_CORE_READ(t, __state);
+	return BPF_CORE_READ((struct task_struct___o *)task, state);
+}
+
+static bool allow_record(struct task_struct *t)
+{
+	u32 tgid = BPF_CORE_READ(t, tgid);
+	u32 pid = BPF_CORE_READ(t, pid);
+
+	if (filter_by_tgid && !bpf_map_lookup_elem(&tgids, &tgid))
+		return false;
+	if (filter_by_pid && !bpf_map_lookup_elem(&pids, &pid))
+		return false;
+	if (user_threads_only && (BPF_CORE_READ(t, flags) & PF_KTHREAD))
+		return false;
+	else if (kernel_threads_only && !(BPF_CORE_READ(t, flags) & PF_KTHREAD))
+		return false;
+	if (state != -1 && get_task_state(t) != state)
+		return false;
+	return true;
+}
+
+static int handle_sched_switch(void *ctx, bool preempt, struct task_struct *prev, struct task_struct *next)
+{
+	struct internal_key *i_keyp, i_key;
+	struct val_t *valp, val;
+	s64 delta;
+	u32 pid;
+
+	if (allow_record(prev)) {
+		pid = BPF_CORE_READ(prev, pid);
+		/* To distinguish idle threads of different cores */
+		if (!pid)
+			pid = bpf_get_smp_processor_id();
+		i_key.key.pid = pid;
+		i_key.key.tgid = BPF_CORE_READ(prev, tgid);
+		i_key.start_ts = bpf_ktime_get_ns();
+
+		if (BPF_CORE_READ(prev, flags) & PF_KTHREAD)
+			i_key.key.user_stack_id = -1;
+		else
+			i_key.key.user_stack_id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
+		i_key.key.kern_stack_id = bpf_get_stackid(ctx, &stackmap, 0);
+		bpf_map_update_elem(&start, &pid, &i_key, 0);
+		bpf_probe_read_kernel_str(&val.comm, sizeof(prev->comm), BPF_CORE_READ(prev, comm));
+		val.delta = 0;
+		bpf_map_update_elem(&info, &i_key.key, &val, BPF_NOEXIST);
+	}
+
+	pid = BPF_CORE_READ(next, pid);
+	i_keyp = bpf_map_lookup_elem(&start, &pid);
+	if (!i_keyp)
+		return 0;
+	delta = (s64)(bpf_ktime_get_ns() - i_keyp->start_ts);
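+	/*
+	 * `delta` starts in nanoseconds; it is converted to microseconds
+	 * before the min/max filter below so that it matches the -m/-M
+	 * thresholds, which are given in microseconds (e.g. `-m 1000`
+	 * keeps only blocking events of at least 1000 us = 1 ms).
+	 */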
+	if (delta < 0)
+		goto cleanup;
+	delta /= 1000U;
+	if (delta < min_block_ns || delta > max_block_ns)
+		goto cleanup;
+	valp = bpf_map_lookup_elem(&info, &i_keyp->key);
+	if (!valp)
+		goto cleanup;
+	__sync_fetch_and_add(&valp->delta, delta);
+
+cleanup:
+	bpf_map_delete_elem(&start, &pid);
+	return 0;
+}
+
+SEC("tp_btf/sched_switch")
+int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next)
+{
+	return handle_sched_switch(ctx, preempt, prev, next);
+}
+
+SEC("raw_tp/sched_switch")
+int BPF_PROG(sched_switch_raw, bool preempt, struct task_struct *prev, struct task_struct *next)
+{
+	return handle_sched_switch(ctx, preempt, prev, next);
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/src/32-wallclock-profiler/offcputime.c b/src/32-wallclock-profiler/offcputime.c
new file mode 100644
index 00000000..f77b2d44
--- /dev/null
+++ b/src/32-wallclock-profiler/offcputime.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2021 Wenbo Zhang
+//
+// Based on offcputime(8) from BCC by Brendan Gregg.
+// 19-Mar-2021   Wenbo Zhang   Created this.
+#include <argp.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "offcputime.h"
+#include "offcputime.skel.h"
+#include "blazesym.h"
+#include "arg_parse.h"
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !env.verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sig_handler(int sig)
+{
+}
+
+static blaze_symbolizer *symbolizer;
+
+static void print_map(struct offcputime_bpf *obj)
+{
+	struct key_t lookup_key = {}, next_key;
+	int err, fd_stackid, fd_info;
+	unsigned long *ip;
+	struct val_t val;
+	int idx;
+	bool has_kernel_stack, has_user_stack;
+
+	ip = calloc(env.perf_max_stack_depth, sizeof(*ip));
+	if (!ip) {
+		fprintf(stderr, "failed to alloc ip\n");
+		return;
+	}
+
+	fd_info = bpf_map__fd(obj->maps.info);
+	fd_stackid = bpf_map__fd(obj->maps.stackmap);
+	while (!bpf_map_get_next_key(fd_info, &lookup_key, &next_key)) {
+		idx = 0;
+
+		err = bpf_map_lookup_elem(fd_info, &next_key, &val);
+		if (err < 0) {
+			fprintf(stderr, "failed to lookup info: %d\n", err);
+			goto cleanup;
+		}
+		lookup_key = next_key;
+		if (val.delta == 0)
+			continue;
+
+		has_kernel_stack = next_key.kern_stack_id != -1;
+		has_user_stack = next_key.user_stack_id != -1;
+
+		if (env.folded) {
+			/* folded stack output format */
+			printf("%s", val.comm);
+
+			/* Print user stack first for folded format */
+			if (has_user_stack && !env.kernel_threads_only) {
+				if (bpf_map_lookup_elem(fd_stackid, &next_key.user_stack_id, ip) != 0) {
+					printf(";[Missed User Stack]");
+				} else {
+					printf(";");
+					show_stack_trace_folded(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, next_key.tgid, ';', true);
+				}
+			}
+
+			/* Then print kernel stack if it exists */
+			if (has_kernel_stack && !env.user_threads_only) {
+				/* Add delimiter between user and kernel stacks if needed */
+				if (has_user_stack && env.delimiter && !env.kernel_threads_only)
+					printf("-");
+
+				if (bpf_map_lookup_elem(fd_stackid, &next_key.kern_stack_id, ip) != 0) {
+					printf(";[Missed Kernel Stack]");
+				} else {
+					printf(";");
+					show_stack_trace_folded(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, 0, ';', true);
+				}
+			}
+
+			printf(" %lld\n", val.delta);
+		} else {
+			/* standard multi-line output format */
+			if (has_kernel_stack && !env.user_threads_only) {
+				if (bpf_map_lookup_elem(fd_stackid, &next_key.kern_stack_id, 
ip) != 0) { + fprintf(stderr, " [Missed Kernel Stack]\n"); + } else { + show_stack_trace(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, 0); + } + } + + /* Add delimiter between kernel and user stacks if both exist and delimiter is requested */ + if (env.delimiter && has_kernel_stack && has_user_stack && + !env.user_threads_only && !env.kernel_threads_only) { + printf(" --\n"); + } + + if (has_user_stack && !env.kernel_threads_only) { + if (bpf_map_lookup_elem(fd_stackid, &next_key.user_stack_id, ip) != 0) { + fprintf(stderr, " [Missed User Stack]\n"); + } else { + show_stack_trace(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, next_key.tgid); + } + } + + printf(" %-16s %s (%d)\n", "-", val.comm, next_key.pid); + printf(" %lld\n\n", val.delta); + } + } + +cleanup: + free(ip); +} + +static bool probe_tp_btf(const char *name) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_TRACE_RAW_TP); + struct bpf_insn insns[] = { + { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 }, + { .code = BPF_JMP | BPF_EXIT }, + }; + int fd, insn_cnt = sizeof(insns) / sizeof(struct bpf_insn); + + opts.attach_btf_id = libbpf_find_vmlinux_btf_id(name, BPF_TRACE_RAW_TP); + fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", insns, insn_cnt, &opts); + if (fd >= 0) + close(fd); + return fd >= 0; +} + +static bool print_header_threads() +{ + int i; + bool printed = false; + + if (env.pids[0]) { + printf(" PID ["); + for (i = 0; i < MAX_PID_NR && env.pids[i]; i++) + printf("%d%s", env.pids[i], (i < MAX_PID_NR - 1 && env.pids[i + 1]) ? ", " : "]"); + printed = true; + } + + if (env.tids[0]) { + printf(" TID ["); + for (i = 0; i < MAX_TID_NR && env.tids[i]; i++) + printf("%d%s", env.tids[i], (i < MAX_TID_NR - 1 && env.tids[i + 1]) ? ", " : "]"); + printed = true; + } + + return printed; +} + +static void print_headers() +{ + if (env.folded) + return; // Don't print headers in folded format + + printf("Tracing off-CPU time (us) of"); + + if (!print_header_threads()) + printf(" all threads"); + + if (env.duration < 99999999) + printf(" for %d secs.\n", env.duration); + else + printf("... 
Hit Ctrl-C to end.\n"); +} + +int main(int argc, char **argv) +{ + struct offcputime_bpf *obj; + int pids_fd, tids_fd; + int err, i; + __u8 val = 0; + + err = parse_common_args(argc, argv, TOOL_OFFCPUTIME); + if (err) + return err; + + err = validate_common_args(); + if (err) + return err; + + libbpf_set_print(libbpf_print_fn); + + obj = offcputime_bpf__open(); + if (!obj) { + fprintf(stderr, "failed to open BPF object\n"); + return 1; + } + + /* initialize global data (filtering options) */ + obj->rodata->user_threads_only = env.user_threads_only; + obj->rodata->kernel_threads_only = env.kernel_threads_only; + obj->rodata->state = env.state; + obj->rodata->min_block_ns = env.min_block_time; + obj->rodata->max_block_ns = env.max_block_time; + + /* User space PID and TID correspond to TGID and PID in the kernel, respectively */ + if (env.pids[0]) + obj->rodata->filter_by_tgid = true; + if (env.tids[0]) + obj->rodata->filter_by_pid = true; + + bpf_map__set_value_size(obj->maps.stackmap, + env.perf_max_stack_depth * sizeof(unsigned long)); + bpf_map__set_max_entries(obj->maps.stackmap, env.stack_storage_size); + + if (!probe_tp_btf("sched_switch")) + bpf_program__set_autoload(obj->progs.sched_switch, false); + else + bpf_program__set_autoload(obj->progs.sched_switch_raw, false); + + err = offcputime_bpf__load(obj); + if (err) { + fprintf(stderr, "failed to load BPF programs\n"); + goto cleanup; + } + + if (env.pids[0]) { + /* User pids_fd points to the tgids map in the BPF program */ + int pids_fd = bpf_map__fd(obj->maps.tgids); + for (i = 0; i < MAX_PID_NR && env.pids[i]; i++) { + if (bpf_map_update_elem(pids_fd, &(env.pids[i]), &val, BPF_ANY) != 0) { + fprintf(stderr, "failed to init pids map: %s\n", strerror(errno)); + goto cleanup; + } + } + } + if (env.tids[0]) { + /* User tids_fd points to the pids map in the BPF program */ + int tids_fd = bpf_map__fd(obj->maps.pids); + for (i = 0; i < MAX_TID_NR && env.tids[i]; i++) { + if (bpf_map_update_elem(tids_fd, &(env.tids[i]), &val, BPF_ANY) != 0) { + fprintf(stderr, "failed to init tids map: %s\n", strerror(errno)); + goto cleanup; + } + } + } + + err = offcputime_bpf__attach(obj); + if (err) { + fprintf(stderr, "failed to attach BPF programs\n"); + goto cleanup; + } + + symbolizer = blaze_symbolizer_new(); + if (!symbolizer) { + fprintf(stderr, "Failed to create a symbolizer\n"); + err = 1; + goto cleanup; + } + + signal(SIGINT, sig_handler); + + print_headers(); + + sleep(env.duration); + + /* Get traces from info map and print them to stdout */ + print_map(obj); + +cleanup: + blaze_symbolizer_free(symbolizer); + offcputime_bpf__destroy(obj); + + return err != 0; +} diff --git a/src/32-wallclock-profiler/offcputime.h b/src/32-wallclock-profiler/offcputime.h new file mode 100644 index 00000000..2bcd0d0e --- /dev/null +++ b/src/32-wallclock-profiler/offcputime.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __OFFCPUTIME_H +#define __OFFCPUTIME_H + +#define TASK_COMM_LEN 16 +#define MAX_PID_NR 30 +#define MAX_TID_NR 30 + +struct key_t { + __u32 pid; + __u32 tgid; + int user_stack_id; + int kern_stack_id; +}; + +struct val_t { + __u64 delta; + char comm[TASK_COMM_LEN]; +}; + +#endif /* __OFFCPUTIME_H */ diff --git a/src/32-wallclock-profiler/oncputime.bpf.c b/src/32-wallclock-profiler/oncputime.bpf.c new file mode 100644 index 00000000..eb8af432 --- /dev/null +++ b/src/32-wallclock-profiler/oncputime.bpf.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Copyright (c) 2022 
LG Electronics
+ *
+ * Based on profile from BCC by Brendan Gregg and others.
+ * 28-Dec-2021   Eunseon Lee   Created this.
+ */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+#include "oncputime.h"
+
+#define EEXIST 17
+
+const volatile bool kernel_stacks_only = false;
+const volatile bool user_stacks_only = false;
+const volatile bool include_idle = false;
+const volatile bool filter_by_pid = false;
+const volatile bool filter_by_tid = false;
+const volatile bool use_pidns = false;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__type(key, u32);
+} stackmap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct key_t);
+	__type(value, u64);
+	__uint(max_entries, MAX_ENTRIES);
+} counts SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u32);
+	__type(value, u8);
+	__uint(max_entries, MAX_PID_NR);
+} pids SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u32);
+	__type(value, u8);
+	__uint(max_entries, MAX_TID_NR);
+} tids SEC(".maps");
+
+
+static __always_inline void *
+bpf_map_lookup_or_try_init(void *map, const void *key, const void *init)
+{
+	void *val;
+	/* bpf helper functions like bpf_map_update_elem() below normally return
+	 * long, but using int instead of long to store the result is a workaround
+	 * to avoid incorrectly evaluating err in cases where the following criteria
+	 * is met:
+	 *     the architecture is 64-bit
+	 *     the helper function return type is long
+	 *     the helper function returns the value of a call to a bpf_map_ops func
+	 *     the bpf_map_ops function return type is int
+	 *     the compiler inlines the helper function
+	 *     the compiler does not sign extend the result of the bpf_map_ops func
+	 *
+	 * if this criteria is met, at best an error can only be checked as zero or
+	 * non-zero. it will not be possible to check for a negative value or a
+	 * specific error value. this is because the sign bit would have been stuck
+	 * at the 32nd bit of a 64-bit long int.
+	 */
+	int err;
+
+	val = bpf_map_lookup_elem(map, key);
+	if (val)
+		return val;
+
+	err = bpf_map_update_elem(map, key, init, BPF_NOEXIST);
+	if (err && err != -EEXIST)
+		return 0;
+
+	return bpf_map_lookup_elem(map, key);
+}
+
+SEC("perf_event")
+int do_perf_event(struct bpf_perf_event_data *ctx)
+{
+	u64 *valp;
+	static const u64 zero;
+	struct key_t key = {};
+	u64 id;
+	u32 pid;
+	u32 tid;
+	struct bpf_pidns_info ns = {};
+
+	id = bpf_get_current_pid_tgid();
+	pid = id >> 32;
+	tid = id;
+
+	if (!include_idle && tid == 0)
+		return 0;
+
+	if (filter_by_pid && !bpf_map_lookup_elem(&pids, &pid))
+		return 0;
+
+	if (filter_by_tid && !bpf_map_lookup_elem(&tids, &tid))
+		return 0;
+
+	key.pid = pid;
+	bpf_get_current_comm(&key.name, sizeof(key.name));
+
+	if (user_stacks_only)
+		key.kern_stack_id = -1;
+	else
+		key.kern_stack_id = bpf_get_stackid(&ctx->regs, &stackmap, 0);
+
+	if (kernel_stacks_only)
+		key.user_stack_id = -1;
+	else
+		key.user_stack_id = bpf_get_stackid(&ctx->regs, &stackmap,
+						    BPF_F_USER_STACK);
+
+	valp = bpf_map_lookup_or_try_init(&counts, &key, &zero);
+	if (valp)
+		__sync_fetch_and_add(valp, 1);
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/src/32-wallclock-profiler/oncputime.c b/src/32-wallclock-profiler/oncputime.c
new file mode 100644
index 00000000..0bd958e2
--- /dev/null
+++ b/src/32-wallclock-profiler/oncputime.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/*
+ * profile    Profile CPU usage by sampling stack traces at a timed interval.
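+ *
+ * How sampling works, roughly: a PERF_COUNT_SW_CPU_CLOCK perf event is
+ * opened on each CPU (or the one selected with -C) at `sample_freq` Hz,
+ * and the do_perf_event BPF program is attached to it; each firing records
+ * one (pid, comm, user stack, kernel stack) sample into the `counts` map,
+ * which user space then sorts and prints.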
+ * Copyright (c) 2022 LG Electronics
+ *
+ * Based on profile from BCC by Brendan Gregg and others.
+ * 28-Dec-2021   Eunseon Lee   Created this.
+ */
+#include <argp.h>
+#include <errno.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "oncputime.h"
+#include "oncputime.skel.h"
+#include "blazesym.h"
+#include "arg_parse.h"
+
+#define SYM_INFO_LEN 2048
+
+/*
+ * -EFAULT in get_stackid normally means the stack-trace is not available,
+ * such as getting kernel stack trace in user mode
+ */
+#define STACK_ID_EFAULT(stack_id) (stack_id == -EFAULT)
+
+#define STACK_ID_ERR(stack_id) ((stack_id < 0) && !STACK_ID_EFAULT(stack_id))
+
+/* hash collision (-EEXIST) suggests that stack map size may be too small */
+#define CHECK_STACK_COLLISION(ustack_id, kstack_id) \
+	(kstack_id == -EEXIST || ustack_id == -EEXIST)
+
+#define MISSING_STACKS(ustack_id, kstack_id) \
+	(!env.user_stacks_only && STACK_ID_ERR(kstack_id)) + (!env.kernel_stacks_only && STACK_ID_ERR(ustack_id))
+
+/* This structure combines key_t and count which should be sorted together */
+struct key_ext_t {
+	struct key_t k;
+	__u64 v;
+};
+
+static blaze_symbolizer *symbolizer;
+
+static int nr_cpus;
+
+static int open_and_attach_perf_event(struct bpf_program *prog,
+				      struct bpf_link *links[])
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_SOFTWARE,
+		.freq = env.freq,
+		.sample_freq = env.sample_freq,
+		.config = PERF_COUNT_SW_CPU_CLOCK,
+	};
+	int i, fd;
+
+	for (i = 0; i < nr_cpus; i++) {
+		if (env.cpu != -1 && env.cpu != i)
+			continue;
+
+		fd = syscall(__NR_perf_event_open, &attr, -1, i, -1, 0);
+		if (fd < 0) {
+			/* Ignore CPU that is offline */
+			if (errno == ENODEV)
+				continue;
+
+			fprintf(stderr, "failed to init perf sampling: %s\n",
+				strerror(errno));
+			return -1;
+		}
+
+		links[i] = bpf_program__attach_perf_event(prog, fd);
+		if (!links[i]) {
+			fprintf(stderr, "failed to attach perf event on cpu: "
+				"%d\n", i);
+			links[i] = NULL;
+			close(fd);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !env.verbose)
+		return 0;
+
+	return vfprintf(stderr, format, args);
+}
+
+static void sig_handler(int sig)
+{
+}
+
+static int cmp_counts(const void *a, const void *b)
+{
+	const __u64 x = ((struct key_ext_t *) a)->v;
+	const __u64 y = ((struct key_ext_t *) b)->v;
+
+	/* descending order; avoid truncating a 64-bit difference to int */
+	return x > y ? -1 : (x < y ? 1 : 0);
+}
+
+static int read_counts_map(int fd, struct key_ext_t *items, __u32 *count)
+{
+	struct key_t empty = {};
+	struct key_t *lookup_key = &empty;
+	int i = 0;
+	int err;
+
+	while (bpf_map_get_next_key(fd, lookup_key, &items[i].k) == 0) {
+		err = bpf_map_lookup_elem(fd, &items[i].k, &items[i].v);
+		if (err < 0) {
+			fprintf(stderr, "failed to lookup counts: %d\n", err);
+			return -err;
+		}
+
+		/* advance the cursor before skipping empty buckets, otherwise
+		 * a zero-valued entry would make this loop spin forever */
+		lookup_key = &items[i].k;
+
+		if (items[i].v == 0)
+			continue;
+
+		i++;
+	}
+
+	*count = i;
+	return 0;
+}
+
+static int print_count(struct key_t *event, __u64 count, int stack_map)
+{
+	unsigned long *ip;
+	bool has_kernel_stack, has_user_stack;
+
+	ip = calloc(env.perf_max_stack_depth, sizeof(unsigned long));
+	if (!ip) {
+		fprintf(stderr, "failed to alloc ip\n");
+		return -ENOMEM;
+	}
+
+	has_kernel_stack = !STACK_ID_EFAULT(event->kern_stack_id);
+	has_user_stack = !STACK_ID_EFAULT(event->user_stack_id);
+
+	if (!env.folded) {
+		/* multi-line stack output */
+		/* Show kernel stack first */
+		if (!env.user_stacks_only && has_kernel_stack) {
+			if 
(bpf_map_lookup_elem(stack_map, &event->kern_stack_id, ip) != 0) { + fprintf(stderr, " [Missed Kernel Stack]\n"); + } else { + show_stack_trace(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, 0); + } + } + + if (env.delimiter && !env.user_stacks_only && !env.kernel_stacks_only && + has_user_stack && has_kernel_stack) { + printf(" --\n"); + } + + /* Then show user stack */ + if (!env.kernel_stacks_only && has_user_stack) { + if (bpf_map_lookup_elem(stack_map, &event->user_stack_id, ip) != 0) { + fprintf(stderr, " [Missed User Stack]\n"); + } else { + show_stack_trace(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, event->pid); + } + } + + printf(" %-16s %s (%d)\n", "-", event->name, event->pid); + printf(" %lld\n", count); + } else { + /* folded stack output */ + printf("%s", event->name); + + /* Print user stack first for folded format */ + if (has_user_stack && !env.kernel_stacks_only) { + if (bpf_map_lookup_elem(stack_map, &event->user_stack_id, ip) != 0) { + printf(";[Missed User Stack]"); + } else { + printf(";"); + show_stack_trace_folded(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, event->pid, ';', true); + } + } + + /* Then print kernel stack if it exists */ + if (has_kernel_stack && !env.user_stacks_only) { + /* Add delimiter between user and kernel stacks if needed */ + if (has_user_stack && env.delimiter && !env.kernel_stacks_only) + printf("-"); + + if (bpf_map_lookup_elem(stack_map, &event->kern_stack_id, ip) != 0) { + printf(";[Missed Kernel Stack]"); + } else { + printf(";"); + show_stack_trace_folded(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, 0, ';', true); + } + } + + printf(" %lld\n", count); + } + + free(ip); + + return 0; +} + +static int print_counts(int counts_map, int stack_map) +{ + struct key_ext_t *counts; + struct key_t *event; + __u64 count; + __u32 nr_count = MAX_ENTRIES; + size_t nr_missing_stacks = 0; + bool has_collision = false; + int i, ret = 0; + + counts = calloc(MAX_ENTRIES, sizeof(struct key_ext_t)); + if (!counts) { + fprintf(stderr, "Out of memory\n"); + return -ENOMEM; + } + + ret = read_counts_map(counts_map, counts, &nr_count); + if (ret) + goto cleanup; + + qsort(counts, nr_count, sizeof(struct key_ext_t), cmp_counts); + + for (i = 0; i < nr_count; i++) { + event = &counts[i].k; + count = counts[i].v; + + print_count(event, count, stack_map); + + /* Add a newline between stack traces for better readability */ + if (!env.folded && i < nr_count - 1) + printf("\n"); + + /* handle stack id errors */ + nr_missing_stacks += MISSING_STACKS(event->user_stack_id, event->kern_stack_id); + has_collision = CHECK_STACK_COLLISION(event->user_stack_id, event->kern_stack_id); + } + + if (nr_missing_stacks > 0) { + fprintf(stderr, "WARNING: %zu stack traces could not be displayed.%s\n", + nr_missing_stacks, has_collision ? + " Consider increasing --stack-storage-size.":""); + } + +cleanup: + free(counts); + + return ret; +} + +static void print_headers() +{ + int i; + + if (env.folded) + return; // Don't print headers in folded format + + printf("Sampling at %d Hertz of", env.sample_freq); + + if (env.pids[0]) { + printf(" PID ["); + for (i = 0; i < MAX_PID_NR && env.pids[i]; i++) + printf("%d%s", env.pids[i], (i < MAX_PID_NR - 1 && env.pids[i + 1]) ? ", " : "]"); + } else if (env.tids[0]) { + printf(" TID ["); + for (i = 0; i < MAX_TID_NR && env.tids[i]; i++) + printf("%d%s", env.tids[i], (i < MAX_TID_NR - 1 && env.tids[i + 1]) ? 
", " : "]"); + } else { + printf(" all threads"); + } + + if (env.user_stacks_only) + printf(" by user"); + else if (env.kernel_stacks_only) + printf(" by kernel"); + else + printf(" by user + kernel"); + + if (env.cpu != -1) + printf(" on CPU#%d", env.cpu); + + if (env.duration < INT_MAX) + printf(" for %d secs.\n", env.duration); + else + printf("... Hit Ctrl-C to end.\n"); +} + +int main(int argc, char **argv) +{ + struct bpf_link *links[MAX_CPU_NR] = {}; + struct oncputime_bpf *obj; + int pids_fd, tids_fd; + int err, i; + __u8 val = 0; + + err = parse_common_args(argc, argv, TOOL_PROFILE); + if (err) + return err; + + err = validate_common_args(); + if (err) + return err; + + libbpf_set_print(libbpf_print_fn); + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + printf("failed to get # of possible cpus: '%s'!\n", + strerror(-nr_cpus)); + return 1; + } + if (nr_cpus > MAX_CPU_NR) { + fprintf(stderr, "the number of cpu cores is too big, please " + "increase MAX_CPU_NR's value and recompile"); + return 1; + } + + symbolizer = blaze_symbolizer_new(); + if (!symbolizer) { + fprintf(stderr, "Failed to create a blazesym symbolizer\n"); + return 1; + } + + obj = oncputime_bpf__open(); + if (!obj) { + fprintf(stderr, "failed to open BPF object\n"); + blaze_symbolizer_free(symbolizer); + return 1; + } + + /* initialize global data (filtering options) */ + obj->rodata->user_stacks_only = env.user_stacks_only; + obj->rodata->kernel_stacks_only = env.kernel_stacks_only; + obj->rodata->include_idle = env.include_idle; + if (env.pids[0]) + obj->rodata->filter_by_pid = true; + else if (env.tids[0]) + obj->rodata->filter_by_tid = true; + + bpf_map__set_value_size(obj->maps.stackmap, + env.perf_max_stack_depth * sizeof(unsigned long)); + bpf_map__set_max_entries(obj->maps.stackmap, env.stack_storage_size); + + err = oncputime_bpf__load(obj); + if (err) { + fprintf(stderr, "failed to load BPF programs\n"); + goto cleanup; + } + + if (env.pids[0]) { + pids_fd = bpf_map__fd(obj->maps.pids); + for (i = 0; i < MAX_PID_NR && env.pids[i]; i++) { + if (bpf_map_update_elem(pids_fd, &(env.pids[i]), &val, BPF_ANY) != 0) { + fprintf(stderr, "failed to init pids map: %s\n", strerror(errno)); + goto cleanup; + } + } + } + else if (env.tids[0]) { + tids_fd = bpf_map__fd(obj->maps.tids); + for (i = 0; i < MAX_TID_NR && env.tids[i]; i++) { + if (bpf_map_update_elem(tids_fd, &(env.tids[i]), &val, BPF_ANY) != 0) { + fprintf(stderr, "failed to init tids map: %s\n", strerror(errno)); + goto cleanup; + } + } + } + + err = open_and_attach_perf_event(obj->progs.do_perf_event, links); + if (err) + goto cleanup; + + signal(SIGINT, sig_handler); + + if (!env.folded) + print_headers(); + + /* + * We'll get sleep interrupted when someone presses Ctrl-C. 
+	 * (which will be "handled" with noop by sig_handler)
+	 */
+	sleep(env.duration);
+
+	print_counts(bpf_map__fd(obj->maps.counts),
+		     bpf_map__fd(obj->maps.stackmap));
+
+cleanup:
+	if (env.cpu != -1)
+		bpf_link__destroy(links[env.cpu]);
+	else {
+		for (i = 0; i < nr_cpus; i++)
+			bpf_link__destroy(links[i]);
+	}
+
+	blaze_symbolizer_free(symbolizer);
+	oncputime_bpf__destroy(obj);
+
+	return err != 0;
+}
diff --git a/src/32-wallclock-profiler/oncputime.h b/src/32-wallclock-profiler/oncputime.h
new file mode 100644
index 00000000..c0877011
--- /dev/null
+++ b/src/32-wallclock-profiler/oncputime.h
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#ifndef __ONCPUTIME_H
+#define __ONCPUTIME_H
+
+#define TASK_COMM_LEN 16
+#define MAX_CPU_NR 128
+#define MAX_ENTRIES 10240
+#define MAX_PID_NR 30
+#define MAX_TID_NR 30
+
+struct key_t {
+	__u32 pid;
+	int user_stack_id;
+	int kern_stack_id;
+	char name[TASK_COMM_LEN];
+};
+
+#endif /* __ONCPUTIME_H */
diff --git a/src/32-wallclock-profiler/tests/demo_results.svg b/src/32-wallclock-profiler/tests/demo_results.svg
new file mode 100644
index 00000000..34c78c51
--- /dev/null
+++ b/src/32-wallclock-profiler/tests/demo_results.svg
@@ -0,0 +1,550 @@
+[interactive flamegraph SVG titled "Combined On-CPU and Off-CPU Profile" (1,462 samples; frames carry "_[c]"/"_[o]" suffixes with per-function sample counts); SVG markup not reproduced here]
diff --git a/src/32-wallclock-profiler/tests/double_bandwidth.c b/src/32-wallclock-profiler/tests/double_bandwidth.c
new file mode 100644
index 00000000..e0b7a116
--- /dev/null
+++ b/src/32-wallclock-profiler/tests/double_bandwidth.c
@@ -0,0 +1,203 @@
+/**
+ * double_bandwidth.c - CXL double bandwidth microbenchmark
+ *
+ * This microbenchmark measures the bandwidth of CXL memory
+ * using reader and writer threads.
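+ *
+ * Bandwidth math used below: every loop iteration adds block_size to the
+ * thread's bytes_processed, and MB/s = bytes_processed / (1024 * 1024) /
+ * elapsed_seconds; e.g. 5 GiB copied in 10 s reports 512 MB/s.
+ *
+ * (Tutorial note: in this directory the benchmark mainly serves as a
+ * memory-bound workload to exercise the on-/off-CPU profilers above.)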
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdatomic.h>
+#include <pthread.h>
+#include <time.h>
+#include <unistd.h>
+
+typedef struct {
+    size_t bytes_processed;
+    size_t operations;
+} ThreadStats;
+
+typedef struct {
+    void* buffer;
+    size_t buffer_size;
+    size_t block_size;
+    atomic_bool* stop_flag;
+    ThreadStats* stats;
+} ThreadArgs;
+
+void* reader_thread(void* arg) {
+    ThreadArgs* args = (ThreadArgs*)arg;
+    char* local_buffer = (char*)malloc(args->block_size);
+    if (!local_buffer) return NULL;
+
+    size_t offset = 0;
+
+    while (!atomic_load(args->stop_flag)) {
+        // Read block from the buffer
+        memcpy(local_buffer, (char*)args->buffer + offset, args->block_size);
+
+        // Move to next block with wrap-around
+        offset = (offset + args->block_size) % (args->buffer_size - args->block_size);
+
+        // Update statistics
+        args->stats->bytes_processed += args->block_size;
+        args->stats->operations++;
+    }
+
+    free(local_buffer);
+    return NULL;
+}
+
+void* writer_thread(void* arg) {
+    ThreadArgs* args = (ThreadArgs*)arg;
+    char* local_buffer = (char*)malloc(args->block_size);
+    if (!local_buffer) return NULL;
+
+    // Fill with 'W' for writers
+    memset(local_buffer, 'W', args->block_size);
+    size_t offset = 0;
+
+    while (!atomic_load(args->stop_flag)) {
+        // Write block to the buffer
+        memcpy((char*)args->buffer + offset, local_buffer, args->block_size);
+        // for (size_t i = 0; i < args->block_size; i++) {
+        //     ((char*)args->buffer)[offset + i] = local_buffer[i];
+        // }
+
+        // Move to next block with wrap-around
+        offset = (offset + args->block_size) % (args->buffer_size - args->block_size);
+
+        // Update statistics
+        args->stats->bytes_processed += args->block_size;
+        args->stats->operations++;
+    }
+
+    free(local_buffer);
+    return NULL;
+}
+
+int main() {
+    // Basic configuration
+    size_t buffer_size = 1 * 1024 * 1024 * 1024UL;  // 1GB
+    size_t block_size = 4096;                       // 4KB
+    int duration = 100;                             // 100 seconds
+    int num_readers = 2;                            // 2 reader threads
+    int num_writers = 2;                            // 2 writer threads
+    int total_threads = num_readers + num_writers;
+
+    printf("=== CXL Double Bandwidth Microbenchmark ===\n");
+    printf("Buffer size: %zu bytes\n", buffer_size);
+    printf("Block size: %zu bytes\n", block_size);
+    printf("Duration: %d seconds\n", duration);
+    printf("Reader threads: %d\n", num_readers);
+    printf("Writer threads: %d\n", num_writers);
+    printf("\nStarting benchmark...\n");
+
+    // Allocate memory buffer
+    void* buffer = aligned_alloc(4096, buffer_size);
+    if (!buffer) {
+        fprintf(stderr, "Failed to allocate memory\n");
+        return 1;
+    }
+
+    // Initialize buffer with some data
+    memset(buffer, 'A', buffer_size);
+
+    // Prepare threads and resources
+    pthread_t* threads = (pthread_t*)malloc(total_threads * sizeof(pthread_t));
+    ThreadStats* thread_stats = (ThreadStats*)calloc(total_threads, sizeof(ThreadStats));
+    ThreadArgs* thread_args = (ThreadArgs*)malloc(total_threads * sizeof(ThreadArgs));
+    atomic_bool stop_flag = ATOMIC_VAR_INIT(0);
+
+    if (!threads || !thread_stats || !thread_args) {
+        fprintf(stderr, "Failed to allocate thread resources\n");
+        free(buffer);
+        return 1;
+    }
+
+    // Initialize thread arguments
+    for (int i = 0; i < total_threads; i++) {
+        thread_args[i].buffer = buffer;
+        thread_args[i].buffer_size = buffer_size;
+        thread_args[i].block_size = block_size;
+        thread_args[i].stop_flag = &stop_flag;
+        thread_args[i].stats = &thread_stats[i];
+    }
+
+    // Create reader threads
+    for (int i = 0; i < num_readers; i++) {
+        if (pthread_create(&threads[i], NULL, reader_thread, &thread_args[i]) != 0) {
+            fprintf(stderr, 
"Failed to create reader thread %d\n", i); + return 1; + } + } + + // Create writer threads + for (int i = 0; i < num_writers; i++) { + if (pthread_create(&threads[num_readers + i], NULL, writer_thread, &thread_args[num_readers + i]) != 0) { + fprintf(stderr, "Failed to create writer thread %d\n", i); + return 1; + } + } + + // Run the benchmark for the specified duration + struct timespec start_time, end_time; + clock_gettime(CLOCK_MONOTONIC, &start_time); + sleep(duration); + atomic_store(&stop_flag, 1); + clock_gettime(CLOCK_MONOTONIC, &end_time); + + // Wait for all threads to finish + for (int i = 0; i < total_threads; i++) { + pthread_join(threads[i], NULL); + } + + // Calculate elapsed time + double elapsed_seconds = (end_time.tv_sec - start_time.tv_sec) + + (end_time.tv_nsec - start_time.tv_nsec) / 1e9; + + // Calculate total stats + size_t total_read_bytes = 0; + size_t total_read_ops = 0; + size_t total_write_bytes = 0; + size_t total_write_ops = 0; + + for (int i = 0; i < num_readers; i++) { + total_read_bytes += thread_stats[i].bytes_processed; + total_read_ops += thread_stats[i].operations; + } + + for (int i = 0; i < num_writers; i++) { + total_write_bytes += thread_stats[num_readers + i].bytes_processed; + total_write_ops += thread_stats[num_readers + i].operations; + } + + // Print results + printf("\n=== Results ===\n"); + printf("Test duration: %.2f seconds\n", elapsed_seconds); + + double read_bandwidth_mbps = (total_read_bytes / (1024.0 * 1024.0)) / elapsed_seconds; + double read_iops = total_read_ops / elapsed_seconds; + printf("Read bandwidth: %.2f MB/s\n", read_bandwidth_mbps); + printf("Read IOPS: %.2f ops/s\n", read_iops); + + double write_bandwidth_mbps = (total_write_bytes / (1024.0 * 1024.0)) / elapsed_seconds; + double write_iops = total_write_ops / elapsed_seconds; + printf("Write bandwidth: %.2f MB/s\n", write_bandwidth_mbps); + printf("Write IOPS: %.2f ops/s\n", write_iops); + + double total_bandwidth_mbps = ((total_read_bytes + total_write_bytes) / (1024.0 * 1024.0)) / elapsed_seconds; + double total_iops = (total_read_ops + total_write_ops) / elapsed_seconds; + printf("Total bandwidth: %.2f MB/s\n", total_bandwidth_mbps); + printf("Total IOPS: %.2f ops/s\n", total_iops); + + // Clean up resources + free(threads); + free(thread_stats); + free(thread_args); + free(buffer); + + return 0; +} \ No newline at end of file diff --git a/src/32-wallclock-profiler/tests/double_bandwidth.cpp b/src/32-wallclock-profiler/tests/double_bandwidth.cpp new file mode 100644 index 00000000..4b0677d6 --- /dev/null +++ b/src/32-wallclock-profiler/tests/double_bandwidth.cpp @@ -0,0 +1,414 @@ +/** + * double_bandwidth.cpp - CXL double bandwidth microbenchmark + * + * This microbenchmark measures the bandwidth of CXL by adjusting + * the ratio of readers to writers, simulating bidirectional traffic. 
+ */
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <thread>
+#include <atomic>
+#include <chrono>
+#include <functional>
+#include <cstring>
+#include <cstdlib>
+#include <cerrno>
+#include <getopt.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+// Default parameters
+constexpr size_t DEFAULT_BUFFER_SIZE = 1 * 1024 * 1024 * 1024UL;  // 1GB
+constexpr size_t DEFAULT_BLOCK_SIZE = 4096;                       // 4KB
+constexpr int DEFAULT_DURATION = 1000;                            // seconds
+constexpr int DEFAULT_NUM_THREADS = 20;                           // total threads
+constexpr float DEFAULT_READ_RATIO = 0.5;                         // 50% readers, 50% writers
+
+struct ThreadStats {
+    size_t bytes_processed = 0;
+    size_t operations = 0;
+};
+
+struct BenchmarkConfig {
+    size_t buffer_size = DEFAULT_BUFFER_SIZE;
+    size_t block_size = DEFAULT_BLOCK_SIZE;
+    int duration = DEFAULT_DURATION;
+    int num_threads = DEFAULT_NUM_THREADS;
+    float read_ratio = DEFAULT_READ_RATIO;
+    std::string device_path;
+    bool use_mmap = false;
+    bool is_cxl_mem = false;
+};
+
+void print_usage(const char* prog_name) {
+    std::cerr << "Usage: " << prog_name << " [OPTIONS]\n"
+              << "Options:\n"
+              << "  -b, --buffer-size=SIZE   Total buffer size in bytes (default: 1GB)\n"
+              << "  -s, --block-size=SIZE    Block size for read/write operations (default: 4KB)\n"
+              << "  -t, --threads=NUM        Total number of threads (default: 20)\n"
+              << "  -d, --duration=SECONDS   Test duration in seconds (default: 1000)\n"
+              << "  -r, --read-ratio=RATIO   Ratio of readers (0.0-1.0, default: 0.5)\n"
+              << "  -D, --device=PATH        CXL device path (if not specified, memory is used)\n"
+              << "  -m, --mmap               Use mmap instead of read/write syscalls\n"
+              << "  -c, --cxl-mem            Indicate the device is CXL memory\n"
+              << "  -h, --help               Show this help message\n";
+}
+
+BenchmarkConfig parse_args(int argc, char* argv[]) {
+    BenchmarkConfig config;
+
+    static struct option long_options[] = {
+        {"buffer-size", required_argument, 0, 'b'},
+        {"block-size", required_argument, 0, 's'},
+        {"threads", required_argument, 0, 't'},
+        {"duration", required_argument, 0, 'd'},
+        {"read-ratio", required_argument, 0, 'r'},
+        {"device", required_argument, 0, 'D'},
+        {"mmap", no_argument, 0, 'm'},
+        {"cxl-mem", no_argument, 0, 'c'},
+        {"help", no_argument, 0, 'h'},
+        {0, 0, 0, 0}
+    };
+
+    int opt, option_index = 0;
+    while ((opt = getopt_long(argc, argv, "b:s:t:d:r:D:mch", long_options, &option_index)) != -1) {
+        switch (opt) {
+        case 'b':
+            config.buffer_size = std::stoull(optarg);
+            break;
+        case 's':
+            config.block_size = std::stoull(optarg);
+            break;
+        case 't':
+            config.num_threads = std::stoi(optarg);
+            break;
+        case 'd':
+            config.duration = std::stoi(optarg);
+            break;
+        case 'r':
+            config.read_ratio = std::stof(optarg);
+            if (config.read_ratio < 0.0 || config.read_ratio > 1.0) {
+                std::cerr << "Read ratio must be between 0.0 and 1.0\n";
+                exit(1);
+            }
+            break;
+        case 'D':
+            config.device_path = optarg;
+            break;
+        case 'm':
+            config.use_mmap = true;
+            break;
+        case 'c':
+            config.is_cxl_mem = true;
+            break;
+        case 'h':
+            print_usage(argv[0]);
+            exit(0);
+        default:
+            print_usage(argv[0]);
+            exit(1);
+        }
+    }
+
+    return config;
+}
+
+void reader_thread(void* buffer, size_t buffer_size, size_t block_size,
+                   std::atomic<bool>& stop_flag, ThreadStats& stats) {
+    std::vector<char> local_buffer(block_size);
+    size_t offset = 0;
+
+    while (!stop_flag.load(std::memory_order_relaxed)) {
+        // Read block from the buffer
+        std::memcpy(local_buffer.data(), static_cast<char*>(buffer) + offset, block_size);
+
+        // Move to next block with wrap-around
+        offset = (offset + block_size) % (buffer_size - block_size);
+
+        // Update statistics
+        stats.bytes_processed += block_size;
+        stats.operations++;
+    }
+}
+
+void writer_thread(void* buffer, size_t buffer_size, size_t block_size,
+                   std::atomic<bool>& stop_flag, ThreadStats& stats) {
+    std::vector<char> local_buffer(block_size, 'W');  // Fill with 'W' for writers
+    size_t offset = 0;
+
+    while (!stop_flag.load(std::memory_order_relaxed)) {
+        // Write block to the buffer
+        std::memcpy(static_cast<char*>(buffer) + offset, local_buffer.data(), block_size);
+
+        // Move to next block with wrap-around
+        offset = (offset + block_size) % (buffer_size - block_size);
+
+        // Update statistics
+        stats.bytes_processed += block_size;
+        stats.operations++;
+    }
+}
+
+void device_reader_thread(int fd, size_t file_size, size_t block_size,
+                          std::atomic<bool>& stop_flag, ThreadStats& stats) {
+    std::vector<char> local_buffer(block_size);
+    size_t offset = 0;
+
+    while (!stop_flag.load(std::memory_order_relaxed)) {
+        // Seek to the position
+        lseek(fd, offset, SEEK_SET);
+
+        // Read block from the device
+        ssize_t bytes_read = read(fd, local_buffer.data(), block_size);
+        if (bytes_read <= 0) {
+            std::cerr << "Error reading from device: " << strerror(errno) << std::endl;
+            break;
+        }
+
+        // Move to next block with wrap-around
+        offset = (offset + block_size) % (file_size - block_size);
+
+        // Update statistics
+        stats.bytes_processed += bytes_read;
+        stats.operations++;
+    }
+}
+
+void device_writer_thread(int fd, size_t file_size, size_t block_size,
+                          std::atomic<bool>& stop_flag, ThreadStats& stats) {
+    std::vector<char> local_buffer(block_size, 'W');  // Fill with 'W' for writers
+    size_t offset = 0;
+
+    while (!stop_flag.load(std::memory_order_relaxed)) {
+        // Seek to the position
+        lseek(fd, offset, SEEK_SET);
+
+        // Write block to the device
+        ssize_t bytes_written = write(fd, local_buffer.data(), block_size);
+        if (bytes_written <= 0) {
+            std::cerr << "Error writing to device: " << strerror(errno) << std::endl;
+            break;
+        }
+
+        // Move to next block with wrap-around
+        offset = (offset + block_size) % (file_size - block_size);
+
+        // Update statistics
+        stats.bytes_processed += bytes_written;
+        stats.operations++;
+    }
+}
+
+void mmap_reader_thread(void* mapped_area, size_t file_size, size_t block_size,
+                        std::atomic<bool>& stop_flag, ThreadStats& stats) {
+    std::vector<char> local_buffer(block_size);
+    size_t offset = 0;
+
+    while (!stop_flag.load(std::memory_order_relaxed)) {
+        // Read block from the mapped area
+        std::memcpy(local_buffer.data(), static_cast<char*>(mapped_area) + offset, block_size);
+
+        // Move to next block with wrap-around
+        offset = (offset + block_size) % (file_size - block_size);
+
+        // Update statistics
+        stats.bytes_processed += block_size;
+        stats.operations++;
+    }
+}
+
+void mmap_writer_thread(void* mapped_area, size_t file_size, size_t block_size,
+                        std::atomic<bool>& stop_flag, ThreadStats& stats) {
+    std::vector<char> local_buffer(block_size, 'W');  // Fill with 'W' for writers
+    size_t offset = 0;
+
+    while (!stop_flag.load(std::memory_order_relaxed)) {
+        // Write block to the mapped area
+        std::memcpy(static_cast<char*>(mapped_area) + offset, local_buffer.data(), block_size);
+
+        // Move to next block with wrap-around
+        offset = (offset + block_size) % (file_size - block_size);
+
+        // Update statistics
+        stats.bytes_processed += block_size;
+        stats.operations++;
+    }
+}
+
+int main(int argc, char* argv[]) {
+    BenchmarkConfig config = parse_args(argc, argv);
+
+    // Calculate reader and writer thread counts
+    int num_readers = static_cast<int>(config.num_threads * config.read_ratio);
+    int num_writers = config.num_threads - num_readers;
+
+    std::cout << "=== CXL Double Bandwidth Microbenchmark ===" << std::endl;
+    std::cout << "Buffer size: " << config.buffer_size << " bytes" << std::endl;
+    std::cout << "Block size: " << config.block_size << " bytes" << std::endl;
+    std::cout << "Duration: " << config.duration << " seconds" << std::endl;
+    std::cout << "Total threads: " << config.num_threads << std::endl;
+    std::cout << "Read ratio: " << config.read_ratio << " (" << num_readers << " readers, "
+              << num_writers << " writers)" << std::endl;
+
+    if (!config.device_path.empty()) {
+        std::cout << "Device: " << config.device_path << std::endl;
+        std::cout << "Access method: " << (config.use_mmap ? "mmap" : "read/write") << std::endl;
+        std::cout << "Memory type: " << (config.is_cxl_mem ? "CXL memory" : "Regular device") << std::endl;
+    } else {
+        std::cout << "Using system memory for testing" << std::endl;
+    }
+
+    std::cout << "\nStarting benchmark..." << std::endl;
+
+    // Prepare threads and resources
+    std::vector<std::thread> threads;
+    std::vector<ThreadStats> thread_stats(config.num_threads);
+    std::atomic<bool> stop_flag(false);
+
+    void* buffer = nullptr;
+    int fd = -1;
+    void* mapped_area = nullptr;
+
+    try {
+        if (config.device_path.empty()) {
+            // Allocate memory buffer
+            buffer = aligned_alloc(4096, config.buffer_size);
+            if (!buffer) {
+                std::cerr << "Failed to allocate memory: " << strerror(errno) << std::endl;
+                return 1;
+            }
+
+            // Initialize buffer with some data
+            std::memset(buffer, 'A', config.buffer_size);
+
+            // Create reader and writer threads for memory
+            for (int i = 0; i < num_readers; i++) {
+                threads.emplace_back(reader_thread, buffer, config.buffer_size,
+                                     config.block_size, std::ref(stop_flag),
+                                     std::ref(thread_stats[i]));
+            }
+
+            for (int i = 0; i < num_writers; i++) {
+                threads.emplace_back(writer_thread, buffer, config.buffer_size,
+                                     config.block_size, std::ref(stop_flag),
+                                     std::ref(thread_stats[num_readers + i]));
+            }
+        } else {
+            // Open the device
+            fd = open(config.device_path.c_str(), O_RDWR | O_DIRECT);
+            if (fd < 0) {
+                std::cerr << "Failed to open device " << config.device_path
+                          << ": " << strerror(errno) << std::endl;
+                return 1;
+            }
+
+            if (config.use_mmap) {
+                // Use mmap for device access
+                mapped_area = mmap(NULL, config.buffer_size, PROT_READ | PROT_WRITE,
+                                   MAP_SHARED, fd, 0);
+                if (mapped_area == MAP_FAILED) {
+                    std::cerr << "Failed to mmap device: " << strerror(errno) << std::endl;
+                    close(fd);
+                    return 1;
+                }
+
+                // Create reader and writer threads for mmap
+                for (int i = 0; i < num_readers; i++) {
+                    threads.emplace_back(mmap_reader_thread, mapped_area, config.buffer_size,
+                                         config.block_size, std::ref(stop_flag),
+                                         std::ref(thread_stats[i]));
+                }
+
+                for (int i = 0; i < num_writers; i++) {
+                    threads.emplace_back(mmap_writer_thread, mapped_area, config.buffer_size,
+                                         config.block_size, std::ref(stop_flag),
+                                         std::ref(thread_stats[num_readers + i]));
+                }
+            } else {
+                // Use read/write for device access
+                for (int i = 0; i < num_readers; i++) {
+                    threads.emplace_back(device_reader_thread, fd, config.buffer_size,
+                                         config.block_size, std::ref(stop_flag),
+                                         std::ref(thread_stats[i]));
+                }
+
+                for (int i = 0; i < num_writers; i++) {
+                    threads.emplace_back(device_writer_thread, fd, config.buffer_size,
+                                         config.block_size, std::ref(stop_flag),
+                                         std::ref(thread_stats[num_readers + i]));
+                }
+            }
+        }
+
+        // Run the benchmark for the specified duration
+        auto start_time = std::chrono::steady_clock::now();
+        std::this_thread::sleep_for(std::chrono::seconds(config.duration));
+        stop_flag.store(true, std::memory_order_relaxed);
+        auto end_time = std::chrono::steady_clock::now();
+
+        // Wait for all threads to finish
+        for (auto& t : threads) {
+            if (t.joinable()) {
+                t.join();
+            }
+        }
+
+        // Calculate total stats
+        double elapsed_seconds = std::chrono::duration<double>(end_time - start_time).count();
+        size_t total_read_bytes = 0;
+        size_t total_read_ops = 0;
+        size_t total_write_bytes = 0;
+        size_t total_write_ops = 0;
+
+        for (int i = 0; i < num_readers; i++) {
+            total_read_bytes += thread_stats[i].bytes_processed;
+            total_read_ops += thread_stats[i].operations;
+        }
+
+        for (int i = 0; i < num_writers; i++) {
+            total_write_bytes += thread_stats[num_readers + i].bytes_processed;
+            total_write_ops += thread_stats[num_readers + i].operations;
+        }
+
+        // Print results
+        std::cout << "\n=== Results ===" << std::endl;
+        std::cout << "Test duration: " << elapsed_seconds << " seconds" << std::endl;
+
+        if (num_readers > 0) {
+            double read_bandwidth_mbps = (total_read_bytes / (1024.0 * 1024.0)) / elapsed_seconds;
+            double read_iops = total_read_ops / elapsed_seconds;
+            std::cout << "Read bandwidth: " << read_bandwidth_mbps << " MB/s" << std::endl;
+            std::cout << "Read IOPS: " << read_iops << " ops/s" << std::endl;
+        }
+
+        if (num_writers > 0) {
+            double write_bandwidth_mbps = (total_write_bytes / (1024.0 * 1024.0)) / elapsed_seconds;
+            double write_iops = total_write_ops / elapsed_seconds;
+            std::cout << "Write bandwidth: " << write_bandwidth_mbps << " MB/s" << std::endl;
+            std::cout << "Write IOPS: " << write_iops << " ops/s" << std::endl;
+        }
+
+        double total_bandwidth_mbps = ((total_read_bytes + total_write_bytes) / (1024.0 * 1024.0)) / elapsed_seconds;
+        double total_iops = (total_read_ops + total_write_ops) / elapsed_seconds;
+        std::cout << "Total bandwidth: " << total_bandwidth_mbps << " MB/s" << std::endl;
+        std::cout << "Total IOPS: " << total_iops << " ops/s" << std::endl;
+
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+    }
+
+    // Clean up resources
+    if (mapped_area && mapped_area != MAP_FAILED) {
+        munmap(mapped_area, config.buffer_size);
+    }
+
+    if (fd >= 0) {
+        close(fd);
+    }
+
+    if (buffer) {
+        free(buffer);
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/src/32-wallclock-profiler/tests/example.svg b/src/32-wallclock-profiler/tests/example.svg
new file mode 100644
index 00000000..a63b142a
--- /dev/null
+++ b/src/32-wallclock-profiler/tests/example.svg
@@ -0,0 +1,2830 @@
+[interactive flamegraph SVG titled "Thread 1222177 (worker_3_1222177)" (2,034 samples; frames carry "_[c]"/"_[o]" suffixes with per-function sample counts); SVG markup not reproduced here]
diff --git a/src/32-wallclock-profiler/tests/run_combined_test.sh b/src/32-wallclock-profiler/tests/run_combined_test.sh new file mode 100755 index 00000000..1047d0f3 --- /dev/null +++ b/src/32-wallclock-profiler/tests/run_combined_test.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -e + +echo "==============================================" +echo "Combined Profiler Demonstration" +echo "==============================================" +echo + +# Build the test program (keep frame pointers so the profiler gets full user stacks) +echo "🔨 Building test program..." +gcc -g -fno-omit-frame-pointer -o test_program tests/test_combined.c +echo "✅ Test program built successfully" +echo + +# Make the profiler script executable +chmod +x wallclock_profiler.py + +echo "🚀 Starting test program in background..." +./test_program & +TEST_PID=$! + +echo "📊 Test program PID: $TEST_PID" +echo "⏱️ Waiting 3 seconds for test program to stabilize..." +sleep 3 + +echo +echo "🔥 Starting combined profiling for 15 seconds..." +echo " This will capture both CPU usage and blocking behavior" +echo " Note: Requires root privileges for BPF access" +echo + +# Check if running as root +if [ "$EUID" -ne 0 ]; then + echo "⚠️ Not running as root. The profiler may fail to access BPF features." + echo " Consider running: sudo $0" + echo +fi + +# Run the combined profiler +python3 wallclock_profiler.py $TEST_PID -d 15 -f 99 -m 500 -o demo_results + +echo +echo "🧹 Cleaning up test program..." + +# Kill the test program gracefully +kill $TEST_PID 2>/dev/null || true +wait $TEST_PID 2>/dev/null || true + +echo "✅ Test program stopped" +echo +echo "==============================================" +echo "Demo Complete!" +echo "==============================================" +echo +echo "Generated files should include:" +echo "• demo_results.folded - Raw flamegraph data" +echo "• demo_results.svg - Interactive flamegraph visualization" +echo +echo "Open demo_results.svg in a web browser to explore the results!"
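+# Optional: the .folded file is plain text (one stack per line), so it can be
+# filtered and re-rendered with Brendan Gregg's flamegraph.pl — assuming the
+# FlameGraph scripts are installed and on PATH — e.g. to view only blocking time:
+#   grep 'offcpu:' demo_results.folded | flamegraph.pl > offcpu_only.svg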
+echo +echo "What to look for in the flamegraph:" +echo "• 'oncpu:' sections show CPU-intensive operations (cpu_work function)" +echo "• 'offcpu:' sections show blocking operations (usleep calls)" +echo "• Width indicates time spent - wider = more time" \ No newline at end of file diff --git a/src/32-wallclock-profiler/tests/run_memory_bandwidth.sh b/src/32-wallclock-profiler/tests/run_memory_bandwidth.sh new file mode 100755 index 00000000..b412eff5 --- /dev/null +++ b/src/32-wallclock-profiler/tests/run_memory_bandwidth.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +set -e + +echo "==============================================" +echo "Multi-threaded Profiler Test" +echo "==============================================" +echo + +# Build the multi-threaded test program +echo "🔨 Building multi-threaded test program..." +g++ -o test_multithread tests/double_bandwidth.cpp -g -fno-omit-frame-pointer -lpthread -lrt -lm -O3 +# gcc -o test_multithread tests/double_bandwidth.c -g -fno-omit-frame-pointer -lpthread -lrt -lm -O3 +# gcc -o test_multithread tests/test_multithread.c -lpthread -lrt -lm +echo "✅ Multi-threaded test program built successfully" +echo + +# Make the profiler script executable +chmod +x wallclock_profiler.py + +echo "🚀 Starting multi-threaded web server simulation..." +./test_multithread & +TEST_PID=$! + +echo "📊 Multi-threaded test PID: $TEST_PID" +echo "⏱️ Waiting 5 seconds for application to stabilize and generate load..." +sleep 5 + +echo +echo "🧵 Discovering threads..." +ps -T -p $TEST_PID +echo + +echo "🔥 Starting per-thread profiling for 20 seconds..." +echo " This will profile each thread individually with:" +echo " - On-CPU profiling to show CPU-intensive work" +echo " - Off-CPU profiling to show blocking I/O operations" +echo " - Individual flamegraphs for each thread" +echo + +# Check if running as root +if [ "$EUID" -ne 0 ]; then + echo "⚠️ Not running as root. The profiler may fail to access BPF features." + echo " Consider running: sudo $0" + echo +fi + +# Run the multi-threaded profiler +python3 wallclock_profiler.py $TEST_PID -d 60 -f 99 -m 1000 +# python3 wallclock_profiler.py $TEST_PID -d 30 -f 99 -m 1000 + + +echo +echo "🧹 Cleaning up test program..." + +# Kill the test program gracefully +kill $TEST_PID 2>/dev/null || true +wait $TEST_PID 2>/dev/null || true + +echo "✅ Test program stopped" +echo +echo "==============================================" +echo "Multi-threaded Profiling Test Complete!" 
+echo "==============================================" +echo +echo "Generated files should include:" +echo "• Individual .folded files for each thread" +echo "• Individual .svg flamegraphs for each thread" +echo "• Thread analysis summary file" +echo +echo "What to look for in the thread-specific flamegraphs:" +echo "🔧 Worker threads:" +echo " • CPU-intensive sections (compute work)" +echo " • I/O blocking sections (file operations, network calls)" +echo " • Database operation latencies" +echo +echo "📈 Request generator thread:" +echo " • Light CPU usage for request creation" +echo " • Some blocking on queue operations" +echo +echo "📊 Statistics monitor thread:" +echo " • Mostly sleeping (off-CPU time)" +echo " • Periodic wake-ups for statistics collection" +echo +echo "🎯 Analysis tips:" +echo " • Compare thread roles and their performance patterns" +echo " • Identify which threads are CPU-bound vs I/O-bound" +echo " • Look for synchronization bottlenecks" +echo " • Check for load balancing across worker threads" \ No newline at end of file diff --git a/src/32-wallclock-profiler/tests/run_multithread_test.sh b/src/32-wallclock-profiler/tests/run_multithread_test.sh new file mode 100755 index 00000000..260e15e2 --- /dev/null +++ b/src/32-wallclock-profiler/tests/run_multithread_test.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +set -e + +echo "==============================================" +echo "Multi-threaded Profiler Test" +echo "==============================================" +echo + +# Build the multi-threaded test program +echo "🔨 Building multi-threaded test program..." +gcc -o test_multithread tests/test_multithread.c -g -fno-omit-frame-pointer -lpthread -lrt -lm -O3 +echo "✅ Multi-threaded test program built successfully" +echo + +# Make the profiler script executable +chmod +x wallclock_profiler.py + +echo "🚀 Starting multi-threaded web server simulation..." +./test_multithread & +TEST_PID=$! + +echo "📊 Multi-threaded test PID: $TEST_PID" +echo "⏱️ Waiting 5 seconds for application to stabilize and generate load..." +sleep 5 + +echo +echo "🧵 Discovering threads..." +ps -T -p $TEST_PID +echo + +echo "🔥 Starting per-thread profiling for 20 seconds..." +echo " This will profile each thread individually with:" +echo " - On-CPU profiling to show CPU-intensive work" +echo " - Off-CPU profiling to show blocking I/O operations" +echo " - Individual flamegraphs for each thread" +echo + +# Check if running as root +if [ "$EUID" -ne 0 ]; then + echo "⚠️ Not running as root. The profiler may fail to access BPF features." + echo " Consider running: sudo $0" + echo +fi + +# Run the multi-threaded profiler +python3 wallclock_profiler.py $TEST_PID -d 20 -f 99 -m 1000 + +echo +echo "🧹 Cleaning up test program..." + +# Kill the test program gracefully +kill $TEST_PID 2>/dev/null || true +wait $TEST_PID 2>/dev/null || true + +echo "✅ Test program stopped" +echo +echo "==============================================" +echo "Multi-threaded Profiling Test Complete!" 
+echo "==============================================" +echo +echo "Generated files should include:" +echo "• Individual .folded files for each thread" +echo "• Individual .svg flamegraphs for each thread" +echo "• Thread analysis summary file" +echo +echo "What to look for in the thread-specific flamegraphs:" +echo "🔧 Worker threads:" +echo " • CPU-intensive sections (compute work)" +echo " • I/O blocking sections (file operations, network calls)" +echo " • Database operation latencies" +echo +echo "📈 Request generator thread:" +echo " • Light CPU usage for request creation" +echo " • Some blocking on queue operations" +echo +echo "📊 Statistics monitor thread:" +echo " • Mostly sleeping (off-CPU time)" +echo " • Periodic wake-ups for statistics collection" +echo +echo "🎯 Analysis tips:" +echo " • Compare thread roles and their performance patterns" +echo " • Identify which threads are CPU-bound vs I/O-bound" +echo " • Look for synchronization bottlenecks" +echo " • Check for load balancing across worker threads" \ No newline at end of file diff --git a/src/32-wallclock-profiler/tests/test_combined.c b/src/32-wallclock-profiler/tests/test_combined.c new file mode 100644 index 00000000..035e1846 --- /dev/null +++ b/src/32-wallclock-profiler/tests/test_combined.c @@ -0,0 +1,161 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +// Global statistics +static volatile int running = 1; +static long total_cpu_time_ns = 0; +static long total_sleep_time_ns = 0; +static int total_iterations = 0; +static struct timespec program_start; + +// Signal handler for clean exit +void signal_handler(int sig) { + running = 0; + printf("\n🛑 Interrupted! Generating profiling expectations...\n"); +} + +// Get current time in nanoseconds +long get_time_ns() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000L + ts.tv_nsec; +} + +// CPU-intensive function with configurable work +void cpu_work(int intensity) { + volatile long sum = 0; + int iterations = intensity * 10000000; // Scale work by intensity (1-10) + + for (int i = 0; i < iterations; i++) { + sum += i * i + (i % 3) * (i % 7); // More complex computation + if (i % 1000000 == 0 && !running) break; // Check for interruption + } +} + +// Blocking function with configurable sleep time +void blocking_work(int sleep_ms) { + usleep(sleep_ms * 1000); // Convert ms to microseconds +} + +// Perform one iteration of work and measure times +void do_work_iteration(int cpu_intensity, int sleep_ms) { + struct timespec start, end; + + // Measure CPU work + clock_gettime(CLOCK_MONOTONIC, &start); + cpu_work(cpu_intensity); + clock_gettime(CLOCK_MONOTONIC, &end); + long cpu_time = (end.tv_sec - start.tv_sec) * 1000000000L + (end.tv_nsec - start.tv_nsec); + + // Measure sleep time + clock_gettime(CLOCK_MONOTONIC, &start); + blocking_work(sleep_ms); + clock_gettime(CLOCK_MONOTONIC, &end); + long sleep_time = (end.tv_sec - start.tv_sec) * 1000000000L + (end.tv_nsec - start.tv_nsec); + + // Update statistics + total_cpu_time_ns += cpu_time; + total_sleep_time_ns += sleep_time; + total_iterations++; + + printf("Iteration %3d: CPU=%3ldms, Sleep=%3ldms, Total=%3ldms\n", + total_iterations, + cpu_time / 1000000, + sleep_time / 1000000, + (cpu_time + sleep_time) / 1000000); +} + +// Display current statistics and profiler expectations +void show_statistics() { + struct timespec current_time; + clock_gettime(CLOCK_MONOTONIC, ¤t_time); + + long program_runtime_ns = (current_time.tv_sec - 
program_start.tv_sec) * 1000000000L + + (current_time.tv_nsec - program_start.tv_nsec); + + double program_runtime_s = program_runtime_ns / 1000000000.0; + double total_cpu_s = total_cpu_time_ns / 1000000000.0; + double total_sleep_s = total_sleep_time_ns / 1000000000.0; + double measured_time_s = total_cpu_s + total_sleep_s; + + printf("\n============================================================\n"); + printf("📊 PROFILING STATISTICS (PID: %d)\n", getpid()); + printf("============================================================\n"); + printf("Program runtime: %.3f seconds\n", program_runtime_s); + printf("Total iterations: %d\n", total_iterations); + printf("Measured work time: %.3f seconds (%.1f%% of runtime)\n", + measured_time_s, (measured_time_s / program_runtime_s) * 100.0); + + printf("\n🔥 ON-CPU TIME (computation):\n"); + printf(" Total: %.3f seconds\n", total_cpu_s); + printf(" Percentage: %.1f%% of measured time\n", (total_cpu_s / measured_time_s) * 100.0); + printf(" Average per iter: %.1f ms\n", (total_cpu_s * 1000.0) / total_iterations); + + printf("\n❄️ OFF-CPU TIME (blocking/sleep):\n"); + printf(" Total: %.3f seconds\n", total_sleep_s); + printf(" Percentage: %.1f%% of measured time\n", (total_sleep_s / measured_time_s) * 100.0); + printf(" Average per iter: %.1f ms\n", (total_sleep_s * 1000.0) / total_iterations); + + printf("\n🎯 EXPECTED PROFILER OUTPUT:\n"); + printf(" Wall clock coverage: ~%.1f%% (if profiler captures full runtime)\n", + (measured_time_s / program_runtime_s) * 100.0); + printf(" On-CPU μs expected: ~%.0f μs\n", total_cpu_s * 1000000.0); + printf(" Off-CPU μs expected: ~%.0f μs\n", total_sleep_s * 1000000.0); + printf(" Stack traces: cpu_work_[c] and usleep→clock_nanosleep_[o]\n"); + + printf("\n💡 PROFILER COMMAND:\n"); + printf(" sudo python3 wallclock_profiler.py %d -d 10\n", getpid()); + printf("============================================================\n\n"); +} + +int main() { + // Set up signal handler + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + + // Record program start time + clock_gettime(CLOCK_MONOTONIC, &program_start); + + printf("🚀 WALLCLOCK PROFILER TEST PROGRAM\n"); + printf("=================================\n"); + printf("PID: %d\n", getpid()); + printf("This program alternates between CPU work and blocking operations\n"); + printf("Press Ctrl+C to stop and see profiling expectations\n"); + printf("=================================\n\n"); + + // Work parameters (can be adjusted for different test scenarios) + int cpu_intensity = 8; // 1-10 scale, affects computation time + int sleep_ms = 200; // milliseconds to sleep each iteration + + printf("Configuration:\n"); + printf(" CPU intensity: %d/10 (expect ~%dms per iteration)\n", cpu_intensity, cpu_intensity * 10); + printf(" Sleep time: %dms per iteration\n", sleep_ms); + + // Show statistics every 10 iterations + int stats_interval = 10; + + while (running) { + do_work_iteration(cpu_intensity, sleep_ms); + + // Show periodic statistics + if (total_iterations % stats_interval == 0) { + show_statistics(); + } + + // Slight variation to make it more realistic + if (total_iterations % 20 == 0) { + cpu_intensity = (cpu_intensity % 10) + 1; // Cycle intensity 1-10 + } + } + + // Final statistics + printf("\n🏁 FINAL STATISTICS:\n"); + show_statistics(); + + return 0; +} \ No newline at end of file diff --git a/src/32-wallclock-profiler/tests/test_multithread.c b/src/32-wallclock-profiler/tests/test_multithread.c new file mode 100644 index 00000000..d7a799c7 ---
/dev/null +++ b/src/32-wallclock-profiler/tests/test_multithread.c @@ -0,0 +1,573 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <signal.h> +#include <time.h> +#include <math.h> +#include <fcntl.h> +#include <errno.h> +#include <poll.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/eventfd.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +// Configuration +#define NUM_WORKER_THREADS 4 +#define REQUEST_QUEUE_SIZE 100 +#define SIMULATION_DURATION 6000 // seconds; long enough that the test scripts stop the process first +#define BUFFER_SIZE 4096 + +// Global state +volatile int running = 1; +pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t queue_cond = PTHREAD_COND_INITIALIZER; + +// Pipe for blocking operations +int blocking_pipe[2]; +int dummy_eventfd; + +// Request queue simulation +typedef struct { + int request_id; + int request_type; // 0-2=cpu (math/string/sort), 3-5=file_io, 6-8=network, 9-11=poll_wait + struct timespec arrival_time; +} request_t; + +request_t request_queue[REQUEST_QUEUE_SIZE]; +int queue_head = 0; +int queue_tail = 0; +int queue_size = 0; + +// Statistics +volatile long total_requests_generated = 0; +volatile long total_requests_processed = 0; +volatile long cpu_math_requests = 0; +volatile long cpu_string_requests = 0; +volatile long cpu_sort_requests = 0; +volatile long file_io_requests = 0; +volatile long network_requests = 0; +volatile long poll_wait_requests = 0; + +// Signal handler +void signal_handler(int sig) { + printf("\n🛑 Received signal %d, shutting down gracefully...\n", sig); + running = 0; +} + +// Get current timestamp +void get_timestamp(struct timespec *ts) { + clock_gettime(CLOCK_MONOTONIC, ts); +} + +// Calculate time difference in microseconds +long time_diff_us(struct timespec *start, struct timespec *end) { + return (end->tv_sec - start->tv_sec) * 1000000 + + (end->tv_nsec - start->tv_nsec) / 1000; +} + +// Enqueue request +int enqueue_request(request_t *req) { + pthread_mutex_lock(&queue_mutex); + + if (queue_size >= REQUEST_QUEUE_SIZE) { + pthread_mutex_unlock(&queue_mutex); + return -1; // Queue full + } + + request_queue[queue_tail] = *req; + queue_tail = (queue_tail + 1) % REQUEST_QUEUE_SIZE; + queue_size++; + + pthread_cond_signal(&queue_cond); + pthread_mutex_unlock(&queue_mutex); + return 0; +} + +// Dequeue request +int dequeue_request(request_t *req) { + pthread_mutex_lock(&queue_mutex); + + while (queue_size == 0 && running) { + pthread_cond_wait(&queue_cond, &queue_mutex); + } + + if (!running && queue_size == 0) { + pthread_mutex_unlock(&queue_mutex); + return -1; // Shutting down + } + + *req = request_queue[queue_head]; + queue_head = (queue_head + 1) % REQUEST_QUEUE_SIZE; + queue_size--; + + pthread_mutex_unlock(&queue_mutex); + return 0; +} + +// CPU-intensive: Mathematical computation +void simulate_cpu_math_work(int request_id, int intensity) { + volatile double result = 0.0; + int iterations = intensity * 50000; + + // Complex mathematical operations + for (int i = 1; i < iterations; i++) { + result += sin(i) * cos(i) + sqrt(i); + result += log(i + 1) * exp(i % 100 / 100.0); + + // Matrix-like operations + double matrix[4][4]; + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 4; k++) { + matrix[j][k] = sin(i + j + k) * result; + } + } + + if (i % 10000 == 0) { + __asm__("" : "+g" (result) : :); // Prevent optimization + } + } + + // Prime number computation + for (int n = 2; n < intensity * 1000; n++) { + int is_prime = 1; + for (int d = 2; d * d <= n; d++) { + if (n % d == 0) { + is_prime = 0; + break; + } + } + if (is_prime) result += n; + } +} + +// CPU-intensive: String processing +void
simulate_cpu_string_work(int request_id, int intensity) { + char *buffer = malloc(BUFFER_SIZE * intensity); + char *temp = malloc(BUFFER_SIZE); + if (!buffer || !temp) { free(buffer); free(temp); return; } + buffer[0] = '\0'; // buffer must hold a valid empty string before the first strcat + + // String generation and manipulation + for (int i = 0; i < intensity * 1000; i++) { + snprintf(temp, BUFFER_SIZE, + "Request_%d_Processing_String_Operation_%d_With_Data_%d", + request_id, i, rand()); + + // String operations + strcat(buffer, temp); + char *found = strstr(buffer, "Processing"); + if (found) { + memmove(found, found + 10, strlen(found + 10) + 1); + } + + // Base64-like encoding simulation + for (char *p = temp; *p; p++) { + *p = (*p + i) % 128; + } + + if (strlen(buffer) > BUFFER_SIZE * intensity - 1000) { + buffer[0] = '\0'; // Reset buffer + } + } + + // Pattern matching + const char *patterns[] = {"Request", "Processing", "Data", "Operation"}; + for (int i = 0; i < 4; i++) { + char *pos = buffer; + while ((pos = strstr(pos, patterns[i])) != NULL) { + pos++; + } + } + + free(buffer); + free(temp); +} + +// CPU-intensive: Sorting and data structures +void simulate_cpu_sort_work(int request_id, int intensity) { + int size = intensity * 500; + int *array = malloc(size * sizeof(int)); + + // Generate random data + for (int i = 0; i < size; i++) { + array[i] = rand() % (size * 10); + } + + // Bubble sort (intentionally inefficient for CPU load) + for (int i = 0; i < size - 1; i++) { + for (int j = 0; j < size - i - 1; j++) { + if (array[j] > array[j + 1]) { + int temp = array[j]; + array[j] = array[j + 1]; + array[j + 1] = temp; + } + } + + // Add some computation every 100 iterations + if (i % 100 == 0) { + volatile long hash = 0; + for (int k = 0; k < 1000; k++) { + hash = hash * 31 + array[k % size]; + } + } + } + + // Binary search operations + int target = array[size / 2]; + for (int search = 0; search < intensity * 100; search++) { + int left = 0, right = size - 1; + while (left <= right) { + int mid = (left + right) / 2; + if (array[mid] == target) break; + else if (array[mid] < target) left = mid + 1; + else right = mid - 1; + } + } + + free(array); +} + +// Blocking I/O: File operations with real syscalls +void simulate_file_io(int request_id) { + char filename[256]; + snprintf(filename, sizeof(filename), "/tmp/webserver_test_%d_%ld.tmp", + request_id, time(NULL)); + + // Write operation with multiple syscalls + int fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0644); + if (fd >= 0) { + char data[4096]; + for (int i = 0; i < 200; i++) { + snprintf(data, sizeof(data), + "Request %d chunk %d: Lorem ipsum dolor sit amet, consectetur adipiscing elit.\n", + request_id, i); + + ssize_t written = write(fd, data, strlen(data)); + if (written > 0) { + fsync(fd); // Force disk write - blocking syscall + } + + // Add some file metadata operations + struct stat st; + fstat(fd, &st); + } + close(fd); + } + + // Read operation with blocking + fd = open(filename, O_RDONLY); + if (fd >= 0) { + char buffer[1024]; + ssize_t bytes_read; + while ((bytes_read = read(fd, buffer, sizeof(buffer))) > 0) { + // Process data with some CPU work + for (int i = 0; i < bytes_read; i++) { + buffer[i] = buffer[i] ^ 0x55; // Simple XOR + } + } + close(fd); + } + + // File system operations + chmod(filename, 0755); + access(filename, R_OK | W_OK); + + // Cleanup + unlink(filename); + __sync_fetch_and_add(&file_io_requests, 1); +} + +// Blocking I/O: Network operations +void simulate_network_operation(int request_id, int op_type) { + if (op_type % 4 == 0) { + // TCP socket creation and connection attempt + int sockfd = socket(AF_INET, SOCK_STREAM, 0); +
if (sockfd >= 0) { + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons(80); + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); // Local connection + + // Non-blocking socket for controlled timing + fcntl(sockfd, F_SETFL, O_NONBLOCK); + connect(sockfd, (struct sockaddr*)&addr, sizeof(addr)); // Will likely fail + + // Use poll to wait + struct pollfd pfd = {sockfd, POLLOUT, 0}; + poll(&pfd, 1, 10 + (request_id % 50)); // 10-60ms timeout + + close(sockfd); + } + } else if (op_type % 4 == 1) { + // UDP socket operations + int sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd >= 0) { + char buffer[1024]; + snprintf(buffer, sizeof(buffer), "Request %d data", request_id); + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons(12345); + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + sendto(sockfd, buffer, strlen(buffer), 0, (struct sockaddr*)&addr, sizeof(addr)); + + // Try to receive (will block briefly) + struct pollfd pfd = {sockfd, POLLIN, 0}; + poll(&pfd, 1, 5 + (request_id % 20)); // 5-25ms timeout + + close(sockfd); + } + } else { + // Pipe operations for blocking + char buffer[256]; + snprintf(buffer, sizeof(buffer), "data_%d", request_id); + + // Write to pipe (might block if full) + write(blocking_pipe[1], buffer, strlen(buffer)); + + // Read from pipe (blocking operation) + char read_buf[256]; + struct pollfd pfd = {blocking_pipe[0], POLLIN, 0}; + if (poll(&pfd, 1, 1 + (request_id % 10)) > 0) { // 1-11ms timeout + read(blocking_pipe[0], read_buf, sizeof(read_buf)); + } + } + + __sync_fetch_and_add(&network_requests, 1); +} + +// Blocking I/O: Poll/select operations +void simulate_poll_wait(int request_id) { + // Create multiple file descriptors + int fds[3]; + fds[0] = eventfd(0, EFD_NONBLOCK); + fds[1] = blocking_pipe[0]; + fds[2] = open("/dev/null", O_RDONLY); + + struct pollfd pollfds[3]; + for (int i = 0; i < 3; i++) { + pollfds[i].fd = fds[i]; + pollfds[i].events = POLLIN; + pollfds[i].revents = 0; + } + + // Poll with timeout - real blocking syscall + int timeout = 20 + (request_id % 80); // 20-100ms + int result = poll(pollfds, 3, timeout); + + if (result > 0) { + // Process any ready file descriptors + for (int i = 0; i < 3; i++) { + if (pollfds[i].revents & POLLIN) { + char dummy[256]; + read(pollfds[i].fd, dummy, sizeof(dummy)); + } + } + } + + // Cleanup + if (fds[0] >= 0) close(fds[0]); + if (fds[2] >= 0) close(fds[2]); + + __sync_fetch_and_add(&poll_wait_requests, 1); +} + +// Worker thread function +void* worker_thread(void* arg) { + int thread_id = *(int*)arg; + char thread_name[32]; + snprintf(thread_name, sizeof(thread_name), "worker-%d", thread_id); + pthread_setname_np(pthread_self(), thread_name); + + printf("🔧 Worker thread %d started\n", thread_id); + + while (running) { + request_t req; + if (dequeue_request(&req) < 0) break; + + struct timespec start, end; + get_timestamp(&start); + + // Process based on request type + switch (req.request_type) { + case 0: // CPU-intensive math + simulate_cpu_math_work(req.request_id, 2 + rand() % 4); + __sync_fetch_and_add(&cpu_math_requests, 1); + break; + case 1: // CPU-intensive string processing + simulate_cpu_string_work(req.request_id, 1 + rand() % 3); + __sync_fetch_and_add(&cpu_string_requests, 1); + break; + case 2: // CPU-intensive sorting + simulate_cpu_sort_work(req.request_id, 1 + rand() % 2); + __sync_fetch_and_add(&cpu_sort_requests, 1); + break; + case 3: // File I/O + case 4: // File I/O + case 5: + simulate_file_io(req.request_id); + break; + case 6: 
// Network operations + case 7: + case 8: + simulate_network_operation(req.request_id, req.request_id); + break; + case 9: // Poll/wait operations + case 10: + case 11: + simulate_poll_wait(req.request_id); + break; + } + + get_timestamp(&end); + long processing_time = time_diff_us(&req.arrival_time, &end); + + __sync_fetch_and_add(&total_requests_processed, 1); + + if (req.request_id % 500 == 0) { + printf("📊 Worker %d processed request %d (type %d) in %ld μs\n", + thread_id, req.request_id, req.request_type, processing_time); + } + } + + printf("🔧 Worker thread %d finished\n", thread_id); + return NULL; +} + +// Request generator thread +void* request_generator(void* arg) { + pthread_setname_np(pthread_self(), "req-generator"); + printf("📈 Request generator started\n"); + + int request_id = 0; + while (running) { + request_t req; + req.request_id = request_id++; + req.request_type = rand() % 12; // 12 values mapped onto 6 request categories + get_timestamp(&req.arrival_time); + + if (enqueue_request(&req) < 0) { + // Queue full; backpressure nanosleep left disabled so the generator keeps spinning + struct timespec ts = {0, 1000000}; // 1ms + // nanosleep(&ts, NULL); + continue; + } + + __sync_fetch_and_add(&total_requests_generated, 1); + + // Variable request rate (nanosleep left disabled to generate maximum load) + struct timespec delay = {0, (500000 + rand() % 2000000)}; // 0.5-2.5ms + // nanosleep(&delay, NULL); + } + + printf("📈 Request generator finished\n"); + return NULL; +} + +// Statistics thread +void* stats_thread(void* arg) { + pthread_setname_np(pthread_self(), "stats-monitor"); + printf("📊 Statistics monitor started\n"); + + while (running) { + sleep(5); // Real sleep syscall + printf("\n=== Statistics ===\n"); + printf("Generated: %ld, Processed: %ld, Queue: %d\n", + total_requests_generated, total_requests_processed, queue_size); + printf("Math: %ld, String: %ld, Sort: %ld\n", + cpu_math_requests, cpu_string_requests, cpu_sort_requests); + printf("File I/O: %ld, Network: %ld, Poll: %ld\n", + file_io_requests, network_requests, poll_wait_requests); + printf("=================\n\n"); + } + + printf("📊 Statistics monitor finished\n"); + return NULL; +} + +int main() { + printf("🚀 Multi-threaded Web Server Simulation Starting...\n"); + printf("Configuration:\n"); + printf(" - PID: %d\n", getpid()); + printf(" - Worker threads: %d\n", NUM_WORKER_THREADS); + printf(" - Simulation duration: %d seconds\n", SIMULATION_DURATION); + printf(" - Request types: Math CPU, String CPU, Sort CPU, File I/O, Network, Poll\n\n"); + + // Setup signal handling + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + + srand(time(NULL)); + + // Initialize blocking mechanisms + if (pipe(blocking_pipe) < 0) { + perror("pipe"); + exit(1); + } + + // Make pipe non-blocking for controlled behavior + fcntl(blocking_pipe[0], F_SETFL, O_NONBLOCK); + fcntl(blocking_pipe[1], F_SETFL, O_NONBLOCK); + + // Create threads + pthread_t threads[NUM_WORKER_THREADS + 2]; // +2 for generator and stats + int thread_ids[NUM_WORKER_THREADS]; + + // Start request generator + pthread_create(&threads[0], NULL, request_generator, NULL); + + // Start statistics monitor + pthread_create(&threads[1], NULL, stats_thread, NULL); + + // Start worker threads + for (int i = 0; i < NUM_WORKER_THREADS; i++) { + thread_ids[i] = i; + pthread_create(&threads[i + 2], NULL, worker_thread, &thread_ids[i]); + } + + printf("✅ All threads started. 
Running for %d seconds...\n", SIMULATION_DURATION); + printf(" Press Ctrl+C to stop early\n\n"); + + // Run for specified duration + sleep(SIMULATION_DURATION); + + // Shutdown + printf("\n🛑 Simulation time completed, initiating shutdown...\n"); + running = 0; + + // Wake up any waiting threads + pthread_cond_broadcast(&queue_cond); + + // Wait for all threads to finish + for (int i = 0; i < NUM_WORKER_THREADS + 2; i++) { + pthread_join(threads[i], NULL); + } + + // Cleanup + close(blocking_pipe[0]); + close(blocking_pipe[1]); + + // Final statistics + printf("\n"); + printf("==================================================\n"); + printf("FINAL STATISTICS\n"); + printf("==================================================\n"); + printf("Total requests generated: %ld\n", total_requests_generated); + printf("Total requests processed: %ld\n", total_requests_processed); + printf("Request breakdown:\n"); + printf(" - Math CPU ops: %ld\n", cpu_math_requests); + printf(" - String CPU ops: %ld\n", cpu_string_requests); + printf(" - Sort CPU ops: %ld\n", cpu_sort_requests); + printf(" - File I/O ops: %ld\n", file_io_requests); + printf(" - Network ops: %ld\n", network_requests); + printf(" - Poll/wait ops: %ld\n", poll_wait_requests); + printf("==================================================\n"); + + printf("✅ Web server simulation completed successfully!\n"); + return 0; +} \ No newline at end of file diff --git a/src/32-wallclock-profiler/wallclock_profiler.py b/src/32-wallclock-profiler/wallclock_profiler.py new file mode 100755 index 00000000..a7b64c51 --- /dev/null +++ b/src/32-wallclock-profiler/wallclock_profiler.py @@ -0,0 +1,919 @@ +#!/usr/bin/env python3 +""" +Combined On-CPU and Off-CPU Profiler + +This script runs both 'oncputime' and 'offcputime' tools simultaneously to capture +both on-CPU and off-CPU activity for a given process, then combines the results +into a unified flamegraph. 
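+ +In the merged folded output, stacks are tagged with an "oncpu:" or "offcpu:" prefix +(see the notes printed by the test scripts), so both activity types appear side by +side in one flamegraph with frame width proportional to wall-clock time.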
+
+Usage:
+    python3 wallclock_profiler.py PID [OPTIONS]
+"""
+
+import argparse
+import subprocess
+import sys
+import os
+import threading
+import time
+from pathlib import Path
+
+class CombinedProfiler:
+    def __init__(self, pid, duration=30, freq=49, min_block_us=1000):
+        self.pid = pid
+        self.duration = duration
+        self.freq = freq
+        self.min_block_us = min_block_us
+        self.profile_output = []
+        self.offcpu_output = []
+        self.profile_error = None
+        self.offcpu_error = None
+
+        # Find tool paths
+        self.script_dir = Path(__file__).parent
+        self.oncpu_tool = self.script_dir / "oncputime"
+        self.offcpu_tool = self.script_dir / "offcputime"
+
+        # Check if tools exist
+        if not self.oncpu_tool.exists():
+            raise FileNotFoundError(f"Oncputime tool not found at {self.oncpu_tool}")
+        if not self.offcpu_tool.exists():
+            raise FileNotFoundError(f"Offcputime tool not found at {self.offcpu_tool}")
+
+    def discover_threads(self):
+        """Discover threads and determine if the process is multi-threaded"""
+        try:
+            result = subprocess.run(
+                ["ps", "-T", "-p", str(self.pid)],
+                capture_output=True, text=True
+            )
+
+            if result.returncode != 0:
+                return False, []
+
+            lines = result.stdout.strip().split('\n')[1:]  # Skip header
+            threads = []
+            for line in lines:
+                parts = line.split()
+                if len(parts) >= 4:
+                    _pid, tid, _tty, _time_str, *cmd_parts = parts
+                    tid = int(tid)
+                    cmd = ' '.join(cmd_parts)
+                    threads.append((tid, cmd))
+
+            return len(threads) > 1, threads
+
+        except Exception:
+            return False, []
+
+    def run_oncpu_tool(self):
+        """Run the oncputime tool in a separate thread"""
+        try:
+            cmd = [
+                str(self.oncpu_tool),
+                "-p", str(self.pid),
+                "-F", str(self.freq),
+                "-f",  # Folded output format
+                str(self.duration)
+            ]
+
+            print(f"Running: {' '.join(cmd)}")
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=self.duration + 10)
+
+            if result.returncode != 0:
+                self.profile_error = f"Oncputime tool failed: {result.stderr}"
+                return
+
+            self.profile_output = result.stdout.strip().split('\n') if result.stdout.strip() else []
+
+        except subprocess.TimeoutExpired:
+            self.profile_error = "Oncputime tool timed out"
+        except Exception as e:
+            self.profile_error = f"Oncputime tool error: {str(e)}"
+
+    def run_offcpu_tool(self):
+        """Run the offcputime tool in a separate thread"""
+        try:
+            cmd = [
+                str(self.offcpu_tool),
+                "-p", str(self.pid),
+                "-m", str(self.min_block_us),
+                "-f",  # Folded output format
+                str(self.duration)
+            ]
+
+            print(f"Running: {' '.join(cmd)}")
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=self.duration + 10)
+
+            if result.returncode != 0:
+                self.offcpu_error = f"Offcputime tool failed: {result.stderr}"
+                return
+
+            self.offcpu_output = result.stdout.strip().split('\n') if result.stdout.strip() else []
+
+        except subprocess.TimeoutExpired:
+            self.offcpu_error = "Offcputime tool timed out"
+        except Exception as e:
+            self.offcpu_error = f"Offcputime tool error: {str(e)}"
+
+    def run_profiling(self):
+        """Run both profiling tools simultaneously"""
+        # Check if multi-threaded first
+        is_multithread, threads = self.discover_threads()
+
+        if is_multithread:
+            print(f"Multi-threaded application detected ({len(threads)} threads)")
+            print("Profiling each thread separately...")
+            self.profile_individual_threads(threads)
+        else:
+            print(f"Starting combined profiling for PID {self.pid} for {self.duration} seconds...")
+
+            # Create threads for both tools
+            oncpu_thread = 
threading.Thread(target=self.run_oncpu_tool) + offcpu_thread = threading.Thread(target=self.run_offcpu_tool) + + # Start both threads + oncpu_thread.start() + offcpu_thread.start() + + # Wait for both to complete + oncpu_thread.join() + offcpu_thread.join() + + # Check for errors + if self.profile_error: + print(f"Oncpu tool error: {self.profile_error}", file=sys.stderr) + if self.offcpu_error: + print(f"Offcpu tool error: {self.offcpu_error}", file=sys.stderr) + + if self.profile_error and self.offcpu_error: + raise RuntimeError("Both profiling tools failed") + + def profile_individual_threads(self, threads): + """Profile each thread individually but simultaneously""" + self.thread_results = {} + + print(f"Starting simultaneous profiling of all {len(threads)} threads for {self.duration} seconds...") + + # Create profiling threads for parallel execution + profiling_threads = [] + thread_data = {} + + for tid, cmd in threads: + # Initialize result storage + thread_data[tid] = { + 'cmd': cmd, + 'oncpu_data': [], + 'offcpu_data': [], + 'oncpu_error': None, + 'offcpu_error': None + } + + # Create on-CPU profiling thread + oncpu_thread = threading.Thread( + target=self._profile_thread_oncpu_worker, + args=(tid, thread_data[tid]) + ) + profiling_threads.append(oncpu_thread) + + # Create off-CPU profiling thread + offcpu_thread = threading.Thread( + target=self._profile_thread_offcpu_worker, + args=(tid, thread_data[tid]) + ) + profiling_threads.append(offcpu_thread) + + # Start all profiling threads simultaneously + start_time = time.time() + for thread in profiling_threads: + thread.start() + + # Wait for all to complete + for thread in profiling_threads: + thread.join() + + end_time = time.time() + actual_duration = end_time - start_time + print(f"Profiling completed in {actual_duration:.1f} seconds") + + # Store results + self.thread_results = thread_data + + # Report any errors + for tid, data in thread_data.items(): + if data['oncpu_error']: + print(f"On-CPU profiling error for thread {tid}: {data['oncpu_error']}") + if data['offcpu_error']: + print(f"Off-CPU profiling error for thread {tid}: {data['offcpu_error']}") + + def _profile_thread_oncpu_worker(self, tid, thread_data): + """Worker function for on-CPU profiling of a specific thread""" + try: + cmd = [ + str(self.oncpu_tool), + "-L", str(tid), # Specific thread + "-F", str(self.freq), + "-f", # Folded output + str(self.duration) + ] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=self.duration + 10) + + if result.returncode == 0 and result.stdout.strip(): + thread_data['oncpu_data'] = result.stdout.strip().split('\n') + else: + thread_data['oncpu_data'] = [] + if result.stderr: + thread_data['oncpu_error'] = result.stderr + + except Exception as e: + thread_data['oncpu_error'] = str(e) + thread_data['oncpu_data'] = [] + + def _profile_thread_offcpu_worker(self, tid, thread_data): + """Worker function for off-CPU profiling of a specific thread""" + try: + cmd = [ + str(self.offcpu_tool), + "-t", str(tid), # Specific thread + "-m", str(self.min_block_us), + "-f", # Folded output + str(self.duration) + ] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=self.duration + 10) + + if result.returncode == 0 and result.stdout.strip(): + thread_data['offcpu_data'] = result.stdout.strip().split('\n') + else: + thread_data['offcpu_data'] = [] + if result.stderr: + thread_data['offcpu_error'] = result.stderr + + except Exception as e: + thread_data['offcpu_error'] = str(e) + 
thread_data['offcpu_data'] = [] + + def parse_folded_line(self, line): + """Parse a folded format line into stack trace and value""" + if not line.strip(): + return None, None + + parts = line.rsplit(' ', 1) + if len(parts) != 2: + return None, None + + stack_trace = parts[0] + try: + value = int(parts[1]) + return stack_trace, value + except ValueError: + return None, None + + def normalize_and_combine_stacks(self): + """Combine and normalize stack traces from both tools""" + oncpu_stacks = {} + offcpu_stacks = {} + + # Process on-CPU data (oncputime tool) + print(f"Processing {len(self.profile_output)} on-CPU stack traces...") + oncpu_total_samples = 0 + for line in self.profile_output: + stack, value = self.parse_folded_line(line) + if stack and value: + oncpu_total_samples += value + # remove the first part of the stack trace and add annotation + stack_parts = stack.split(";")[1:] + # Add _[c] annotation for CPU-intensive (on-CPU) stacks + annotated_stack = ";".join(stack_parts) + "_[c]" + oncpu_stacks[annotated_stack] = oncpu_stacks.get(annotated_stack, 0) + value + + # Process off-CPU data (offcputime tool) + print(f"Processing {len(self.offcpu_output)} off-CPU stack traces...") + offcpu_total_us = 0 + for line in self.offcpu_output: + stack, value = self.parse_folded_line(line) + if stack and value: + offcpu_total_us += value + # remove the first part of the stack trace and add annotation + stack_parts = stack.split(";")[1:] + # Add _[o] annotation for off-CPU (I/O/blocking) stacks + annotated_stack = ";".join(stack_parts) + "_[o]" + offcpu_stacks[annotated_stack] = offcpu_stacks.get(annotated_stack, 0) + value + + # Store counts for summary + self.oncpu_count = len(oncpu_stacks) + self.offcpu_count = len(offcpu_stacks) + + # Combine stacks with annotations + combined_stacks = {} + + # Add on-CPU stacks directly + for stack, value in oncpu_stacks.items(): + combined_stacks[stack] = combined_stacks.get(stack, 0) + value + + # Normalize and add off-CPU stacks + if offcpu_total_us > 0 and oncpu_total_samples > 0: + # Calculate normalization factor + # Assume each on-CPU sample represents 1/freq seconds of CPU time + avg_oncpu_sample_us = (1.0 / self.freq) * 1_000_000 # microseconds per sample + normalization_factor = avg_oncpu_sample_us # Use microseconds directly + + # Calculate expected vs actual samples + expected_samples = self.duration * self.freq + sample_rate = (oncpu_total_samples / expected_samples) * 100 if expected_samples > 0 else 0 + + print(f"On-CPU: {oncpu_total_samples} samples (expected: {expected_samples}, {sample_rate:.1f}% sampled)") + print(f"Off-CPU: {offcpu_total_us:,} μs ({offcpu_total_us/1_000_000:.2f} seconds)") + print(f"Normalization factor: {normalization_factor:.0f} μs/sample") + + # Add normalized off-CPU stacks + for stack, value in offcpu_stacks.items(): + # Convert microseconds to equivalent samples + normalized_value = int(value / normalization_factor) + if normalized_value > 0: # Only include if it results in at least 1 equivalent sample + combined_stacks[stack] = combined_stacks.get(stack, 0) + normalized_value + else: + # If no normalization needed, just add off-CPU stacks as-is + for stack, value in offcpu_stacks.items(): + combined_stacks[stack] = combined_stacks.get(stack, 0) + value + + return combined_stacks + + def setup_flamegraph_tools(self): + """Ensure FlameGraph tools are available and create custom color palette""" + flamegraph_dir = self.script_dir / "FlameGraph" + flamegraph_script = flamegraph_dir / "flamegraph.pl" + + if 
flamegraph_script.exists(): + # Create a custom flamegraph script with our color palette + custom_script = self.script_dir / "combined_flamegraph.pl" + self.create_custom_flamegraph_script(flamegraph_script, custom_script) + return custom_script + + print("FlameGraph tools not found, cloning repository...") + try: + result = subprocess.run([ + "git", "clone", + "https://github.com/brendangregg/FlameGraph.git", + str(flamegraph_dir), "--depth=1" + ], capture_output=True, text=True, cwd=self.script_dir) + + if result.returncode != 0: + print(f"Failed to clone FlameGraph: {result.stderr}") + return None + + if flamegraph_script.exists(): + # Make it executable (owner only for security) + os.chmod(flamegraph_script, 0o700) # rwx------ + print("FlameGraph tools cloned successfully") + # Create custom script + custom_script = self.script_dir / "combined_flamegraph.pl" + self.create_custom_flamegraph_script(flamegraph_script, custom_script) + return custom_script + else: + print("FlameGraph script not found after cloning") + return None + + except Exception as e: + print(f"Error setting up FlameGraph tools: {e}") + return None + + def create_custom_flamegraph_script(self, original_script, custom_script): + """Create a custom flamegraph script with our color palette""" + try: + with open(original_script, 'r') as f: + content = f.read() + + # Add our custom color palette for combined profiling + # Insert after the existing "chain" palette logic + custom_palette = ''' + if (defined $type and $type eq "combined") { + if ($name =~ m:_\\[c\\]$:) { # CPU annotation (on-CPU) + $type = "red"; + } elsif ($name =~ m:_\\[o\\]$:) { # off-CPU annotation (I/O/blocking) + $type = "blue"; + } else { # default + $type = "yellow"; + } + # fall-through to color palettes + }''' + + # Find the insertion point after the chain palette + insertion_point = content.find(' if (defined $type and $type eq "chain") {') + if insertion_point != -1: + # Find the end of the chain block + end_point = content.find('\t# color palettes', insertion_point) + if end_point != -1: + # Insert our custom palette before the color palettes section + content = content[:end_point] + custom_palette + '\n\n\t' + content[end_point:] + + with open(custom_script, 'w') as f: + f.write(content) + + # Make it executable (owner only for security) + os.chmod(custom_script, 0o700) # rwx------ + print("Custom flamegraph script created with combined color palette") + + except Exception as e: + print(f"Error creating custom flamegraph script: {e}") + # Fall back to original script + return original_script + + def generate_flamegraph_data(self, output_prefix=None): + """Generate combined flamegraph data and SVG""" + # Check if multi-threaded + is_multithread, threads = self.discover_threads() + + if is_multithread and hasattr(self, 'thread_results'): + return self.generate_multithread_flamegraphs(output_prefix) + else: + return self.generate_single_flamegraph(output_prefix) + + def generate_single_flamegraph(self, output_prefix): + """Generate single flamegraph for single-threaded or combined analysis""" + if output_prefix is None: + output_prefix = f"combined_profile_pid{self.pid}_{int(time.time())}" + + folded_file = f"{output_prefix}.folded" + svg_file = f"{output_prefix}.svg" + + combined_stacks = self.normalize_and_combine_stacks() + + if not combined_stacks: + print("No stack traces collected from either tool") + return None, None + + # Calculate time statistics for single thread case + single_thread_times = 
self.calculate_thread_times(self.profile_output, self.offcpu_output) + + # Sort by value for better visualization + sorted_stacks = sorted(combined_stacks.items(), key=lambda x: x[1], reverse=True) + + # Generate folded output + output_lines = [] + for stack, value in sorted_stacks: + output_lines.append(f"{stack} {value}") + + # Write folded data to file + try: + with open(folded_file, 'w') as f: + f.write('\n'.join(output_lines)) + print(f"Combined flamegraph data written to: {folded_file}") + except Exception as e: + print(f"Error writing folded data: {e}") + return None, None + + # Generate SVG flamegraph + svg_file = self.generate_svg_from_folded(folded_file, svg_file) + + # Generate time analysis file for single thread + self.generate_single_thread_analysis_file(output_prefix, single_thread_times) + + # Print summary + print(f"\nSummary:") + print(f"Total unique stack traces: {len(sorted_stacks)}") + oncpu_stacks = sum(1 for stack, _ in sorted_stacks if stack.endswith("_[c]")) + offcpu_stacks = sum(1 for stack, _ in sorted_stacks if stack.endswith("_[o]")) + print(f"On-CPU stack traces: {oncpu_stacks}") + print(f"Off-CPU stack traces: {offcpu_stacks}") + + # Print time verification + print(f"\nTime Analysis:") + print(f"On-CPU time: {single_thread_times['oncpu_time_sec']:.3f}s") + print(f"Off-CPU time: {single_thread_times['offcpu_time_sec']:.3f}s") + print(f"Total measured time: {single_thread_times['total_time_sec']:.3f}s") + print(f"Wall clock coverage: {single_thread_times['wall_clock_coverage_pct']:.1f}% of {self.duration}s profiling duration") + + return folded_file, svg_file + + def generate_multithread_flamegraphs(self, output_prefix): + """Generate separate flamegraphs for each thread""" + base_name = f"combined_profile_pid{self.pid}_{int(time.time())}" + output_dir = f"multithread_{base_name}" + os.makedirs(output_dir, exist_ok=True) + + print(f"Results will be saved to: {output_dir}/") + + generated_files = [] + total_threads_with_data = 0 + + for tid, thread_data in self.thread_results.items(): + cmd = thread_data['cmd'] + oncpu_data = thread_data['oncpu_data'] + offcpu_data = thread_data['offcpu_data'] + + # Skip threads with no data + if not oncpu_data and not offcpu_data: + continue + + total_threads_with_data += 1 + + # Determine thread role + role = self.get_thread_role(tid, cmd) + + # Generate combined folded file for this thread + folded_file = f"{output_dir}/thread_{tid}_{role}.folded" + + combined_stacks = self.combine_thread_stacks(oncpu_data, offcpu_data) + + if combined_stacks: + # Write folded data + with open(folded_file, 'w') as f: + for stack, value in sorted(combined_stacks.items(), key=lambda x: x[1], reverse=True): + f.write(f"{stack} {value}\n") + + # Generate SVG + svg_file = f"{output_dir}/thread_{tid}_{role}.svg" + svg_file = self.generate_svg_from_folded(folded_file, svg_file, f"Thread {tid} ({role})") + + # Generate individual thread analysis report + analysis_file = f"{output_dir}/thread_{tid}_{role}_analysis.txt" + self.generate_individual_thread_analysis(analysis_file, tid, thread_data, combined_stacks, role) + + generated_files.append((folded_file, svg_file)) + print(f"Generated: {folded_file} and {svg_file}") + + # Generate thread analysis + self.generate_thread_analysis_file(output_dir, base_name) + + print(f"\nGenerated {len(generated_files)} thread profiles with data out of {len(self.thread_results)} total threads") + + return generated_files[0] if generated_files else (None, None) + + def get_thread_role(self, tid, cmd): + """Get thread 
role based on TID and command""" + if tid == self.pid: + return "main" + elif "cuda" in cmd.lower() and "evthandlr" in cmd.lower(): + return "cuda-event" + elif "cuda" in cmd.lower(): + return "cuda-compute" + elif "eal-intr" in cmd.lower(): + return "dpdk-interrupt" + elif "rte_mp" in cmd.lower(): + return "dpdk-multiprocess" + elif "telemetry" in cmd.lower(): + return "telemetry" + else: + return cmd.lower().replace(' ', '_').replace('-', '_') + f"_{tid}" + + def combine_thread_stacks(self, oncpu_data, offcpu_data): + """Combine on-CPU and off-CPU data for a single thread""" + combined_stacks = {} + + # Process on-CPU data + for line in oncpu_data: + parts = line.rsplit(' ', 1) + if len(parts) == 2: + stack, count_str = parts + try: + count = int(count_str) + # Remove process name prefix and add CPU annotation + clean_stack = ';'.join(stack.split(';')[1:]) + '_[c]' + combined_stacks[clean_stack] = combined_stacks.get(clean_stack, 0) + count + except ValueError: + continue + + # Process off-CPU data with normalization + if offcpu_data: + norm_factor = (1.0 / self.freq) * 1_000_000 # microseconds per sample + for line in offcpu_data: + parts = line.rsplit(' ', 1) + if len(parts) == 2: + stack, time_str = parts + try: + time_us = int(time_str) + normalized_samples = max(1, int(time_us / norm_factor)) + # Remove process name prefix and add off-CPU annotation + clean_stack = ';'.join(stack.split(';')[1:]) + '_[o]' + combined_stacks[clean_stack] = combined_stacks.get(clean_stack, 0) + normalized_samples + except ValueError: + continue + + return combined_stacks + + def generate_svg_from_folded(self, folded_file, svg_file, title=None): + """Generate SVG flamegraph from folded file""" + flamegraph_script = self.setup_flamegraph_tools() + if flamegraph_script: + try: + cmd_args = [ + "perl", str(flamegraph_script), + "--colors", "combined", + folded_file + ] + + if title: + cmd_args.extend(["--title", title]) + else: + cmd_args.extend(["--title", "Combined On-CPU and Off-CPU Profile"]) + + result = subprocess.run(cmd_args, capture_output=True, text=True) + + if result.returncode == 0: + with open(svg_file, 'w') as f: + f.write(result.stdout) + return svg_file + else: + print(f"Error generating flamegraph {svg_file}: {result.stderr}") + return None + except Exception as e: + print(f"Error running flamegraph.pl: {e}") + return None + else: + print("FlameGraph tools not available, skipping SVG generation") + return None + + def generate_individual_thread_analysis(self, analysis_file, tid, thread_data, combined_stacks, role): + """Generate individual thread-level analysis report""" + time_stats = self.calculate_thread_times(thread_data['oncpu_data'], thread_data['offcpu_data']) + + # Count stack types + oncpu_stacks = sum(1 for stack in combined_stacks.keys() if stack.endswith('_[c]')) + offcpu_stacks = sum(1 for stack in combined_stacks.keys() if stack.endswith('_[o]')) + + with open(analysis_file, 'w') as f: + f.write("Thread-Level Analysis Report\n") + f.write("=" * 50 + "\n\n") + + f.write("Profiling Parameters:\n") + f.write(f"Duration: {self.duration} seconds\n") + f.write(f"Sampling frequency: {self.freq} Hz\n") + f.write(f"\n") + + f.write("Thread Information:\n") + f.write("-" * 40 + "\n") + f.write(f"Thread ID: {tid}\n") + f.write(f"Role: {role}\n") + f.write(f"Command: {thread_data['cmd']}\n") + f.write(f"\n") + + f.write("Time Analysis:\n") + f.write("-" * 40 + "\n") + oncpu_us = int(time_stats['oncpu_time_sec'] * 1_000_000) + f.write(f"On-CPU time: {time_stats['oncpu_time_sec']:.3f}s 
({oncpu_us:,} μs)\n") + f.write(f"Off-CPU time: {time_stats['offcpu_time_sec']:.3f}s ({time_stats['offcpu_us']:,} μs)\n") + f.write(f"Total measured time: {time_stats['total_time_sec']:.3f}s\n") + f.write(f"Wall clock coverage: {time_stats['wall_clock_coverage_pct']:.3f}% of {self.duration}s actual process runtime\n") + f.write(f"\n") + + f.write("Stack Trace Summary:\n") + f.write("-" * 40 + "\n") + f.write(f"On-CPU stack traces: {oncpu_stacks}\n") + f.write(f"Off-CPU stack traces: {offcpu_stacks}\n") + f.write(f"Total unique stacks: {len(combined_stacks)}\n") + f.write(f"\n") + + f.write("Coverage Assessment:\n") + f.write("-" * 40 + "\n") + if time_stats['wall_clock_coverage_pct'] < 50: + f.write("⚠️ Low coverage - thread may be mostly idle or data collection incomplete\n") + elif time_stats['wall_clock_coverage_pct'] > 150: + f.write("⚠️ High coverage - possible overlap or measurement anomaly\n") + else: + f.write("✓ Coverage appears reasonable for active process\n") + f.write(f"\n") + + print(f"Generated thread analysis: {analysis_file}") + + def generate_thread_analysis_file(self, output_dir, base_name): + """Generate thread analysis summary file""" + summary_file = f"{output_dir}/{base_name}_thread_analysis.txt" + + # Calculate time statistics for all threads + total_process_oncpu_time = 0 + total_process_offcpu_time = 0 + thread_time_data = {} + + for tid, data in self.thread_results.items(): + time_stats = self.calculate_thread_times(data['oncpu_data'], data['offcpu_data']) + thread_time_data[tid] = time_stats + total_process_oncpu_time += time_stats['oncpu_time_sec'] + total_process_offcpu_time += time_stats['offcpu_time_sec'] + + total_process_time = total_process_oncpu_time + total_process_offcpu_time + + with open(summary_file, 'w') as f: + f.write("Multi-Thread Analysis Report\n") + f.write("="*50 + "\n\n") + f.write(f"Process ID: {self.pid}\n") + f.write(f"Total threads: {len(self.thread_results)}\n") + f.write(f"Profiling duration: {self.duration} seconds\n") + f.write(f"Sampling frequency: {self.freq} Hz\n\n") + + # Wall clock time analysis + f.write("Time Analysis Summary:\n") + f.write("-" * 40 + "\n") + f.write(f"Expected wall clock time: {self.duration:.1f} seconds\n") + f.write(f"Total measured on-CPU time: {total_process_oncpu_time:.3f} seconds\n") + f.write(f"Total measured off-CPU time: {total_process_offcpu_time:.3f} seconds\n") + f.write(f"Total measured time: {total_process_time:.3f} seconds\n") + + if self.duration > 0: + coverage_pct = (total_process_time / self.duration) * 100 + f.write(f"Wall clock coverage: {coverage_pct:.1f}% of expected duration\n") + + if coverage_pct < 50: + f.write("⚠️ Low coverage - threads may be mostly idle or data collection incomplete\n") + elif coverage_pct > 150: + f.write("⚠️ High coverage - possible overlap or measurement anomaly\n") + else: + f.write("✓ Coverage appears reasonable for active threads\n") + f.write("\n") + + f.write("Thread Details:\n") + f.write("-" * 40 + "\n") + for tid, data in self.thread_results.items(): + role = self.get_thread_role(tid, data['cmd']) + oncpu_count = len(data['oncpu_data']) + offcpu_count = len(data['offcpu_data']) + time_stats = thread_time_data[tid] + + f.write(f"TID {tid:8} ({role:15}): {data['cmd']}\n") + f.write(f" Events: on-CPU: {oncpu_count}, off-CPU: {offcpu_count}\n") + f.write(f" Times: on-CPU: {time_stats['oncpu_time_sec']:.3f}s, off-CPU: {time_stats['offcpu_time_sec']:.3f}s\n") + f.write(f" Total: {time_stats['total_time_sec']:.3f}s 
({time_stats['wall_clock_coverage_pct']:.1f}% of wall clock)\n") + f.write(f" Samples: on-CPU: {time_stats['oncpu_samples']}, off-CPU: {time_stats['offcpu_us']:,} μs\n\n") + + f.write(f"Individual Analysis:\n") + f.write("-" * 40 + "\n") + f.write(f"Each thread has been profiled separately.\n") + f.write(f"Individual flamegraph files show per-thread behavior.\n") + f.write(f"Compare thread profiles to identify bottlenecks and parallelization opportunities.\n\n") + + f.write(f"Time Verification Notes:\n") + f.write("-" * 40 + "\n") + f.write(f"• On-CPU time = samples / sampling_frequency ({self.freq} Hz)\n") + f.write(f"• Off-CPU time = blocking_time_μs / 1,000,000\n") + f.write(f"• Total time per thread = on-CPU + off-CPU time\n") + f.write(f"• Wall clock coverage shows how much of the profiling period was active\n") + f.write(f"• Low coverage may indicate idle threads or missed events\n") + f.write(f"• High coverage (>100%) may indicate overlapping measurements or high activity\n") + + print(f"Thread analysis saved to: {summary_file}") + + def calculate_thread_times(self, oncpu_data, offcpu_data): + """Calculate actual wall clock times from profiling data""" + # Calculate on-CPU time from samples + oncpu_samples = 0 + for line in oncpu_data: + parts = line.rsplit(' ', 1) + if len(parts) == 2: + try: + count = int(parts[1]) + oncpu_samples += count + except ValueError: + continue + + # Calculate off-CPU time from microseconds + offcpu_us = 0 + for line in offcpu_data: + parts = line.rsplit(' ', 1) + if len(parts) == 2: + try: + time_us = int(parts[1]) + offcpu_us += time_us + except ValueError: + continue + + # Convert to wall clock times + oncpu_time_sec = oncpu_samples / self.freq if self.freq > 0 else 0 + offcpu_time_sec = offcpu_us / 1_000_000 + total_time_sec = oncpu_time_sec + offcpu_time_sec + + return { + 'oncpu_samples': oncpu_samples, + 'oncpu_time_sec': oncpu_time_sec, + 'offcpu_us': offcpu_us, + 'offcpu_time_sec': offcpu_time_sec, + 'total_time_sec': total_time_sec, + 'wall_clock_coverage_pct': (total_time_sec / self.duration * 100) if self.duration > 0 else 0 + } + + def generate_single_thread_analysis_file(self, output_prefix, single_thread_times): + """Generate single thread analysis file""" + analysis_file = f"{output_prefix}_single_thread_analysis.txt" + + with open(analysis_file, 'w') as f: + f.write("Single-Thread Analysis Report\n") + f.write("="*50 + "\n\n") + f.write(f"Process ID: {self.pid}\n") + f.write(f"Profiling duration: {self.duration} seconds\n") + f.write(f"Sampling frequency: {self.freq} Hz\n\n") + + # Time analysis + f.write("Time Analysis:\n") + f.write("-" * 40 + "\n") + f.write(f"On-CPU time: {single_thread_times['oncpu_time_sec']:.3f}s\n") + f.write(f"Off-CPU time: {single_thread_times['offcpu_time_sec']:.3f}s\n") + f.write(f"Total measured time: {single_thread_times['total_time_sec']:.3f}s\n") + f.write(f"Wall clock coverage: {single_thread_times['wall_clock_coverage_pct']:.1f}% of {self.duration}s profiling duration\n") + + if single_thread_times['wall_clock_coverage_pct'] < 50: + f.write("⚠️ Low coverage - thread may be mostly idle or data collection incomplete\n") + elif single_thread_times['wall_clock_coverage_pct'] > 150: + f.write("⚠️ High coverage - possible overlap or measurement anomaly\n") + else: + f.write("✓ Coverage appears reasonable for active thread\n") + + f.write(f"\nTime Verification Notes:\n") + f.write("-" * 40 + "\n") + f.write(f"• On-CPU time = samples / sampling_frequency ({self.freq} Hz)\n") + f.write(f"• Off-CPU time = 
blocking_time_μs / 1,000,000\n")
+            f.write(f"• Total time = on-CPU + off-CPU time\n")
+            f.write(f"• Wall clock coverage shows how much of the profiling period was active\n")
+            f.write(f"• Coverage values depend on thread activity and system load\n")
+
+        print(f"Single thread analysis saved to: {analysis_file}")
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Combined On-CPU and Off-CPU Profiler",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Profile PID 1234 for 30 seconds (default)
+    python3 wallclock_profiler.py 1234
+
+    # Profile for 60 seconds with custom sampling frequency
+    python3 wallclock_profiler.py 1234 -d 60 -f 99
+
+    # Use a custom output prefix and a 5 ms minimum block time
+    python3 wallclock_profiler.py 1234 -o myapp_profile -m 5000
+
+    # Build and run test program first:
+    gcc -o test_program test_program.c
+    ./test_program &
+    python3 wallclock_profiler.py $!
+        """
+    )
+
+    parser.add_argument("pid", type=int, help="Process ID to profile")
+    parser.add_argument("-d", "--duration", type=int, default=30,
+                        help="Duration to profile in seconds (default: 30)")
+    parser.add_argument("-f", "--frequency", type=int, default=49,
+                        help="On-CPU sampling frequency in Hz (default: 49)")
+    parser.add_argument("-m", "--min-block-us", type=int, default=1000,
+                        help="Minimum off-CPU block time in microseconds (default: 1000)")
+    parser.add_argument("-o", "--output", type=str,
+                        help="Output file prefix for generated files (default: combined_profile_pid<PID>_<timestamp>)")
+
+    args = parser.parse_args()
+
+    # Check if running as root
+    if os.geteuid() != 0:
+        print("Warning: This script typically requires root privileges to access BPF features",
+              file=sys.stderr)
+
+    # Check if PID exists
+    try:
+        os.kill(args.pid, 0)
+    except OSError:
+        print(f"Error: Process {args.pid} does not exist", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        profiler = CombinedProfiler(
+            pid=args.pid,
+            duration=args.duration,
+            freq=args.frequency,
+            min_block_us=args.min_block_us
+        )
+
+        profiler.run_profiling()
+        folded_file, svg_file = profiler.generate_flamegraph_data(args.output)
+
+        print("\n" + "="*60)
+        print("PROFILING COMPLETE")
+        print("="*60)
+        if folded_file:
+            print(f"📊 Folded data: {folded_file}")
+        if svg_file:
+            print(f"🔥 Flamegraph: {svg_file}")
+            print(f"   Open {svg_file} in a web browser to view the interactive flamegraph")
+        else:
+            print("⚠️  SVG flamegraph generation failed")
+            if folded_file:
+                print("   You can manually generate it with:")
+                print(f"   ./FlameGraph/flamegraph.pl {folded_file} > flamegraph.svg")
+
+        print("\n📝 Interpretation guide:")
+        print("   • Red frames show CPU-intensive code paths (on-CPU) marked with _[c]")
+        print("   • Blue frames show blocking/waiting operations (off-CPU) marked with _[o]")
+        print("   • Wider sections represent more time spent in those functions")
+        print("   • Values are normalized to make on-CPU and off-CPU time comparable")
+
+    except KeyboardInterrupt:
+        print("\nProfiling interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/third_party/blazesym b/src/third_party/blazesym
index c57e6d62..e4f0557f 160000
--- a/src/third_party/blazesym
+++ b/src/third_party/blazesym
@@ -1 +1 @@
-Subproject commit c57e6d623b88340d500e2ab0b2700ec9e9d4f398
+Subproject commit e4f0557f6a0d02dc03526bca6977a6100e9cce94
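
For reference, the core merge that `wallclock_profiler.py` performs in `normalize_and_combine_stacks` can be reduced to a few lines. The sketch below is illustrative and not part of the patch: it assumes folded stacks of the form `comm;frame1;frame2 value`, where on-CPU values are sample counts and off-CPU values are microseconds, and the helper names (`parse_folded`, `merge_folded`) are invented for the example.

```python
# Illustrative sketch of the on/off-CPU merge. Folded lines look like
# "comm;frame1;frame2 value"; on-CPU values are sample counts, off-CPU
# values are microseconds of blocking time.

def parse_folded(lines):
    """Yield (stack, value) pairs from folded-format lines."""
    for line in lines:
        stack, _, value = line.rpartition(' ')
        if stack and value.isdigit():
            yield stack, int(value)

def merge_folded(oncpu_lines, offcpu_lines, freq_hz=49):
    """Merge on-CPU samples and off-CPU microseconds into one profile."""
    us_per_sample = 1_000_000 / freq_hz  # one sample ~ 20408 us at 49 Hz
    combined = {}
    for stack, samples in parse_folded(oncpu_lines):
        # Drop the leading comm and tag the stack as on-CPU.
        key = ';'.join(stack.split(';')[1:]) + '_[c]'
        combined[key] = combined.get(key, 0) + samples
    for stack, us in parse_folded(offcpu_lines):
        # Convert blocked microseconds into sample equivalents; stacks
        # that round down to zero are dropped, as in the real script.
        samples = int(us / us_per_sample)
        if samples > 0:
            key = ';'.join(stack.split(';')[1:]) + '_[o]'
            combined[key] = combined.get(key, 0) + samples
    return combined

if __name__ == "__main__":
    oncpu = ["app;main;compute 98"]            # 98 samples = 2.0 s at 49 Hz
    offcpu = ["app;main;read_config 1500000"]  # 1.5 s blocked -> ~73 samples
    for stack, value in sorted(merge_folded(oncpu, offcpu).items()):
        print(stack, value)
```

The same arithmetic drives the time verification in the analysis reports: on-CPU seconds are samples divided by the sampling frequency, off-CPU seconds are microseconds divided by 1,000,000, and their sum over the profiling window gives the wall-clock coverage percentage.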
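
Because every leaf frame in the generated `.folded` files carries a `_[c]` or `_[o]` suffix, downstream analysis needs no extra metadata. Below is a small, self-contained sketch (the default file name is only an example) that reports the on-CPU/off-CPU split of one output file; since off-CPU values were already normalized to sample equivalents, the two totals are directly comparable.

```python
# Hedged post-processing sketch: summarize how measured time in a .folded
# file produced by wallclock_profiler.py splits between on-CPU (_[c]) and
# off-CPU (_[o]) stacks. The default path below is illustrative.
import sys

def summarize(path):
    oncpu = offcpu = 0
    with open(path) as f:
        for line in f:
            stack, _, value = line.strip().rpartition(' ')
            if not stack or not value.isdigit():
                continue
            if stack.endswith('_[c]'):
                oncpu += int(value)
            elif stack.endswith('_[o]'):
                offcpu += int(value)
    total = oncpu + offcpu
    if total == 0:
        print("no annotated stacks found")
        return
    print(f"on-CPU:  {oncpu:8d} sample equivalents ({100 * oncpu / total:.1f}%)")
    print(f"off-CPU: {offcpu:8d} sample equivalents ({100 * offcpu / total:.1f}%)")

if __name__ == "__main__":
    summarize(sys.argv[1] if len(sys.argv) > 1 else "combined_profile.folded")
```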