From fd2660ce05b43e75d500b48306dcf4423c62307e Mon Sep 17 00:00:00 2001
From: Keno Fischer
Date: Fri, 8 Jul 2022 07:14:55 +0000
Subject: [PATCH 1/4] Redesign syscallbuf to always unwind on interruption

This is a major redesign of the syscallbuf code with the goal of
establishing the invariant that we never switch away from a tracee
while it's in the syscallbuf code. Instead, we unwind the syscallbuf
code completely and execute the syscall at a special syscall
instruction now placed in the extended jump patch.

The primary motivation for this is that it fixes #3285, but I think
the change is overall very beneficial. We have significant complexity
in the recorder to deal with the possibility of interrupting the
tracee while it is in the syscallbuf code. This commit does not yet
remove that complexity (the change is already very big), but that
should be easy to do as a follow-up.

Additionally, we used to be unable to perform syscall buffering for
syscalls performed inside a signal handler that interrupted a syscall.
This had performance implications for use cases like stack walkers,
which often perform multiple memory-probing system calls for every
frame to deal with the possibility of invalid unwind info.

There are many details here, but here's a high-level overview. The
layout of the new extended jump patch is:

```
call
// Bail path returns here
syscall
// Non-bail path returns here.
jmp return_addr
```

One detail worth mentioning is what happens if a signal gets delivered
once the tracee is out of the syscallbuf, but still in the extended
jump patch (i.e. after the stack restore). In this case, rr will
rewrite the ip of the signal frame to point to the equivalent ip in
the original, now patched code section. Of course, the instructions in
question are no longer there, but the CFI will nevertheless be
generally accurate for the current register state (excluding weird CFI
that explicitly references the ip, of course). This allows unwinders
in the end-user application to never have to unwind through any frame
in the rr syscallbuf, which seems like a desirable property. Of
course, `sigreturn` must perform the opposite transformation to avoid
actually returning into a patched-out location.

The main drawback of this scheme is that, while the application will
never see a location without CFI, GDB does still lack unwind
information in the extended jump stub. This is not a new problem, but
syscall events are now in the extended jump stub, so they come up
quite frequently. I don't think this is a huge problem - it's
basically the same situation we used to have before the vdso changes.
I believe the best way to fix this would be to establish some way of
having rr inform GDB of its jump patches (in fact, GDB already has
this kind of mechanism for tracepoints; it's just not exposed for
tracepoints initiated by the gdb server), but I don't intend to do
this myself anytime in the near future. That said, doing this would
not require any changes on the record side, so it could be done at any
time and would start working retroactively for already-recorded
traces.
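For readers following the ip-rewriting scheme described above, here is a
minimal sketch of the two translations as they appear in the
RecordSession.cc changes below. The `PatchedSyscall` struct and the two
free functions are illustrative stand-ins for the `patched_syscall`
bookkeeping this patch adds to Monkeypatcher.h, not rr's actual API:

```cpp
// Sketch only: field names mirror Monkeypatcher::patched_syscall below;
// the helper functions are hypothetical.
#include <cstdint>
#include <algorithm>

struct PatchedSyscall {
  uint64_t patch_addr;           // start of the original (patched-out) code
  uint64_t stub_addr;            // start of the extended jump patch
  uint64_t size;                 // total size of the extended jump patch
  uint64_t safe_suffix;          // bytes from the stack restore to the end
  uint64_t patch_region_length;  // bytes replaced at the patch site
};

// Signal delivery: ip lies in the stub's safe suffix; rewrite the sigframe
// ip to the equivalent spot in the original code. A hit inside the nop sled
// is clamped to the end of the patch region (syscall_insn_len is e.g. 2 for
// the x86-64 `syscall` instruction).
uint64_t stub_ip_to_patch_ip(const PatchedSyscall& ps, uint64_t ip,
                             uint64_t syscall_insn_len) {
  uint64_t offset = (ip - ps.stub_addr) - (ps.size - ps.safe_suffix);
  offset = std::min(offset, ps.patch_region_length + syscall_insn_len);
  return ps.patch_addr + offset;
}

// sigreturn: the inverse transformation, so the tracee never actually
// resumes at a patched-out location.
uint64_t patch_ip_to_stub_ip(const PatchedSyscall& ps, uint64_t ip) {
  return ps.stub_addr + (ip - ps.patch_addr) + (ps.size - ps.safe_suffix);
}
```

The clamping corresponds to the `translated_patch_offset >
total_patch_region_size` check in signal_state_changed; the inverse
mapping is what syscall_state_changed applies after `rec_did_sigreturn`.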
--- src/DiversionSession.cc | 7 +- src/Monkeypatcher.cc | 290 ++++++------ src/Monkeypatcher.h | 20 +- src/RecordSession.cc | 241 +++++++--- src/RecordTask.cc | 11 +- src/Registers.h | 5 + src/assembly_templates.py | 76 ++-- src/kernel_abi.h | 33 ++ src/preload/preload_interface.h | 3 +- src/preload/syscall_hook.S | 233 +++++++++- src/preload/syscallbuf.c | 684 ++++++++++++++++++---------- src/record_signal.cc | 216 ++++----- src/record_syscall.cc | 11 +- src/syscalls.py | 4 +- src/test/doublesegv.c | 4 +- src/test/execve_loop.c | 5 + src/test/expect_in_atomic_printf.py | 6 + src/test/expect_in_exit.py | 6 +- src/test/get_thread_list.py | 8 +- src/test/step_thread.py | 8 +- src/test/vdso_stack.py | 5 +- 21 files changed, 1218 insertions(+), 658 deletions(-) diff --git a/src/DiversionSession.cc b/src/DiversionSession.cc index 3933eaf0aa3..d52ab85fb96 100644 --- a/src/DiversionSession.cc +++ b/src/DiversionSession.cc @@ -72,9 +72,10 @@ static void process_syscall_arch(Task* t, int syscallno) { if (syscallno == t->session().syscall_number_for_rrcall_rdtsc()) { uint64_t rdtsc_value = static_cast(&t->session())->next_rdtsc_value(); LOG(debug) << "Faking rrcall_rdtsc syscall with value " << rdtsc_value; - remote_ptr out_param(t->regs().arg1()); - t->write_mem(out_param, rdtsc_value); - finish_emulated_syscall_with_ret(t, 0); + Registers r = t->regs(); + r.set_dx(rdtsc_value >> 32); + t->set_regs(r); + finish_emulated_syscall_with_ret(t, (uint32_t)rdtsc_value); return; } diff --git a/src/Monkeypatcher.cc b/src/Monkeypatcher.cc index 03880946b59..23115cc823a 100644 --- a/src/Monkeypatcher.cc +++ b/src/Monkeypatcher.cc @@ -159,51 +159,56 @@ template static void substitute_extended_jump(uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, uint64_t target_addr, - uint32_t fake_syscall_number); + uint32_t fake_syscall_number, + uint8_t stub[20]); template <> void substitute_extended_jump( uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, - uint64_t target_addr, uint32_t) { + uint64_t target_addr, uint32_t, uint8_t stub[STUB_PATCH_LENGTH]) { int64_t offset = target_addr - (patch_addr + X86SyscallStubExtendedJump::trampoline_relative_addr_end); + int64_t ret_offset = + return_addr - + (patch_addr + X86SyscallStubExtendedJump::return_addr_relative_end); // An offset that appears to be > 2GB is OK here, since EIP will just // wrap around. - X86SyscallStubExtendedJump::substitute(buffer, (uint32_t)return_addr, - (uint32_t)offset); + X86SyscallStubExtendedJump::substitute(buffer, (uint32_t)offset, (char*)stub, + (uint32_t)ret_offset); } template <> void substitute_extended_jump( uint8_t* buffer, uint64_t, uint64_t return_addr, uint64_t target_addr, - uint32_t) { - X64SyscallStubExtendedJump::substitute(buffer, (uint32_t)return_addr, - (uint32_t)(return_addr >> 32), - target_addr); + uint32_t, uint8_t stub[STUB_PATCH_LENGTH]) { + X64SyscallStubExtendedJump::substitute(buffer, (char*)stub, + target_addr, return_addr); } template <> void substitute_extended_jump( uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, - uint64_t target_addr, uint32_t fake_syscall_number) { + uint64_t target_addr, uint32_t fake_syscall_number, uint8_t stub[STUB_PATCH_LENGTH]) { int64_t offset = target_addr - (patch_addr + X86SyscallStubExtendedJump::trampoline_relative_addr_end); + int64_t ret_offset = + return_addr - + (patch_addr + X86SyscallStubExtendedJump::return_addr_relative_end); // An offset that appears to be > 2GB is OK here, since EIP will just // wrap around. 
- X86TrapInstructionStubExtendedJump::substitute(buffer, (uint32_t)return_addr, - fake_syscall_number, (uint32_t)offset); + X86TrapInstructionStubExtendedJump::substitute(buffer, + fake_syscall_number, (uint32_t)offset, + (char*)stub, (uint32_t)ret_offset); } template <> void substitute_extended_jump( uint8_t* buffer, uint64_t, uint64_t return_addr, uint64_t target_addr, - uint32_t fake_syscall_number) { - X64TrapInstructionStubExtendedJump::substitute(buffer, (uint32_t)return_addr, - (uint32_t)(return_addr >> 32), - fake_syscall_number, - target_addr); + uint32_t fake_syscall_number, uint8_t stub[STUB_PATCH_LENGTH]) { + X64TrapInstructionStubExtendedJump::substitute(buffer, fake_syscall_number, (char*)stub, + target_addr, return_addr); } /** @@ -308,12 +313,14 @@ static void encode_immediate_aarch64(std::vector &buff, * movk x30, #:abs_g0_nc:_syscall_hook_trampoline // Might be shorter depending on the address * blr x30 * ldp x15, x30, [x15] -.Lreturn: - * b syscall_return_address + * b .Lreturn +.Lbail: + * ldp x15, x30, [x15] + ** // Safe suffix starts here .Lnosys: * svc 0x0 // the test relies on invalid syscall triggering an event. - * // mov x0, -ENOSYS - * b .Lreturn +.Lreturn: + * b syscall_return_address * .long * * And return the instruction index of `.Lreturn`. @@ -337,15 +344,16 @@ static uint32_t encode_extended_jump_aarch64(std::vector &buff, buff.push_back(0xd63f03c0); // ldp x15, x30, [x15] buff.push_back(0xa94079ef); + // b .+ 12 + buff.push_back(0x14000003); + // ldp x15, x30, [x15] + buff.push_back(0xa94079ef); + buff.push_back(0xd4000001); // svc 0 uint32_t ret_idx = buff.size(); buff.push_back(0); // place holder // b.hi . + (ret_inst + 4 - .) - buff[b_hi_idx] = 0x54000000 | ((ret_idx + 1 - b_hi_idx) << 5) | 0x8; - // movn x0, (ENOSYS - 1), i.e. mov x0, -ENOSYS - // buff.push_back(0x92800000 | ((ENOSYS - 1) << 5) | 0); - buff.push_back(0xd4000001); // svc 0 - // b .-2 - buff.push_back(0x17fffffe); + buff[b_hi_idx] = 0x54000000 | ((ret_idx - 1 - b_hi_idx) << 5) | 0x8; + uint32_t retaddr_idx = buff.size(); if (_retaddr_idx) *_retaddr_idx = retaddr_idx; @@ -449,20 +457,34 @@ static remote_ptr allocate_extended_jump_aarch64( return jump_addr; } -bool Monkeypatcher::is_jump_stub_instruction(remote_code_ptr ip, bool include_safearea) { +Monkeypatcher::patched_syscall *Monkeypatcher::find_jump_stub(remote_code_ptr ip, bool include_safearea) { remote_ptr pp = ip.to_data_ptr(); - auto it = syscallbuf_stubs.upper_bound(pp); - if (it == syscallbuf_stubs.begin()) { - return false; + auto it = syscallbuf_stubs_by_extended_patch.upper_bound(pp); + if (it == syscallbuf_stubs_by_extended_patch.begin()) { + return nullptr; } --it; auto begin = it->first; - auto end = begin + it->second.size; + patched_syscall *ps = &syscall_stub_list[it->second]; + auto end = begin + ps->size; if (!include_safearea) { - begin += it->second.safe_prefix; - end -= it->second.safe_suffix; + begin += ps->safe_prefix; + end -= ps->safe_suffix; + } + return begin <= pp && pp < end ? ps : nullptr; +} + +Monkeypatcher::patched_syscall *Monkeypatcher::find_syscall_patch(remote_code_ptr ip) { + remote_ptr pp = ip.to_data_ptr(); + auto it = syscallbuf_stubs_by_patch_addr.upper_bound(pp); + if (it == syscallbuf_stubs_by_patch_addr.begin()) { + return nullptr; } - return begin <= pp && pp < end; + --it; + auto begin = it->first; + patched_syscall *ps = &syscall_stub_list[it->second]; + auto end = begin + ps->hook->patch_region_length; + return begin <= pp && pp < end ? 
ps : nullptr; } remote_code_ptr Monkeypatcher::get_jump_stub_exit_breakpoint(remote_code_ptr ip, @@ -471,18 +493,61 @@ remote_code_ptr Monkeypatcher::get_jump_stub_exit_breakpoint(remote_code_ptr ip, return nullptr; } remote_ptr pp = ip.to_data_ptr(); - auto it = syscallbuf_stubs.upper_bound(pp); - if (it == syscallbuf_stubs.begin()) { + auto it = syscallbuf_stubs_by_extended_patch.upper_bound(pp); + if (it == syscallbuf_stubs_by_extended_patch.begin()) { return nullptr; } --it; - auto bp = it->first + it->second.size - it->second.safe_suffix; - if (pp == bp || pp == bp - 4) { - return remote_code_ptr(bp.as_int()); + patched_syscall *ps = &syscall_stub_list[it->second]; + auto bp = it->first + ps->size - ps->safe_suffix; + if (pp == bp - 4 || pp == bp - 8) { + return remote_code_ptr((it->first + ps->size - 4).as_int()); } return nullptr; } +template +uint64_t get_safe_suffix_length(); + +/* These need to match the size of the post-stack-restore region in assembly_templates.py */ +template <> +uint64_t get_safe_suffix_length() { + return 8 + 8 + 6 + 20 + 2; +} + +template <> +uint64_t get_safe_suffix_length() { + return 2 + 20 + 1 + 4; +} + + +static void fill_with_x86_nops(uint8_t *buf, size_t len) { + for (size_t i = 0; i < len;) { + switch (len - i) { + case 1: buf[i] = 0x90; return; + case 2: buf[i] = 0x66; buf[i+1] = 0x90; return; + case 3: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x00; return; + case 4: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x40; buf[i+3] = 0x00; return; + case 5: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x44; + buf[i+3] = 0x00; buf[i+4] = 0x00; return; + case 6: buf[i] = 0x66; buf[i+1] = 0x0f; buf[i+2] = 0x1f; + buf[i+3] = 0x44; buf[i+4] = 0x00; buf[i+5] = 0x00; return; + case 7: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x80; + buf[i+3] = 0x00; buf[i+4] = 0x00; buf[i+5] = 0x00; + buf[i+6] = 0x00; return; + case 8: buf[i] = 0x0f; buf[i+1] = 0x1f; buf[i+2] = 0x84; + buf[i+3] = 0x00; buf[i+4] = 0x00; buf[i+5] = 0x00; + buf[i+6] = 0x00; buf[i+7] = 0x00; return; + default: + case 9: + buf[i] = 0x66; buf[i+1] = 0x0f; buf[i+2] = 0x1f; + buf[i+3] = 0x84; buf[i+4] = 0x00; buf[i+5] = 0x00; + buf[i+6] = 0x00; buf[i+7] = 0x00; buf[i+8] = 0x00; + i += 9; continue; + } + } +} + /** * Some functions make system calls while storing local variables in memory * below the stack pointer.
We need to decrement the stack pointer by @@ -539,26 +604,40 @@ static bool patch_syscall_with_hook_x86ish(Monkeypatcher& patcher, return false; } + uint8_t stub[20]; + memset(stub, 0x90, sizeof(stub)); + if (!(hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST)) { + memcpy(stub, hook.patch_region_bytes, hook.patch_region_length); + fill_with_x86_nops(stub + hook.patch_region_length, sizeof(stub) - hook.patch_region_length); + } + + uint16_t safe_suffix = get_safe_suffix_length(); // Everything starting from the syscall instruction if (fake_syscall_number) { uint8_t stub_patch[FakeSyscallExtendedJumpPatch::size]; substitute_extended_jump(stub_patch, extended_jump_start.as_int(), return_addr, hook.hook_address, - fake_syscall_number); + fake_syscall_number, + stub); write_and_record_bytes(t, extended_jump_start, stub_patch); - patcher.syscallbuf_stubs[extended_jump_start] = { &hook, FakeSyscallExtendedJumpPatch::size }; + patcher.syscall_stub_list.push_back({ &hook, jump_patch_start, extended_jump_start, FakeSyscallExtendedJumpPatch::size, 0, safe_suffix }); + patcher.syscallbuf_stubs_by_extended_patch[extended_jump_start] = patcher.syscall_stub_list.size() - 1; + patcher.syscallbuf_stubs_by_patch_addr[jump_patch_start] = patcher.syscall_stub_list.size() - 1; } else { uint8_t stub_patch[ExtendedJumpPatch::size]; substitute_extended_jump(stub_patch, extended_jump_start.as_int(), return_addr, hook.hook_address, - 0); + 0, + stub); write_and_record_bytes(t, extended_jump_start, stub_patch); - patcher.syscallbuf_stubs[extended_jump_start] = { &hook, ExtendedJumpPatch::size }; + patcher.syscall_stub_list.push_back({ &hook, jump_patch_start, extended_jump_start, ExtendedJumpPatch::size, 0, safe_suffix }); + patcher.syscallbuf_stubs_by_extended_patch[extended_jump_start] = patcher.syscall_stub_list.size() - 1; + patcher.syscallbuf_stubs_by_patch_addr[jump_patch_start] = patcher.syscall_stub_list.size() - 1; } intptr_t jump_offset = extended_jump_start - jump_patch_end; @@ -627,8 +706,8 @@ bool patch_syscall_with_hook_arch(Monkeypatcher& patcher, auto total_patch_size = inst_buff.size() * 4; write_and_record_bytes(t, extended_jump_start, total_patch_size, &inst_buff[0]); - patcher.syscallbuf_stubs[extended_jump_start] = { - &hook, total_patch_size, + patcher.syscall_stub_list.push_back({ + &hook, svc_ip, extended_jump_start, total_patch_size, /** * safe_prefix: * We have not modified any registers yet in the first two instructions. @@ -641,13 +720,15 @@ * We've returned from syscallbuf and continue execution * won't hit syscallbuf breakpoint * (this also include the 8 bytes that stores the return address) - * Note that the 4th last instruction also belongs to the syscallbuf return path + * Note that the stack restore instruction also belongs to the syscallbuf return path * However, since it is still using the scratch memory, * it doesn't belong to the safe area. * The caller needs to have special handling for that instruction.
*/ - 3 * 4 + 8 - }; + 2 * 4 + 8 + }); + patcher.syscallbuf_stubs_by_extended_patch[extended_jump_start] = patcher.syscall_stub_list.size() - 1; + patcher.syscallbuf_stubs_by_patch_addr[svc_ip] = patcher.syscall_stub_list.size() - 1; intptr_t jump_offset = extended_jump_start - svc_ip; ASSERT(t, jump_offset <= aarch64_b_max_offset && jump_offset >= aarch64_b_min_offset) @@ -672,54 +753,6 @@ static bool patch_syscall_with_hook(Monkeypatcher& patcher, RecordTask* t, instruction_length, fake_syscall_number); } -template -static bool match_extended_jump_patch(Task* t, - uint8_t patch[], uint64_t* return_addr, vector* instruction); - -template <> -bool match_extended_jump_patch( - Task*, uint8_t patch[], uint64_t* return_addr, vector* instruction) { - uint32_t return_addr_lo, return_addr_hi; - uint64_t jmp_target; - if (!X64SyscallStubExtendedJump::match(patch, &return_addr_lo, &return_addr_hi, &jmp_target)) { - return false; - } - *instruction = rr::syscall_instruction(x86_64); - *return_addr = return_addr_lo | (((uint64_t)return_addr_hi) << 32); - return true; -} - -template <> -bool match_extended_jump_patch( - Task* t, uint8_t patch[], uint64_t* return_addr, vector* instruction) { - uint32_t return_addr_lo, return_addr_hi, fake_syscall_no; - uint64_t jmp_target; - if (!X64TrapInstructionStubExtendedJump::match(patch, &return_addr_lo, &return_addr_hi, - &fake_syscall_no, &jmp_target)) { - return false; - } - *return_addr = return_addr_lo | (((uint64_t)return_addr_hi) << 32); - if ((int)fake_syscall_no == t->session().syscall_number_for_rrcall_rdtsc()) { - instruction->resize(sizeof(rdtsc_insn)); - memcpy(instruction->data(), rdtsc_insn, instruction->size()); - } else { - ASSERT(t, false) << "Unknown fake-syscall number " << fake_syscall_no; - } - return true; -} - -template <> -bool match_extended_jump_patch( - Task*, uint8_t patch[], uint64_t* return_addr, vector* instruction) { - uint32_t return_addr_32, jmp_target_relative; - if (!X86SyscallStubExtendedJump::match(patch, &return_addr_32, &jmp_target_relative)) { - return false; - } - *return_addr = return_addr_32; - *instruction = rr::syscall_instruction(x86); - return true; -} - template static void substitute_replacement_patch(uint8_t *buffer, uint64_t patch_addr, uint64_t jmp_target); @@ -745,29 +778,13 @@ void substitute_replacement_patch(uint8_t *buffer, uint64 template static void unpatch_extended_jumps(Monkeypatcher& patcher, Task* t) { - // If these were the same size then the logic below wouldn't work. 
static_assert(ExtendedJumpPatch::size < FakeSyscallExtendedJumpPatch::size); - for (auto patch : patcher.syscallbuf_stubs) { - const syscall_patch_hook &hook = *patch.second.hook; + for (auto &patch : patcher.syscall_stub_list) { + const syscall_patch_hook &hook = *patch.hook; + ASSERT(t, patch.size <= FakeSyscallExtendedJumpPatch::size); uint8_t bytes[FakeSyscallExtendedJumpPatch::size]; - t->read_bytes_helper(patch.first, patch.second.size, bytes); - uint64_t return_addr = 0; - vector syscall; - if (patch.second.size == ExtendedJumpPatch::size) { - if (!match_extended_jump_patch( - t, bytes, &return_addr, &syscall)) { - ASSERT(t, false) << "Failed to match extended jump patch at " << patch.first; - return; - } - } else if (patch.second.size == FakeSyscallExtendedJumpPatch::size) { - if (!match_extended_jump_patch( - t, bytes, &return_addr, &syscall)) { - ASSERT(t, false) << "Failed to match trap-instruction extended jump patch at " << patch.first; - return; - } - } else { - ASSERT(t, false) << "Unknown patch size " << patch.second.size; - } + uint64_t return_addr = patch.patch_addr.as_int() + hook.patch_region_length; + std::vector syscall = rr::syscall_instruction(t->arch()); // Replace with // extended_jump: // // // // jmp *(return_addr) // As long as there are not relative branches or anything, this should // always be correct. - size_t new_patch_size = hook.patch_region_length + syscall.size() + ReplacementPatch::size; - ASSERT(t, new_patch_size <= sizeof(bytes)); - uint8_t* ptr = bytes; - if (!(hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST)) { - memcpy(ptr, syscall.data(), syscall.size()); - ptr += syscall.size(); - } - memcpy(ptr, hook.patch_region_bytes, hook.patch_region_length); - ptr += hook.patch_region_length; + ASSERT(t, hook.patch_region_length + ReplacementPatch::size + syscall.size() < + ExtendedJumpPatch::size); + uint8_t *ptr = bytes; if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { memcpy(ptr, syscall.data(), syscall.size()); ptr += syscall.size(); + memcpy(ptr, hook.patch_region_bytes, hook.patch_region_length); + substitute_replacement_patch(ptr, + patch.stub_addr.as_int()+(ptr-bytes), return_addr); + t->write_bytes_helper(patch.stub_addr, sizeof(bytes), bytes); + } else { + // We already have a copy of the replaced bytes in place - all we need to + // do is to nop out the preceding instructions + uint64_t nop_area_size = ExtendedJumpPatch::size - get_safe_suffix_length(); + memset(ptr, 0x90, nop_area_size); + t->write_bytes_helper(patch.stub_addr, nop_area_size, bytes); } - substitute_replacement_patch(ptr, - patch.first.as_int() + hook.patch_region_length + syscall.size(), return_addr); - t->write_bytes_helper(patch.first, new_patch_size, bytes); } } @@ -818,19 +836,19 @@ void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { template <> void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { - for (auto patch : patcher.syscallbuf_stubs) { - const syscall_patch_hook &hook = *patch.second.hook; + for (auto patch : patcher.syscall_stub_list) { + const syscall_patch_hook &hook = *patch.hook; std::vector hook_prefix; uint32_t prefix_ninst; encode_extended_jump_aarch64(hook_prefix, hook.hook_address, 0, &prefix_ninst); uint32_t prefix_size = prefix_ninst * 4; DEBUG_ASSERT(prefix_size <= 13 * 4); - ASSERT(t, patch.second.size >= prefix_size + 8); + ASSERT(t, patch.size >= prefix_size + 8); uint8_t bytes[15 * 4]; - t->read_bytes_helper(patch.first, prefix_size + 8, bytes); +
t->read_bytes_helper(patch.stub_addr, prefix_size + 8, bytes); // 3rd last instruction is the one jumping back and it won't match if (memcmp(&hook_prefix[0], bytes, prefix_size - 3 * 4) != 0) { - ASSERT(t, false) << "Failed to match extended jump patch at " << patch.first; + ASSERT(t, false) << "Failed to match extended jump patch at " << patch.stub_addr; return; } @@ -840,7 +858,7 @@ void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { uint32_t svc_inst = 0xd4000001; memcpy(bytes, &svc_inst, 4); - uint64_t reverse_jump_addr = patch.first.as_int() + 4; + uint64_t reverse_jump_addr = patch.stub_addr.as_int() + 4; int64_t reverse_offset = int64_t(return_addr - reverse_jump_addr); ASSERT(t, reverse_offset <= aarch64_b_max_offset && reverse_offset >= aarch64_b_min_offset) @@ -849,7 +867,7 @@ void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { uint32_t binst = 0x14000000 | offset_imm26; memcpy(&bytes[4], &binst, 4); - t->write_bytes_helper(patch.first, 4 * 2, bytes); + t->write_bytes_helper(patch.stub_addr, 4 * 2, bytes); } } diff --git a/src/Monkeypatcher.h b/src/Monkeypatcher.h index 9daf95b3c28..97ac7007d9a 100644 --- a/src/Monkeypatcher.h +++ b/src/Monkeypatcher.h @@ -125,24 +125,34 @@ class Monkeypatcher { }; std::vector extended_jump_pages; - bool is_jump_stub_instruction(remote_code_ptr p, bool include_safearea); - // Return the breakpoint instruction (i.e. the last branch back to caller) - // if we are on the exit path in the jump stub - remote_code_ptr get_jump_stub_exit_breakpoint(remote_code_ptr ip, RecordTask *t); struct patched_syscall { // Pointer to hook inside the syscall_hooks array, which gets initialized // once and is fixed afterwars. const syscall_patch_hook *hook; + remote_ptr patch_addr; + remote_ptr stub_addr; size_t size; uint16_t safe_prefix = 0; uint16_t safe_suffix = 0; }; + patched_syscall *find_jump_stub(remote_code_ptr ip, bool include_safearea); + bool is_jump_stub_instruction(remote_code_ptr p, bool include_safearea) { + return (bool)find_jump_stub(p, include_safearea); + } + + patched_syscall *find_syscall_patch(remote_code_ptr patch_location); + + // Return the breakpoint instruction (i.e. the last branch back to caller) + // if we are on the exit path in the jump stub + remote_code_ptr get_jump_stub_exit_breakpoint(remote_code_ptr ip, RecordTask *t); /** * Addresses/lengths of syscallbuf stubs. 
*/ - std::map, patched_syscall> syscallbuf_stubs; + std::vector syscall_stub_list; + std::map, int> syscallbuf_stubs_by_extended_patch; + std::map, int> syscallbuf_stubs_by_patch_addr; private: /** diff --git a/src/RecordSession.cc b/src/RecordSession.cc index d765957975c..2c7e3ff30fa 100644 --- a/src/RecordSession.cc +++ b/src/RecordSession.cc @@ -471,7 +471,6 @@ void RecordSession::handle_seccomp_traced_syscall(RecordTask* t, SupportedArch syscall_arch = t->detect_syscall_arch(); t->canonicalize_regs(syscall_arch); if (!process_syscall_entry(t, step_state, result, syscall_arch)) { - step_state->continue_type = RecordSession::DONT_CONTINUE; return; } *did_enter_syscall = true; @@ -508,6 +507,8 @@ static void seccomp_trap_done(RecordTask* t) { (uint8_t)1); } +extern void disarm_desched_event(RecordTask *t); +extern void leave_syscallbuf(RecordTask *t); static void handle_seccomp_trap(RecordTask* t, RecordSession::StepState* step_state, uint16_t seccomp_data) { @@ -542,27 +543,21 @@ static void handle_seccomp_trap(RecordTask* t, } } - if (t->is_in_untraced_syscall()) { - ASSERT(t, !t->delay_syscallbuf_reset_for_seccomp_trap); - // Don't reset the syscallbuf immediately after delivering the trap. We have - // to wait until this buffered syscall aborts completely before resetting - // the buffer. - t->delay_syscallbuf_reset_for_seccomp_trap = true; - - t->push_event(Event::seccomp_trap()); - + bool is_untraced_syscall = t->is_in_untraced_syscall(); + if (is_untraced_syscall) { // desched may be armed but we're not going to execute the syscall, let - // alone block. If it fires, ignore it. - t->write_mem( - REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant), - (uint8_t)0); + // alone block. Disarm the event and if it fires, ignore it. + disarm_desched_event(t); + leave_syscallbuf(t); + r = t->regs(); } + t->canonicalize_regs(t->detect_syscall_arch()); t->push_syscall_event(syscallno); t->ev().Syscall().failed_during_preparation = true; note_entering_syscall(t); - if (t->is_in_untraced_syscall() && !syscall_entry_already_recorded) { + if (is_untraced_syscall && !syscall_entry_already_recorded) { t->record_current_event(); } @@ -578,10 +573,21 @@ static void handle_seccomp_trap(RecordTask* t, si.native_api.si_code = SYS_SECCOMP; si.native_api._sifields._sigsys._arch = to_audit_arch(r.arch()); si.native_api._sifields._sigsys._syscall = syscallno; + // Documentation says that si_call_addr is the address of the syscall // instruction, but in tests it's immediately after the syscall // instruction. - si.native_api._sifields._sigsys._call_addr = t->ip().to_data_ptr(); + remote_code_ptr seccomp_ip = t->ip(); + + /* If we actually deliver this signal, we will fudge the ip value to instead + point into the patched-out syscall. The callee may rely on these values + matching, so do the same adjustment here. 
*/ + Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_jump_stub(seccomp_ip, true); + if (ps) { + seccomp_ip = (ps->patch_addr + (seccomp_ip - ps->stub_addr.as_int()).register_value() - (ps->size - ps->safe_suffix)).as_int(); + } + + si.native_api._sifields._sigsys._call_addr = seccomp_ip.to_data_ptr(); LOG(debug) << "Synthesizing " << si.linux_api; t->stash_synthetic_sig(si.linux_api, DETERMINISTIC_SIG); @@ -591,16 +597,31 @@ static void handle_seccomp_trap(RecordTask* t, t->set_regs(r); t->maybe_restore_original_syscall_registers(); - if (t->is_in_untraced_syscall()) { + if (is_untraced_syscall) { + Registers r = t->regs(); + // Cause kernel processing to skip the syscall + r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); + t->set_regs(r); + uintptr_t orig_arg1 = r.arg1(); + + // The tracee is currently in the seccomp ptrace-stop or syscall-entry stop. + // Advance it to the syscall-exit stop so that when we try to deliver the SIGSYS via + // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop. + t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + if (t->status().ptrace_event() == PTRACE_EVENT_SECCOMP) { + t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + } + + if (t->arch() == aarch64) { + r = t->regs(); + r.set_arg1(orig_arg1); + t->set_regs(r); + } + // For buffered syscalls, go ahead and record the exit state immediately. t->ev().Syscall().state = EXITING_SYSCALL; t->record_current_event(); t->pop_syscall(); - - // The tracee is currently in the seccomp ptrace-stop. Advance it to the - // syscall-exit stop so that when we try to deliver the SIGSYS via - // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop. - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); } // Don't continue yet. At the next iteration of record_step, if we @@ -815,12 +836,6 @@ void RecordSession::task_continue(const StepState& step_state) { // A task in an emulated ptrace-stop must really stay stopped ASSERT(t, !t->emulated_stop_pending); - bool may_restart = t->at_may_restart_syscall(); - - if (may_restart && t->seccomp_bpf_enabled) { - LOG(debug) << " PTRACE_SYSCALL to possibly-restarted " << t->ev(); - } - if (!t->vm()->first_run_event()) { t->vm()->set_first_run_event(trace_writer().time()); } @@ -892,7 +907,7 @@ void RecordSession::task_continue(const StepState& step_state) { makes PTRACE_SYSCALL traps be delivered *before* seccomp RET_TRACE traps. Detect and handle this. */ - if (!t->seccomp_bpf_enabled || may_restart || + if (!t->seccomp_bpf_enabled || syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) { resume = RESUME_SYSCALL; } else { @@ -1232,6 +1247,17 @@ void RecordSession::syscall_state_changed(RecordTask* t, ASSERT(t, t->regs().original_syscallno() == -1); } rec_did_sigreturn(t); + + /* The inverse of the processing we do during signal delivery - if the IP + points into a region that we patched out, move us to the extended jump + patch instead. */ + Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_syscall_patch(t->ip()); + if (ps) { + Registers r = t->regs(); + r.set_ip((ps->stub_addr + (r.ip() - ps->patch_addr.as_int()).register_value() + (ps->size - ps->safe_suffix)).as_int()); + t->set_regs(r); + } + t->record_current_event(); t->pop_syscall(); @@ -1500,6 +1526,7 @@ static bool inject_handled_signal(RecordTask* t) { t->stashed_signal_processed(); int sig = t->ev().Signal().siginfo.si_signo; + do { // We are ready to inject our signal. 
// XXX we assume the kernel won't respond by notifying us of a different @@ -1557,6 +1584,69 @@ return true; } +static ssize_t get_sigframe_size(SupportedArch arch) { + if (is_x86ish(arch)) { + // It's somewhat difficult engineering-wise to + // compute the sigframe size at compile time, + // and it can vary across kernel versions and CPU + // microarchitectures. So this size is an overestimate + // of the real size(s). + // + // If this size becomes too small in the + // future, and unit tests that use sighandlers + // are run with checksumming enabled, then + // they can catch errors here. + return 1152 /* Overestimate of kernel sigframe */ + + 128 /* Redzone */ + + /* this returns 512 when XSAVE unsupported */ + xsave_area_size(); + } else if (arch == aarch64) { + return sizeof(ARM64Arch::rt_sigframe) + + sizeof(ARM64Arch::user_fpsimd_state); + } else { + DEBUG_ASSERT(0 && "Add sigframe size for your architecture here"); + return 0; + } +} + +template +static remote_ptr get_sigframe_ip_ptr(remote_ptr frame_ptr); + +template <> +remote_ptr get_sigframe_ip_ptr(remote_ptr frame_ptr) { + return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), regs), pc); +} + +template <> +remote_ptr get_sigframe_ip_ptr(remote_ptr frame_ptr) { + return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), ip); +} + +template <> +remote_ptr get_sigframe_ip_ptr(remote_ptr frame_ptr) { + return REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame_ptr, uc), uc_mcontext), ip); +} + +template +static remote_code_ptr get_sigframe_ip_arch(RecordTask *t, remote_ptr frame_ptr) +{ + return t->read_mem(get_sigframe_ip_ptr(frame_ptr)); +} + +static remote_code_ptr get_sigframe_ip(RecordTask *t, remote_ptr frame_ptr) { + RR_ARCH_FUNCTION(get_sigframe_ip_arch, t->arch(), t, frame_ptr.as_int()); +} + +template +static void set_sigframe_ip_arch(RecordTask *t, remote_ptr frame_ptr, remote_code_ptr ip) +{ + t->write_mem(get_sigframe_ip_ptr(frame_ptr), (typename Arch::unsigned_long)ip.register_value()); +} + +static void set_sigframe_ip(RecordTask *t, remote_ptr frame_ptr, remote_code_ptr ip) { + RR_ARCH_FUNCTION(set_sigframe_ip_arch, t->arch(), t, frame_ptr.as_int(), ip); +} + /** * |t| is being delivered a signal, and its state changed. * Must call t->stashed_signal_processed() once we're ready to unmask signals. @@ -1601,26 +1691,37 @@ bool RecordSession::signal_state_changed(RecordTask* t, StepState* step_state) { break; } - if (is_x86ish(t->arch())) { - // It's somewhat difficult engineering-wise to - // compute the sigframe size at compile time, - // and it can vary across kernel versions and CPU - // microarchitectures. So this size is an overestimate - // of the real size(s). - // - // If this size becomes too small in the - // future, and unit tests that use sighandlers - // are run with checksumming enabled, then - // they can catch errors here. - sigframe_size = 1152 /* Overestimate of kernel sigframe */ + - 128 /* Redzone */ + - /* this returns 512 when XSAVE unsupported */ - xsave_area_size(); - } else if (t->arch() == aarch64) { - sigframe_size = sizeof(ARM64Arch::rt_sigframe) + - sizeof(ARM64Arch::user_fpsimd_state); - } else { - DEBUG_ASSERT(0 && "Add sigframe size for your architecture here"); + sigframe_size = get_sigframe_size(t->arch()); + + /* + * If we're delivering a signal while in the extended jump patch, pretend we're in the
That way, any unwinder that makes use of CFI for unwinding + * will see the correct unwind info of the patch site rather than that of the extended + * jump patch. The instruction sequence in the original code was of course altered by + * the patch, so if the signal handler inspects that, it might get confused. However, + * that is already a general problem with our patching strategy, in that the application + * is not allowed to read its own code. + * Naturally, we need to perform the inverse transformation in sigreturn. + * + * N.B.: We do this by modifying the sigframe after signal deliver, rather + * than modifying the registers during signal delivery, because on some platforms + * (e.g. aarch64, the kernel will adjust the pre-signal registers after the signal stop). + */ + remote_ptr sigframe = t->regs().sp().cast(); + remote_code_ptr ip = get_sigframe_ip(t, sigframe); + Monkeypatcher::patched_syscall *ps = t->vm()->monkeypatcher().find_jump_stub(ip, true); + if (ps) { + uint64_t translated_patch_offset = (ip - ps->stub_addr.as_int()).register_value() - (ps->size - ps->safe_suffix); + // We patch out the jump stub with nop, but of course, if we happen to find ourselves + // in the middle of the nop sled, we just want to end up at the end of the patch + // region. + size_t total_patch_region_size = ps->hook->patch_region_length + + rr::syscall_instruction_length(t->arch()); + if (translated_patch_offset > total_patch_region_size) { + translated_patch_offset = total_patch_region_size; + } + set_sigframe_ip(t, sigframe, ps->patch_addr.as_int() + translated_patch_offset); + LOG(debug) << "Moved ip from extended jump patch to patch area"; } t->ev().transform(EV_SIGNAL_HANDLER); @@ -1909,32 +2010,22 @@ static bool is_ptrace_any_sysemu(SupportedArch arch, int command) bool RecordSession::process_syscall_entry(RecordTask* t, StepState* step_state, RecordResult* step_result, SupportedArch syscall_arch) { - if (const RecordTask::StashedSignal* sig = t->stashed_sig_not_synthetic_SIGCHLD()) { - // The only four cases where we allow a stashed signal to be pending on - // syscall entry are: - // -- the signal is a ptrace-related signal, in which case if it's generated - // during a blocking syscall, it does not interrupt the syscall - // -- rrcall_notify_syscall_hook_exit, which is effectively a noop and - // lets us dispatch signals afterward - // -- when we're entering a blocking untraced syscall. If it really blocks, - // we'll get the desched-signal notification and dispatch our stashed - // signal. - // -- when we're doing a privileged syscall that's internal to the preload - // logic - // We do not generally want to have stashed signals pending when we enter - // a syscall, because that will execute with a hacked signal mask - // (see RecordTask::will_resume_execution) which could make things go wrong. - ASSERT(t, - t->desched_rec() || is_rrcall_notify_syscall_hook_exit_syscall( - t->regs().original_syscallno(), t->arch()) || - t->ip() == - t->vm() - ->privileged_traced_syscall_ip() - .increment_by_syscall_insn_length(t->arch())) - << "Stashed signal pending on syscall entry when it shouldn't be: " - << sig->siginfo << "; regs=" << t->regs() - << "; last_execution_resume=" << t->last_execution_resume() - << "; sig ip=" << sig->ip; + if (!t->is_in_syscallbuf() && t->stashed_sig_not_synthetic_SIGCHLD()) { + // If we have a pending signal, deliver it as if it had happened just before + // execution of the syscall instruction. 
To this end, kick us out of the + // current syscall again and set up the registers for a restart. Regular + // signal injection will do the rest. + LOG(debug) << "Entered syscall, but signal pending - setting up pre-syscall signal delivery"; + Registers entry_regs = t->regs(); + Registers r = entry_regs; + // Cause kernel processing to skip the syscall + r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); + t->set_regs(r); + t->exit_syscall(); + entry_regs.set_ip(entry_regs.ip().decrement_by_syscall_insn_length(syscall_arch)); + entry_regs.set_syscallno(entry_regs.original_syscallno()); + t->set_regs(entry_regs); + return false; } // We just entered a syscall. diff --git a/src/RecordTask.cc b/src/RecordTask.cc index 6e4b3a2e819..0b6e5ff71a8 100644 --- a/src/RecordTask.cc +++ b/src/RecordTask.cc @@ -634,6 +634,9 @@ bool RecordTask::will_resume_execution(ResumeRequest, WaitRequest, if (!set_sigmask(sigset)) { return false; } + LOG(debug) << "Set signal mask to block all signals (bar " + << "SYSCALLBUF_DESCHED_SIGNAL/TIME_SLICE_SIGNAL) while we " + << " have a stashed signal"; } // RESUME_NO_TICKS means that tracee code is not going to run so there's no @@ -710,7 +713,9 @@ void RecordTask::did_wait() { // state, because we do not allow stashed_signals_blocking_more_signals // to hold across syscalls (traced or untraced) that change the signal mask. ASSERT(this, !blocked_sigs_dirty); - xptrace(PTRACE_SETSIGMASK, remote_ptr(8), &blocked_sigs); + if (set_sigmask(blocked_sigs)) { + LOG(debug) << "Blocked signals restored"; + } } else if (syscallbuf_child) { // The syscallbuf struct is only 32 bytes currently so read the whole thing // at once to avoid multiple calls to read_mem. Even though this shouldn't @@ -1294,10 +1299,6 @@ bool RecordTask::set_sigmask(sig_set_t mask) { return false; } ASSERT(this, errno == EINVAL); - } else { - LOG(debug) << "Set signal mask to block all signals (bar " - << "SYSCALLBUF_DESCHED_SIGNAL/TIME_SLICE_SIGNAL) while we " - << " have a stashed signal"; } return true; } diff --git a/src/Registers.h b/src/Registers.h index 5a0dff4bdcd..3ce0dceeb42 100644 --- a/src/Registers.h +++ b/src/Registers.h @@ -434,6 +434,11 @@ class Registers { return u.arm64regs.x[7]; } + uintptr_t x15() const { + DEBUG_ASSERT(arch() == aarch64); + return u.arm64regs.x[15]; + } + uintptr_t xlr() const { DEBUG_ASSERT(arch() == aarch64); return u.arm64regs.x[30]; diff --git a/src/assembly_templates.py b/src/assembly_templates.py index 866eba71055..3ada7ed1552 100644 --- a/src/assembly_templates.py +++ b/src/assembly_templates.py @@ -14,13 +14,16 @@ class Field(object): def __init__(self, name, byte_length): self.name = name self.byte_length = byte_length + self.types = { 8: 'uint64_t', 4: 'uint32_t', 2: 'uint16_t', 1: 'uint8_t' } def __len__(self): return self.byte_length def c_type(self): - types = { 8: 'uint64_t', 4: 'uint32_t', 2: 'uint16_t', 1: 'uint8_t' } - return types[self.byte_length] + return self.types[self.byte_length] if (self.byte_length in self.types) else 'char' + + def c_arr(self): + return '' if (self.byte_length in self.types) else '[' + str(self.byte_length) + ']' class ShiftField(object): """A field embedded at some bit shift offset in another object.""" @@ -37,6 +40,9 @@ def c_type(self): types = { 8: 'uint64_t', 4: 'uint32_t', 2: 'uint16_t', 1: 'uint8_t' } return types[self.byte_length] + def c_arr(self): + return '' + def patch_c_type(self): types = { 8: 'uint64_t', 4: 'uint32_t', 2: 'uint16_t', 1: 'uint8_t' } return types[len(self.parent)] @@ -96,10 +102,15 
@@ def bytes(self): RawBytes(0x8b, 0x25, 0x00, 0x10, 0x00, 0x70), # movl (syscallbuf_stub_alt_stack),%esp # dont_switch: RawBytes(0xff, 0x35, 0x08, 0x10, 0x00, 0x70), # pushl (stub_scratch_1) - RawBytes(0x68), # pushl $return_addr - Field('return_addr', 4), - RawBytes(0xe9), # jmp $trampoline_relative_addr - Field('trampoline_relative_addr', 4) + RawBytes(0xe8), # call $trampoline_relative_addr + Field('trampoline_relative_addr', 4), + # Restore the stack pointer + RawBytes(0x5c), # popl %esp + RawBytes(0xcd, 0x80), # int $0x80 + Field('stub', 20), + RawBytes(0xe9), # jmp $return_addr_relative + Field('return_addr_relative', 4) + ), 'X86TrapInstructionStubExtendedJump': AssemblyTemplate( # This code must match the stubs in syscall_hook.S. @@ -113,12 +124,16 @@ def bytes(self): RawBytes(0x8b, 0x25, 0x00, 0x10, 0x00, 0x70), # movl (syscallbuf_stub_alt_stack),%esp # dont_switch: RawBytes(0xff, 0x35, 0x08, 0x10, 0x00, 0x70), # pushl (stub_scratch_1) - RawBytes(0x68), # pushl $return_addr - Field('return_addr', 4), RawBytes(0xb8), # movl $fake_syscall_no,%eax Field('fake_syscall_no', 4), - RawBytes(0xe9), # jmp $trampoline_relative_addr - Field('trampoline_relative_addr', 4) + RawBytes(0xe8), # call $trampoline_relative_addr + Field('trampoline_relative_addr', 4), + # Restore the stack pointer + RawBytes(0x5c), # popl %esp + RawBytes(0xcd, 0x80), # int $0x80 + Field('stub', 20), + RawBytes(0xe9), # jmp $return_addr_relative + Field('return_addr_relative', 4) ), 'X86SyscallStubRestore': AssemblyTemplate( RawBytes(0xe9), # jmp $trampoline_relative_addr @@ -146,14 +161,16 @@ def bytes(self): # dont_switch: RawBytes(0x48, 0x81, 0xec, 0x00, 0x01, 0x00, 0x00), # subq $256, %rsp # after adjust + # Push the stack pointer we saved above onto our new stack RawBytes(0xff, 0x34, 0x25, 0x10, 0x10, 0x00, 0x70), # pushq (stub_scratch_1) - RawBytes(0x50), # pushq rax (just to make space for the next 2 instructions) - RawBytes(0xc7, 0x04, 0x24), # movl $return_addr_lo,(%rsp) - Field('return_addr_lo', 4), - RawBytes(0xc7, 0x44, 0x24, 0x04), # movl $return_addr_hi,(%rsp+4) - Field('return_addr_hi', 4), - RawBytes(0xff, 0x25, 0x00, 0x00, 0x00, 0x00), # jmp *0(%rip) + RawBytes(0xff, 0x15, 0x1d, 0x00, 0x00, 0x00), # callq *jump_target(%rip) + # Restore the stack pointer + RawBytes(0x5c), # popq %rsp + RawBytes(0x0f, 0x05), # syscall + Field('stub', 20), + RawBytes(0xff, 0x25, 0x08, 0x00, 0x00, 0x00), # jmp *8(%rip) Field('jump_target', 8), + Field('return_addr', 8) ), 'X64TrapInstructionStubExtendedJump': AssemblyTemplate( # This code must match the stubs in syscall_hook.S. 
@@ -170,15 +187,16 @@ def bytes(self): RawBytes(0x48, 0x81, 0xec, 0x00, 0x01, 0x00, 0x00), # subq $256, %rsp # after adjust RawBytes(0xff, 0x34, 0x25, 0x10, 0x10, 0x00, 0x70), # pushq (stub_scratch_1) - RawBytes(0x50), # pushq rax (just to make space for the next 2 instructions) - RawBytes(0xc7, 0x04, 0x24), # movl $return_addr_lo,(%rsp) - Field('return_addr_lo', 4), - RawBytes(0xc7, 0x44, 0x24, 0x04), # movl $return_addr_hi,(%rsp+4) - Field('return_addr_hi', 4), RawBytes(0xb8), # movl $fake_syscall_no,%eax Field('fake_syscall_no', 4), - RawBytes(0xff, 0x25, 0x00, 0x00, 0x00, 0x00), # jmp *0(%rip) + RawBytes(0xff, 0x15, 0x1d, 0x00, 0x00, 0x00), # callq *jump_target(%rip) + # Restore the stack pointer + RawBytes(0x5c), # popq %rsp + RawBytes(0x0f, 0x05), # syscall + Field('stub', 20), + RawBytes(0xff, 0x25, 0x08, 0x00, 0x00, 0x00), # jmp *8(%rip) Field('jump_target', 8), + Field('return_addr', 8) ), 'X64SyscallStubRestore': AssemblyTemplate( RawBytes(0xff, 0x25, 0x00, 0x00, 0x00, 0x00), # jmp *0(%rip) @@ -232,7 +250,8 @@ def generate_match_method(byte_array, template): fields = template.fields() field_types = [f.c_type() for f in fields] field_names = [f.name for f in fields] - args = ', ' + ', '.join("%s* %s" % (t, n) for t, n in zip(field_types, field_names)) \ + field_arrs = [f.c_arr() for f in fields] + args = ', ' + ', '.join("%s (*%s)%s" % (t, n, a) for t, n, a in zip(field_types, field_names, field_arrs)) \ if fields else '' s.write(' static bool match(const uint8_t* buffer %s) {\n' % (args,)) @@ -240,8 +259,8 @@ def generate_match_method(byte_array, template): for chunk in template.chunks: if isinstance(chunk, Field): field_name = chunk.name - s.write(' memcpy(%s, &buffer[%d], sizeof(*%s));\n' - % (field_name, offset, field_name)) + s.write(' memcpy(%s, &buffer[%d], %d);\n' + % (field_name, offset, len(chunk))) elif isinstance(chunk, ShiftField): s.write(' (void)%s;' % chunk.name) s.write(' assert(0 && "Matching not implemented for ShiftField");') @@ -256,8 +275,8 @@ def generate_match_method(byte_array, template): def generate_substitute_chunk(s, chunk, byte_array, offset): if isinstance(chunk, Field): field_name = chunk.name - s.write(' memcpy(&buffer[%d], &%s, sizeof(%s));\n' - % (offset, field_name, field_name)) + s.write(' memcpy(&buffer[%d], &%s, %d);\n' + % (offset, field_name if chunk.c_arr() == '' else '*'+field_name, len(chunk))) elif isinstance(chunk, ShiftField): generate_substitute_chunk(s, chunk.parent, byte_array, offset); typ = chunk.patch_c_type() @@ -275,7 +294,8 @@ def generate_substitute_method(byte_array, template): fields = template.fields() field_types = [f.c_type() for f in fields] field_names = [f.name for f in fields] - args = ', ' + ', '.join("%s %s" % (t, n) for t, n in zip(field_types, field_names)) \ + field_arrs = [f.c_arr() for f in fields] + args = ', ' + ', '.join("%s %s%s" % (t, n, a) for t, n, a in zip(field_types, field_names, field_arrs)) \ if fields else '' s.write(' static void substitute(uint8_t* buffer %s) {\n' % (args,)) diff --git a/src/kernel_abi.h b/src/kernel_abi.h index 0a6f4b3d633..5df30a875f3 100644 --- a/src/kernel_abi.h +++ b/src/kernel_abi.h @@ -2051,6 +2051,21 @@ struct X64Arch : public BaseArch { }; RR_VERIFY_TYPE_ARCH(SupportedArch::x86_64, ::sigcontext, sigcontext); + struct ucontext { + unsigned_long uc_flags; + ptr uc_link; + stack_t uc_stack; + struct sigcontext uc_mcontext; + kernel_sigset_t uc_sigmask; + }; + + struct rt_sigframe { + ptr pretcode; + struct ucontext uc; + siginfo_t info; + // Extended ISA state follows + 
}; + struct user_fpregs_struct { uint16_t cwd; uint16_t swd; @@ -2274,6 +2289,24 @@ struct X86Arch : public BaseArch { }; RR_VERIFY_TYPE_ARCH(SupportedArch::x86, ::sigcontext, sigcontext); + struct ucontext { + unsigned_long uc_flags; + ptr uc_link; + stack_t uc_stack; + struct sigcontext uc_mcontext; + kernel_sigset_t uc_sigmask; + }; + + struct rt_sigframe { + ptr pretcode; + int sig; + uint32_t pinfo; + uint32_t puc; + siginfo_t info; + struct ucontext uc; + // Extended ISA state follows + }; + struct user { user_regs_struct regs; int u_fpvalid; diff --git a/src/preload/preload_interface.h b/src/preload/preload_interface.h index 62db5a52810..e2265c9be2c 100644 --- a/src/preload/preload_interface.h +++ b/src/preload/preload_interface.h @@ -178,6 +178,7 @@ static inline const char* extract_file_name(const char* s) { */ #define PATCH_SYSCALL_INSTRUCTION_IS_LAST (1 << 1) +#define STUB_PATCH_LENGTH 20 /** * To support syscall buffering, we replace syscall instructions with a "call" * instruction that calls a hook in the preload library to handle the syscall. @@ -493,7 +494,7 @@ struct syscallbuf_record { /* Does this record require an assist during replay ? */ uint8_t replay_assist : 1; uint8_t _flags_padding : 6; - uint8_t _padding; + uint8_t aborted; /* Size of entire record in bytes: this struct plus extra * recorded data stored inline after the last field, not * including padding. diff --git a/src/preload/syscall_hook.S b/src/preload/syscall_hook.S index fc963d2c9e2..9191f5f9a6d 100644 --- a/src/preload/syscall_hook.S +++ b/src/preload/syscall_hook.S @@ -31,6 +31,16 @@ ((val) & (0xFF << 0x30)) >> 0x30, \ ((val) & (0xFF << 0x38)) >> 0x38 +#define DW_OP_deref 0x06 +#define DW_OP_dup 0x12 +#define DW_OP_minus 0x1c +#define DW_OP_lit(val) 0x30+val +#define DW_OP_plus 0x22 +#define DW_OP_plus_uconst(const) 0x23, const + +#define DW_CFA_def_cfa_expression 0x0f +#define DW_CFA_val_expression 0x16 + #define REG_AT_ADDR32(reg, addr) \ .cfi_escape 0x10, /* DW_CFA_expression */ \ reg, \ @@ -42,6 +52,20 @@ 0x09, /* 9 byte expression follows */ \ DW_OP_CONST8U(addr) +#define REG_AT_ADDR32_PLUS_OFFSET(reg, addr, offset) \ + .cfi_escape DW_CFA_val_expression, \ + reg, \ + 0x08, /* 8 byte expression follows */ \ + DW_OP_CONST4U(addr), \ + DW_OP_deref, \ + DW_OP_plus_uconst(offset); + +#define CFA_AT_ADDR32(addr) \ + .cfi_escape DW_CFA_def_cfa_expression, \ + 0x06, /* 6 byte expression follows */ \ + DW_OP_CONST4U(addr), \ + DW_OP_deref; + // 10 bytes LEB128 is enough to encode 64bit integer and we shouldn't // really need anything longer than that. #define COUNT_LEB128(lebs...) \ @@ -53,7 +77,17 @@ reg, \ (COUNT_LEB128(lebs) + 1), /* 1 byte + LEB128 bytes */ \ (0x70 + base), /* DW_OP_breg0 + base */ \ - lebs + lebs; + +#define REG_AT_REG_OFFSET_DEREF_OFFSET(reg, base, offset1, offset2) \ + .cfi_escape 0x10, /* DW_CFA_expression */ \ + reg, \ + (2 + 1 + 2), \ + (0x70 + base), /* DW_OP_breg0 + base */ \ + offset1, \ + DW_OP_deref, \ + DW_OP_plus_uconst(offset2); + #if defined(__i386__) .text @@ -62,6 +96,77 @@ .set alt_stack_nesting_level, preload_thread_locals + 12 .set saved_flags, preload_thread_locals + 16 + +// Needs to match assembly_templates.py. Measured from the end of the call +// instruction to before the jmp instruction. +#define EXTENDED_JUMP_STUB_REGION_SIZE 23 + +/* + * The syscallbuf extended jump patch has the form: + * + * call + * pop %esp + * int $0x80 + * + * jmpl + * + * + * The macros help read the relative jump address from memory and convert + * it to an absolute address.
The idea is that during execution in the + * syscallbuf, our backtrace will look like: + * + * < syscallbuf C code > + * _syscall_hook_trampoline + * < function that contains the patched syscall > + * + * except while executing in the actual syscall hook stubs, where the backtrace + * will look like: + * + * _syscallbuf_hook_**** + * < function that contains the patched syscall > + * + * There are two things to note here: + * 1. We always omit the extended jump patch from the backtrace, because + * we (currently) have no way to give GDB any unwind info for it. + * 2. While in the syscallbuf C code, we omit the syscallbuf_hook_* functions + * from the backtrace. This is because we may switch out that frame for + * the bail path, which would confuse GDB when attempting to leave a frame. + * This setup is a bit weird, but it's not terrible, because we are essentially + * modeling a set of leaf frames that tail call to each other, which is a + * supported mode of operation and should thus not confuse GDB too much. + */ +#define REL_JMP_FROM_JMP_INSTR(offset2) \ + /* Move us to after the jmp instruction */ \ + DW_OP_plus_uconst(offset2), \ + DW_OP_dup, \ + DW_OP_lit(4), \ + DW_OP_minus, \ + DW_OP_deref, \ + DW_OP_plus #define RIP_IS_AT_REL_JMP_OFFSET(base, offset1, offset2) \ +.cfi_escape DW_CFA_val_expression, \ + 0x08 /* %eip */, \ + 0xa, /* 10 byte expression follows */ \ + /* Compute the return address that's on the stack */ \ + (0x70 + base), /* DW_OP_breg0 + base */ \ + offset1, \ + DW_OP_deref, \ + REL_JMP_FROM_JMP_INSTR(offset2); #define RIP_IS_AT_ADDR_REL_JMP_OFFSET(addr, offset2) \ +.cfi_escape DW_CFA_val_expression, \ + 0x08 /* %eip */, \ + 0xd, /* 13 byte expression follows */ \ + DW_OP_CONST4U(addr), \ + DW_OP_deref, \ + REL_JMP_FROM_JMP_INSTR(offset2); #define RIP_IS_AT_CFA_REL_JMP_OFFSET(offset1, offset2) \ +.cfi_escape DW_CFA_val_expression, \ + 0x08 /* %eip */, \ + 0xa, /* 10 byte expression follows */ \ + DW_OP_plus_uconst(offset1), \ + DW_OP_deref, \ + REL_JMP_FROM_JMP_INSTR(offset2); + .p2align 4 _syscallbuf_code_start: @@ -76,6 +181,8 @@ _syscallbuf_final_exit_instruction: _syscall_hook_trampoline: .cfi_startproc + .cfi_offset %esp, 4; + RIP_IS_AT_CFA_REL_JMP_OFFSET(0x0, 0x5) /* Build a |struct syscall_info| by pushing all the syscall * args and the number onto the stack. */ /* struct syscall_info info; */ @@ -129,7 +236,7 @@ _syscall_hook_trampoline: pushl %ebp call syscall_hook - /* $eax = vsyscall_hook(&info); */ + /* $eax = syscall_hook(&info); */ movdqa 0x10(%esp),%xmm0 movdqa 0x20(%esp),%xmm1 @@ -140,6 +247,13 @@ _syscall_hook_trampoline: movdqa 0x70(%esp),%xmm6 movdqa 0x80(%esp),%xmm7 + test %eax,%eax + jnz 2f + + // Switch the syscallbuf hook frame to the bail path + movl $_syscall_hook_bail+1, 0x1c(%ebp) + +2: mov $saved_flags, %esp popfw /* From here on, non-application flag changes are not allowed */ @@ -148,9 +262,7 @@ _syscall_hook_trampoline: mov %ebp, %esp .cfi_def_cfa_register %esp - /* $eax is now the syscall return value. Erase |info.no| from the - * stack so that we can restore the other registers we saved.
*/ - lea 4(%esp),%esp + pop %eax .cfi_adjust_cfa_offset -4 /* Contract of __kernel_vsyscall() and real syscalls is that even @@ -189,18 +301,20 @@ name: \ .cfi_startproc; \ .cfi_def_cfa_offset 0; \ - .cfi_offset %eip, 0; \ - .cfi_offset %esp, 4 + RIP_IS_AT_REL_JMP_OFFSET(0x04 /* esp */, 0x00, EXTENDED_JUMP_STUB_REGION_SIZE + 5) \ + .cfi_offset %esp, 4; \ + addl $EXTENDED_JUMP_STUB_REGION_SIZE, (%esp); \ + RIP_IS_AT_REL_JMP_OFFSET(0x04 /* esp */, 0x00, 5) #define SYSCALLHOOK_END(name) \ pop (stub_scratch_1); \ .cfi_adjust_cfa_offset -4; \ + RIP_IS_AT_ADDR_REL_JMP_OFFSET(stub_scratch_1, 0x05) \ pop %esp; \ .cfi_same_value %esp; \ - REG_AT_ADDR32(0x08 /* %eip */, stub_scratch_1); \ jmp _syscallbuf_final_exit_instruction; \ .cfi_endproc; \ - .size name, .-name + .size name, .-name; SYSCALLHOOK_START(_syscall_hook_trampoline_3d_01_f0_ff_ff) call _syscall_hook_trampoline @@ -211,6 +325,32 @@ SYSCALLHOOK_START(_syscall_hook_trampoline_90_90_90) call _syscall_hook_trampoline SYSCALLHOOK_END(_syscall_hook_trampoline_90_90_90) +.global _syscall_hook_bail +.hidden _syscall_hook_bail +.type _syscall_hook_bail, @function +_syscall_hook_bail: +.cfi_startproc + .cfi_def_cfa_offset 0; + .cfi_offset %esp, 4 + RIP_IS_AT_REL_JMP_OFFSET(0x04 /* esp */, 0x00, 0x05) + nop + // We target our return here rather than the first instruction in the function, + // because gdb likes to back up a byte for function identification and gets confused. + pop (stub_scratch_1); + .cfi_def_cfa_offset -4; + RIP_IS_AT_ADDR_REL_JMP_OFFSET(stub_scratch_1, 0x05) + // GDB doesn't like stack adjustments in increments of 2, so use pushfl, rather than pushfw + pushfl + .cfi_def_cfa_offset 0; + subl $EXTENDED_JUMP_STUB_REGION_SIZE, (stub_scratch_1) + RIP_IS_AT_ADDR_REL_JMP_OFFSET(stub_scratch_1, EXTENDED_JUMP_STUB_REGION_SIZE + 0x05) + popfl + .cfi_def_cfa_offset -4; + jmp *(stub_scratch_1) +nop +ret +.cfi_endproc + /* Declare gcc get_pc thunks here so they're in a known region of code */ .global _get_pc_thunks_start @@ -320,6 +460,23 @@ _syscall_hook_trampoline: movdqa 0x60(%rsp),%xmm6 movdqa 0x70(%rsp),%xmm7 + test %rax,%rax + jnz 2f + + // Switch the syscallbuf hook frame to the bail path + lea (_syscall_hook_bail+1)(%rip), %rdi + movq %rdi, 0x48(%rbx) + + // Canonicalize registers that are affected by syscall entry. + // We sometimes don't record a deferred event until we've already + // hit the bail path syscall instruction, but want to pretend it + // happened just before. By setting the registers here, replay will + // see that event point with the same register set we had during + // record. + movq $0x246, %r11 + movq $-1, %rcx + +2: mov $saved_flags, %rsp popfw /* From here on, non-application flag changes are not allowed */ @@ -327,9 +484,9 @@ _syscall_hook_trampoline: mov %rbx,%rsp .cfi_def_cfa_register %rsp - /* On entrance, we pushed the %rax, the syscall number. But we don't want to |pop %rax|, as that will overwrite our return value. Skip over it. */ - pop %rdi + /* This restores either the original value of rax (if we're going out via the bail path) or the syscall result (in the regular path).
*/ + pop %rax .cfi_adjust_cfa_offset -8 /* We don't really *need* to restore these, since the kernel could have @@ -406,10 +563,14 @@ _syscallbuf_final_exit_instruction: 0x77, offset; /* DW_OP_breg7, offset */ #define RIP_IS_DEREF_RSP(offset) REG_AT_REG_OFFSET(0x10 /* %rip */, 7, offset) +#define RIP_IS_DEREF_DEREF_RSP(offset1, offset2) \ + REG_AT_REG_OFFSET_DEREF_OFFSET(0x10 /* %rip */, 7, offset1, offset2) /** * On syscallhook entry, the stack has been switched to the end of per-task * scratch space, then the old RSP and the return address have been pushed. + * The CFA of our syscallbuf frame is the value that rsp was at the syscall + * instruction we're patching (i.e. on the unswitched stack). */ #define SYSCALLHOOK_START(name) \ .global name; \ @@ -419,17 +580,28 @@ name: \ .cfi_startproc; \ CFA_AT_RSP_OFFSET(8) \ RSP_IS_CFA \ - RIP_IS_DEREF_RSP(0) + RIP_IS_DEREF_DEREF_RSP(0, 0x25) +/* We skip returning into the extended jump patch, because we + don't have a CFI frame for it and this makes GDB slightly + happier. */ #define SYSCALLHOOK_END(name) \ - pop (stub_scratch_1); \ + popq (stub_scratch_1); \ CFA_AT_RSP_OFFSET(0) \ - REG_AT_ADDR32(0x10 /* %rip */, stub_scratch_1); \ - pop %rsp; \ + REG_AT_ADDR32_PLUS_OFFSET(0x10 /* %rip */, stub_scratch_1, 0x25) \ + popq %rsp; \ + .cfi_def_cfa %rsp, 0; \ + xchgq %rsp, (stub_scratch_1); \ + CFA_AT_ADDR32(stub_scratch_1) \ + RIP_IS_DEREF_RSP(0x25); \ + mov 0x25(%rsp), %rsp; \ + RIP_IS_DEREF_RSP(0); \ .cfi_def_cfa %rsp, 0; \ + xchgq %rsp, (stub_scratch_1); \ + REG_AT_ADDR32(0x10 /* %rip */, stub_scratch_1); \ jmp _syscallbuf_final_exit_instruction; \ .cfi_endproc; \ - .size name, .-name + .size name, .-name; /* See note above on what __morestack is for */ .global __morestack @@ -539,7 +711,6 @@ SYSCALLHOOK_START(_syscall_hook_trampoline_c3_nop) pop (stub_scratch_1) .cfi_adjust_cfa_offset -8 jmp _syscallbuf_final_exit_instruction - .cfi_endproc .size _syscall_hook_trampoline_c3_nop, .-_syscall_hook_trampoline_c3_nop @@ -609,6 +780,22 @@ SYSCALLHOOK_START(_syscall_hook_trampoline_b8_ca_00_00_00) callq __morestack SYSCALLHOOK_END(_syscall_hook_trampoline_b8_ca_00_00_00) +.global _syscall_hook_bail +.hidden _syscall_hook_bail +.type _syscall_hook_bail, @function +_syscall_hook_bail: +.cfi_startproc +CFA_AT_RSP_OFFSET(8) +RSP_IS_CFA +RIP_IS_DEREF_DEREF_RSP(0, 0x25) +nop +// We target our return here rather than the first instruction in the function, +// because gdb likes to back up a byte for function identification and gets confused. 
+nop +retq +.cfi_endproc + + #elif defined(__aarch64__) .text @@ -762,6 +949,14 @@ _syscall_hook_trampoline: bl syscall_hook + cbnz x0, 1f + + // If the function requested the bail path, rewrite the return address + ldr x0, [sp, 688] + add x0, x0, 8 + str x0, [sp, 688] + +1: movz x29, #:abs_g1:alt_stack_nesting_level // assume 32bit address movk x29, #:abs_g0_nc:alt_stack_nesting_level ldr w30, [x29] @@ -773,7 +968,7 @@ _syscall_hook_trampoline: // x30 should not use same_value since it's value is changed // by the function call instruction .cfi_restore x30 - ldr x8, [sp, 48] + LDPX_STACK(8, 0, 48) .cfi_same_value x8 LDPX_STACK(1, 2, 64) LDPX_STACK(3, 4, 80) diff --git a/src/preload/syscallbuf.c b/src/preload/syscallbuf.c index d162769aff0..400023b987c 100644 --- a/src/preload/syscallbuf.c +++ b/src/preload/syscallbuf.c @@ -320,26 +320,6 @@ static int privileged_traced_syscall(int syscallno, long a0, long a1, long a2, #define privileged_traced_syscall1(no, a0) privileged_traced_syscall2(no, a0, 0) #define privileged_traced_syscall0(no) privileged_traced_syscall1(no, 0) -/** - * Make a raw traced syscall using the params in |call|. - */ -static long traced_raw_syscall(struct syscall_info* call) { - if (call->no == SYS_rrcall_rdtsc) { - // Handle this specially because the rrcall writes to a memory out-param - // and we need to actually modify the outgoing AX/DX registers instead. - uint32_t tsc[2]; - privileged_traced_syscall1(SYS_rrcall_rdtsc, tsc); - // Overwrite RDX (syscall arg 3) with our TSC value. - call->args[2] = tsc[1]; - return tsc[0]; - } - /* FIXME: pass |call| to avoid pushing these on the stack - * again. */ - return _raw_syscall(call->no, call->args[0], call->args[1], call->args[2], - call->args[3], call->args[4], call->args[5], - RR_PAGE_SYSCALL_TRACED, 0, 0); -} - /** * Make a raw traced syscall using the params in |call|, privileged. 
*/ @@ -732,7 +712,7 @@ static void __attribute__((constructor)) init_process(void) { 5, { 0x3d, 0x01, 0xf0, 0xff, 0xff }, (uintptr_t)_syscall_hook_trampoline_3d_01_f0_ff_ff }, - /* Our vdso syscall patch has 'int 80' followed by onp; nop; nop */ + /* Our vdso syscall patch has 'int 80' followed by nop; nop; nop */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0x90, 0x90, 0x90 }, @@ -741,25 +721,28 @@ static void __attribute__((constructor)) init_process(void) { extern char _get_pc_thunks_start; extern char _get_pc_thunks_end; #elif defined(__x86_64__) - extern RR_HIDDEN void _syscall_hook_trampoline_48_3d_01_f0_ff_ff(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_3d_00_f0_ff_ff(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_8b_3c_24(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_89_45_f8(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_89_c3(void); - extern RR_HIDDEN void _syscall_hook_trampoline_5a_5e_c3(void); - extern RR_HIDDEN void _syscall_hook_trampoline_89_c2_f7_da(void); - extern RR_HIDDEN void _syscall_hook_trampoline_90_90_90(void); - extern RR_HIDDEN void _syscall_hook_trampoline_ba_01_00_00_00(void); - extern RR_HIDDEN void _syscall_hook_trampoline_89_c1_31_d2(void); - extern RR_HIDDEN void _syscall_hook_trampoline_c3_nop(void); - extern RR_HIDDEN void _syscall_hook_trampoline_40_80_f6_81(void); - extern RR_HIDDEN void _syscall_hook_trampoline_49_89_ca(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_89_c1(void); - extern RR_HIDDEN void _syscall_hook_trampoline_48_c1_e2_20(void); - extern RR_HIDDEN void _syscall_hook_trampoline_4c_89_f7(void); - extern RR_HIDDEN void _syscall_hook_trampoline_4c_89_ff(void); - extern RR_HIDDEN void _syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff(void); - extern RR_HIDDEN void _syscall_hook_trampoline_b8_ca_00_00_00(void); +#define DECLARE_SYSCALLHOOK(name) \ + extern RR_HIDDEN void _syscall_hook_trampoline_ ## name(void); + + DECLARE_SYSCALLHOOK(48_3d_01_f0_ff_ff); + DECLARE_SYSCALLHOOK(48_3d_00_f0_ff_ff); + DECLARE_SYSCALLHOOK(48_8b_3c_24); + DECLARE_SYSCALLHOOK(48_89_45_f8); + DECLARE_SYSCALLHOOK(48_89_c3); + DECLARE_SYSCALLHOOK(5a_5e_c3); + DECLARE_SYSCALLHOOK(89_c2_f7_da); + DECLARE_SYSCALLHOOK(90_90_90); + DECLARE_SYSCALLHOOK(ba_01_00_00_00); + DECLARE_SYSCALLHOOK(89_c1_31_d2); + DECLARE_SYSCALLHOOK(c3_nop); + DECLARE_SYSCALLHOOK(40_80_f6_81); + DECLARE_SYSCALLHOOK(49_89_ca); + DECLARE_SYSCALLHOOK(48_89_c1); + DECLARE_SYSCALLHOOK(48_c1_e2_20); + DECLARE_SYSCALLHOOK(4c_89_f7); + DECLARE_SYSCALLHOOK(4c_89_ff); + DECLARE_SYSCALLHOOK(49_c7_c1_ff_ff_ff_ff); + DECLARE_SYSCALLHOOK(b8_ca_00_00_00); #define MOV_RDX_VARIANTS \ MOV_RDX_TO_REG(48, d0) \ @@ -780,9 +763,12 @@ static void __attribute__((constructor)) init_process(void) { MOV_RDX_TO_REG(49, d7) #define MOV_RDX_TO_REG(rex, op) \ - extern RR_HIDDEN void _syscall_hook_trampoline_##rex##_89_##op(void); + DECLARE_SYSCALLHOOK(rex##_89_##op) MOV_RDX_VARIANTS +#define HOOK_REFERENCE(name) \ + (uintptr_t)_syscall_hook_trampoline_##name + struct syscall_patch_hook syscall_patch_hooks[] = { /* Many glibc syscall wrappers (e.g. read) have 'syscall' followed * by @@ -790,58 +776,58 @@ static void __attribute__((constructor)) init_process(void) { { 0, 6, { 0x48, 0x3d, 0x01, 0xf0, 0xff, 0xff }, - (uintptr_t)_syscall_hook_trampoline_48_3d_01_f0_ff_ff }, + HOOK_REFERENCE(48_3d_01_f0_ff_ff) }, /* Many glibc syscall wrappers (e.g. 
__libc_recv) have 'syscall' * followed by * cmp $-4096,%rax (in glibc-2.18-16.fc20.x86_64) */ { 0, 6, { 0x48, 0x3d, 0x00, 0xf0, 0xff, 0xff }, - (uintptr_t)_syscall_hook_trampoline_48_3d_00_f0_ff_ff }, + HOOK_REFERENCE(48_3d_00_f0_ff_ff) }, /* Many glibc syscall wrappers (e.g. read) have 'syscall' followed * by * mov (%rsp),%rdi (in glibc-2.18-16.fc20.x86_64) */ { 0, 4, { 0x48, 0x8b, 0x3c, 0x24 }, - (uintptr_t)_syscall_hook_trampoline_48_8b_3c_24 }, + HOOK_REFERENCE(48_8b_3c_24) }, /* Some syscall wrappers have 'syscall' followed * by * mov %rax,-8(%rbp) */ { 0, 4, { 0x48, 0x89, 0x45, 0xf8 }, - (uintptr_t)_syscall_hook_trampoline_48_89_45_f8 }, + HOOK_REFERENCE(48_89_45_f8) }, /* Some syscall wrappers (e.g. read) have 'syscall' followed * by * mov %rax,%rbx */ { 0, 3, { 0x48, 0x89, 0xc3 }, - (uintptr_t)_syscall_hook_trampoline_48_89_c3 }, + HOOK_REFERENCE(48_89_c3) }, /* Some RDTSC instructions are followed by 'mov %rax,%rcx'. */ { 0, 3, { 0x48, 0x89, 0xc1 }, - (uintptr_t)_syscall_hook_trampoline_48_89_c1 }, + HOOK_REFERENCE(48_89_c1) }, /* __lll_unlock_wake has 'syscall' followed by * pop %rdx; pop %rsi; ret */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0x5a, 0x5e, 0xc3 }, - (uintptr_t)_syscall_hook_trampoline_5a_5e_c3 }, + HOOK_REFERENCE(5a_5e_c3) }, /* posix_fadvise64 has 'syscall' followed by * mov %eax,%edx; neg %edx (in glibc-2.22-11.fc23.x86_64) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 4, { 0x89, 0xc2, 0xf7, 0xda }, - (uintptr_t)_syscall_hook_trampoline_89_c2_f7_da }, + HOOK_REFERENCE(89_c2_f7_da) }, /* Our VDSO vsyscall patches have 'syscall' followed by "nop; nop; nop" */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0x90, 0x90, 0x90 }, - (uintptr_t)_syscall_hook_trampoline_90_90_90 }, + HOOK_REFERENCE(90_90_90) }, /* glibc-2.22-17.fc23.x86_64 has 'syscall' followed by 'mov $1,%rdx' * in * pthread_barrier_wait. 
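The table being converted to HOOK_REFERENCE here is what rr's monkeypatcher scans when deciding whether a syscall site is patchable: each entry names the instruction bytes expected next to the syscall and the trampoline to route it through. A minimal sketch of that matching step, using a simplified stand-in struct (the real struct syscall_patch_hook lives in preload_interface.h and the matcher in Monkeypatcher.cc):

```
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Simplified stand-in for struct syscall_patch_hook; field names are
 * illustrative. */
struct hook {
  uint8_t flags;      /* e.g. PATCH_IS_MULTIPLE_INSTRUCTIONS */
  uint8_t match_len;  /* number of instruction bytes to compare */
  uint8_t bytes[14];  /* bytes adjacent to the syscall instruction */
  uintptr_t hook_address;
};

/* Return the first entry whose bytes match the code next to the syscall
 * instruction (following it, or preceding it for entries flagged
 * PATCH_SYSCALL_INSTRUCTION_IS_LAST), or NULL if the site is unpatchable. */
static const struct hook* match_hook(const struct hook* table, size_t n,
                                     const uint8_t* code) {
  for (size_t i = 0; i < n; i++) {
    if (memcmp(code, table[i].bytes, table[i].match_len) == 0) {
      return &table[i];
    }
  }
  return NULL;
}
```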
@@ -849,54 +835,54 @@ static void __attribute__((constructor)) init_process(void) { { 0, 5, { 0xba, 0x01, 0x00, 0x00, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_ba_01_00_00_00 }, + HOOK_REFERENCE(ba_01_00_00_00) }, /* pthread_sigmask has 'syscall' followed by 'mov %eax,%ecx; xor %edx,%edx' */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 4, { 0x89, 0xc1, 0x31, 0xd2 }, - (uintptr_t)_syscall_hook_trampoline_89_c1_31_d2 }, + HOOK_REFERENCE(89_c1_31_d2) }, /* getpid has 'syscall' followed by 'retq; nopl 0x0(%rax,%rax,1) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 9, { 0xc3, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* liblsan internal_close has 'syscall' followed by 'retq; nopl 0x0(%rax,%rax,1) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 6, { 0xc3, 0x0f, 0x1f, 0x44, 0x00, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* glibc-2.29-15.fc30.x86_64 getpid has 'syscall' followed by 'retq; nopl 0x0(%rax) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 5, { 0xc3, 0x0f, 0x1f, 0x40, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* liblsan internal_open has 'syscall' followed by 'retq; nopl (%rax) */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 4, { 0xc3, 0x0f, 0x1f, 0x00 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* liblsan internal_dup2 has 'syscall' followed by 'retq; xchg %ax,%ax */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0xc3, 0x66, 0x90 }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* Go runtime has 'syscall' followed by 'retq; int3; int3 */ { PATCH_IS_MULTIPLE_INSTRUCTIONS, 3, { 0xc3, 0xcc, 0xcc }, - (uintptr_t)_syscall_hook_trampoline_c3_nop }, + HOOK_REFERENCE(c3_nop) }, /* glibc-2.31 on Ubuntu 20.04 has 'xor $0x81, %sil' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 4, { 0x40, 0x80, 0xf6, 0x81 }, - (uintptr_t)_syscall_hook_trampoline_40_80_f6_81 }, + HOOK_REFERENCE(40_80_f6_81) }, /* DynamoRIO has 'mov r10, rcx' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 3, { 0x49, 0x89, 0xca }, - (uintptr_t)_syscall_hook_trampoline_49_89_ca }, + HOOK_REFERENCE(49_89_ca) }, /* Some applications have RDTSC followed by 'mov %rdx,any-reg' */ #undef MOV_RDX_TO_REG #define MOV_RDX_TO_REG(rex, op) \ @@ -904,34 +890,34 @@ static void __attribute__((constructor)) init_process(void) { 0, \ 3, \ { 0x##rex, 0x89, 0x##op }, \ - (uintptr_t)_syscall_hook_trampoline_##rex##_89_##op }, + HOOK_REFERENCE(rex##_89_##op) }, MOV_RDX_VARIANTS /* Some application has RDTSC followed by 'shl $32,%rdx' */ { 0, 4, { 0x48, 0xc1, 0xe2, 0x20 }, - (uintptr_t)_syscall_hook_trampoline_48_c1_e2_20 }, + HOOK_REFERENCE(48_c1_e2_20) }, /* Some application has 'mov %r14,%rdi' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 3, { 0x4c, 0x89, 0xf7 }, - (uintptr_t)_syscall_hook_trampoline_4c_89_f7 }, + HOOK_REFERENCE(4c_89_f7) }, /* Some application has 'mov %r15,%rdi' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 3, { 0x4c, 0x89, 0xff }, - (uintptr_t)_syscall_hook_trampoline_4c_89_ff }, + HOOK_REFERENCE(4c_89_ff) }, /* Some application has 'mov $0xffffffff,%r9' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 7, { 0x49, 0xc7, 0xc1, 0xff, 0xff, 0xff, 0xff }, - (uintptr_t)_syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff }, + HOOK_REFERENCE(49_c7_c1_ff_ff_ff_ff) }, /* Some application has 'mov $0xca,%eax' followed by 'syscall' */ { PATCH_SYSCALL_INSTRUCTION_IS_LAST, 5, { 0xb8, 0xca, 0x00, 0x00, 0x00 
}, - (uintptr_t)_syscall_hook_trampoline_b8_ca_00_00_00 }, + HOOK_REFERENCE(b8_ca_00_00_00) }, }; #elif defined(__aarch64__) extern RR_HIDDEN void _syscall_hook_trampoline_raw(void); @@ -1308,7 +1294,6 @@ static long commit_raw_syscall(int syscallno, void* record_end, long ret) { void* record_start = buffer_last(); struct syscallbuf_record* rec = record_start; struct syscallbuf_hdr* hdr = buffer_hdr(); - int call_breakpoint = 0; assert(record_end >= record_start); rec->size = record_end - record_start; @@ -1339,21 +1324,20 @@ static long commit_raw_syscall(int syscallno, void* record_end, long ret) { fatal("Record syscall number mismatch"); } - if (hdr->abort_commit) { + rec->ret = ret; + if (rec->aborted) { /* We were descheduled in the middle of a may-block * syscall, and it was recorded as a normal entry/exit * pair. So don't record the syscall in the buffer or * replay will go haywire. */ hdr->abort_commit = 0; hdr->failed_during_preparation = 0; - /* Clear the return value that rr puts there during replay */ - rec->ret = 0; + rec->size = sizeof(struct syscallbuf_record); + hdr->num_rec_bytes += sizeof(struct syscallbuf_record); } else { - rec->ret = ret; // Finish 'rec' first before updating num_rec_bytes, since // rr might read the record anytime after this update. hdr->num_rec_bytes += stored_record_size(rec->size); - call_breakpoint = 1; } if (rec->desched) { @@ -1367,23 +1351,38 @@ static long commit_raw_syscall(int syscallno, void* record_end, long ret) { buffer_hdr()->locked &= ~SYSCALLBUF_LOCKED_TRACEE; - if (call_breakpoint) { - /* Call the breakpoint function corresponding to the record we just - * committed. This function just returns, but during replay it gives rr - * a chance to set a breakpoint for when a specific syscallbuf record - * has been processed. - */ - do_breakpoint(hdr->num_rec_bytes/8); - /* Force a tick now. - * During replay, if an async event (SIGKILL) happens between committing the syscall - * above and before this forced tick, we can detect that because the number of ticks - * recorded for the SIGKILL will be less than or equal to the number of ticks reported - * when the replay hits do_breakpoint. - */ - force_tick(); - } + /* Call the breakpoint function corresponding to the record we just + * committed. This function just returns, but during replay it gives rr + * a chance to set a breakpoint for when a specific syscallbuf record + * has been processed. + */ + do_breakpoint(hdr->num_rec_bytes/8); + /* Force a tick now. + * During replay, if an async event (SIGKILL) happens between committing the syscall + * above and before this forced tick, we can detect that because the number of ticks + * recorded for the SIGKILL will be less than or equal to the number of ticks reported + * when the replay hits do_breakpoint. 
+ */ + force_tick(); + + return !rec->aborted; +} - return ret; +void set_return_value(struct syscall_info *call, long ret) +{ +#ifdef __aarch64__ + call->args[0] = ret; +#else + call->no = ret; +#endif +} + +long commit_main_syscall_wontblock(struct syscall_info *call, int syscallno, void* record_end, long ret) +{ + int ok = commit_raw_syscall(syscallno, record_end, ret); + assert(ok); + set_return_value(call, ret); + return 1; } /** @@ -1565,11 +1564,15 @@ static long sys_generic_nonblocking(struct syscall_info* call) { long ret; if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall6(call->no, call->args[0], call->args[1], call->args[2], call->args[3], call->args[4], call->args[5]); - return commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } /** @@ -1582,11 +1585,15 @@ static long sys_generic_nonblocking_fd(struct syscall_info* call) { long ret; if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall6(call->no, fd, call->args[1], call->args[2], call->args[3], call->args[4], call->args[5]); - return commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } /** @@ -1603,7 +1610,10 @@ static long privileged_sys_generic_nonblocking_fd(const struct syscall_info* cal } ret = privileged_untraced_syscall6(call->no, fd, call->args[1], call->args[2], call->args[3], call->args[4], call->args[5]); - return commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } + return ret; } static long sys_clock_gettime(struct syscall_info* call) { @@ -1622,7 +1632,7 @@ static long sys_clock_gettime(struct syscall_info* call) { ptr += sizeof(*tp2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall2(syscallno, clk_id, tp2); if (tp && ret >= 0 && !buffer_hdr()->failed_during_preparation) { @@ -1630,7 +1640,11 @@ static long sys_clock_gettime(struct syscall_info* call) { our library. */ *tp = *tp2; } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #ifdef SYS_clock_gettime64 @@ -1651,7 +1665,7 @@ static long sys_clock_gettime64(struct syscall_info* call) { ptr += sizeof(*tp2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall2(syscallno, clk_id, tp2); if (tp && ret >= 0 && !buffer_hdr()->failed_during_preparation) { @@ -1659,7 +1673,11 @@ static long sys_clock_gettime64(struct syscall_info* call) { our library. */ *tp = *tp2; } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -1674,7 +1692,15 @@ static long sys_creat(struct syscall_info* call) { * O_CREAT|O_WRONLY|O_TRUNC. 
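All of the sys_* rewrites below follow one convention, which the two generic wrappers above establish: returning 0 means "could not (or should not) buffer; unwind and re-execute at the extended jump patch's syscall instruction", while returning nonzero means the syscall was buffered and its result already stored into the syscall_info via set_return_value(). A condensed illustration (sys_example is hypothetical; the helpers are the real ones used in this file):

```
/* Hypothetical wrapper condensing the pattern used by the real sys_*
 * functions below. */
static long sys_example(struct syscall_info* call) {
  void* ptr = prep_syscall();
  long ret;
  if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
    return 0; /* bail: redo at the patch's own syscall instruction */
  }
  ret = untraced_syscall1(call->no, call->args[0]);
  if (!commit_raw_syscall(call->no, ptr, ret)) {
    return 0; /* a desched aborted the record: bail as well */
  }
  set_return_value(call, ret); /* the result travels back via |call| */
  return 1;
}
```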
*/ struct syscall_info open_call = { SYS_open, { (long)pathname, O_CREAT | O_TRUNC | O_WRONLY, mode } }; - return sys_open(&open_call); + long ret = sys_open(&open_call); + if (!ret) + return ret; +#ifdef __aarch64__ + set_return_value(call, open_call.args[0]); +#else + set_return_value(call, open_call.no); +#endif + return ret; } #endif @@ -1692,10 +1718,14 @@ static int sys_fcntl64_no_outparams(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, fd, cmd, arg); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static int sys_fcntl64_own_ex(struct syscall_info* call) { @@ -1716,7 +1746,7 @@ static int sys_fcntl64_own_ex(struct syscall_info* call) { ptr += sizeof(*owner2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } if (owner2) { memcpy_input_parameter(owner2, owner, sizeof(*owner2)); @@ -1725,13 +1755,17 @@ static int sys_fcntl64_own_ex(struct syscall_info* call) { if (owner2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(owner, owner2, sizeof(*owner)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static int sys_fcntl64_setlk64(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Releasing a lock could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = RR_FCNTL_SYSCALL; @@ -1750,7 +1784,7 @@ static int sys_fcntl64_setlk64(struct syscall_info* call) { ptr += sizeof(*lock2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } if (lock2) { memcpy_input_parameter(lock2, lock, sizeof(*lock2)); @@ -1759,13 +1793,17 @@ static int sys_fcntl64_setlk64(struct syscall_info* call) { if (lock2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(lock, lock2, sizeof(*lock)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static int sys_fcntl64_setlkw64(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Releasing a lock could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = RR_FCNTL_SYSCALL; @@ -1779,10 +1817,14 @@ static int sys_fcntl64_setlkw64(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, fd, cmd, lock); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #if defined(SYS_fcntl64) @@ -1798,7 +1840,7 @@ static long sys_fcntl(struct syscall_info* call) if (call->args[2] == O_DIRECT) { /* This needs to go to rr so we can disable syscall buffering on this fd. */ - return traced_raw_syscall(call); + return 0; } /* Falls through. 
*/ case F_DUPFD: @@ -1835,7 +1877,7 @@ static long sys_fcntl(struct syscall_info* call) return sys_fcntl64_setlkw64(call); default: - return traced_raw_syscall(call); + return 0; } } @@ -1866,12 +1908,16 @@ static long sys_flistxattr(struct syscall_info* call) { ptr += size; } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, fd, buf2, size); ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, buf, buf2); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_safe_nonblocking_ioctl(struct syscall_info* call) { @@ -1882,10 +1928,14 @@ static long sys_safe_nonblocking_ioctl(struct syscall_info* call) { long ret; if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, fd, call->args[1], call->args[2]); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_ioctl_fionread(struct syscall_info* call) { @@ -1902,13 +1952,17 @@ static long sys_ioctl_fionread(struct syscall_info* call) { ptr += sizeof(*value); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, fd, FIONREAD, buf); if (buf && ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(value, buf, sizeof(*value)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_ioctl(struct syscall_info* call) { @@ -1920,7 +1974,7 @@ static long sys_ioctl(struct syscall_info* call) { case FIONREAD: return sys_ioctl_fionread(call); default: - return traced_raw_syscall(call); + return 0; } } @@ -1934,7 +1988,7 @@ static long sys_futex(struct syscall_info* call) { in which case we're at most doubling the overhead of the combined wait + wakeup. */ if (globals.in_chaos) { - return traced_raw_syscall(call); + return 0; } int op = call->args[1]; @@ -1962,7 +2016,7 @@ static long sys_futex(struct syscall_info* call) { * special processing in the tracer process (in addition to * not being worth doing for perf reasons). */ default: - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_futex; @@ -1993,7 +2047,7 @@ static long sys_futex(struct syscall_info* call) { /* See above; it's not worth buffering may-block futex * calls. */ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall6(syscallno, uaddr, op, val, timeout, uaddr2, val3); @@ -2007,7 +2061,7 @@ static long sys_futex(struct syscall_info* call) { if (saved_uaddr2) { copy_futex_int(saved_uaddr2, uaddr2); } - return commit_raw_syscall(syscallno, ptr, ret); + return commit_main_syscall_wontblock(call, syscallno, ptr, ret); } static long sys_getrandom(struct syscall_info* call) { @@ -2027,12 +2081,16 @@ static long sys_getrandom(struct syscall_info* call) { ptr += buf_len; } if (!start_commit_buffered_syscall(call->no, ptr, (flags & GRND_NONBLOCK) ? 
WONT_BLOCK : MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(call->no, buf2, buf_len, flags); ptr = copy_output_buffer(ret, ptr, buf, buf2); - return commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_generic_getdents(struct syscall_info* call) { @@ -2049,12 +2107,16 @@ static long sys_generic_getdents(struct syscall_info* call) { ptr += count; } if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(call->no, fd, buf2, count); ptr = copy_output_buffer(ret, ptr, buf, buf2); - return commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #if defined(SYS_getdents) @@ -2091,7 +2153,7 @@ static long sys_gettimeofday(struct syscall_info* call) { ptr += sizeof(*tzp2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall2(syscallno, tp2, tzp2); if (ret >= 0 && !buffer_hdr()->failed_during_preparation) { @@ -2106,7 +2168,11 @@ static long sys_gettimeofday(struct syscall_info* call) { *tzp = *tzp2; } } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_generic_getxattr(struct syscall_info* call) { @@ -2124,12 +2190,16 @@ static long sys_generic_getxattr(struct syscall_info* call) { ptr += size; } if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall4(call->no, path, name, value2, size); ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, value, value2); - return commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_getxattr(struct syscall_info* call) { @@ -2155,12 +2225,16 @@ static long sys_fgetxattr(struct syscall_info* call) { ptr += size; } if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall4(call->no, fd, name, value2, size); ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, value, value2); - return commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_generic_listxattr(struct syscall_info* call) { @@ -2177,12 +2251,16 @@ static long sys_generic_listxattr(struct syscall_info* call) { ptr += size; } if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(call->no, path, buf2, size); ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, buf, buf2); - return commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_listxattr(struct syscall_info* call) { @@ -2213,7 +2291,7 @@ static long sys__llseek(struct syscall_info* call) { ptr += sizeof(*result2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } if (result2) { @@ -2224,7 +2302,11 @@ static long sys__llseek(struct syscall_info* call) { if (result2) { *result = 
*result2; } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -2258,7 +2340,7 @@ static long sys_madvise(struct syscall_info* call) { advice = -1; break; default: - return traced_raw_syscall(call); + return 0; } ptr = prep_syscall(); @@ -2266,14 +2348,18 @@ static long sys_madvise(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } /* Ensure this syscall happens during replay. In particular MADV_DONTNEED * must be executed. */ ret = untraced_replayed_syscall3(syscallno, addr, length, advice); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_mprotect(struct syscall_info* call) { @@ -2288,7 +2374,7 @@ static long sys_mprotect(struct syscall_info* call) { if ((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) || !buffer_hdr() || buffer_hdr()->mprotect_record_count >= MPROTECT_RECORD_COUNT) { - return traced_raw_syscall(call); + return 0; } ptr = prep_syscall(); @@ -2296,7 +2382,7 @@ static long sys_mprotect(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } mrec = &globals.mprotect_records[buffer_hdr()->mprotect_record_count++]; @@ -2310,7 +2396,11 @@ static long sys_mprotect(struct syscall_info* call) { } buffer_hdr()->mprotect_record_count_completed++; - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static int supported_open(const char* file_name, int flags) { @@ -2345,7 +2435,7 @@ static int check_file_open_ok(struct syscall_info* call, int ret, struct check_o from doing anything, so there is nothing for us to do here and we shouldn't try to interpret the "syscall result". */ if (state.did_fail_during_preparation || ret < 0) { - return ret; + return 0; } char buf[100]; sprintf(buf, "/proc/self/fd/%d", ret); @@ -2362,7 +2452,8 @@ static int check_file_open_ok(struct syscall_info* call, int ret, struct check_o if (link_ret >= 0 && link_ret < (ssize_t)sizeof(link)) { link[link_ret] = 0; if (allow_buffered_open(link)) { - return ret; + set_return_value(call, ret); + return 1; } } /* Clean up by closing the file descriptor we should not have opened and @@ -2374,7 +2465,7 @@ static int check_file_open_ok(struct syscall_info* call, int ret, struct check_o syscall, but that's a bit more complicated and we're already on the slow (and hopefully rare) path. 
*/ privileged_traced_syscall1(SYS_close, ret); - return traced_raw_syscall(call); + return 0; } static struct check_open_state capture_check_open_state(void) { @@ -2388,7 +2479,7 @@ static struct check_open_state capture_check_open_state(void) { static long sys_open(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Opening a FIFO could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_open; @@ -2401,25 +2492,27 @@ static long sys_open(struct syscall_info* call) { assert(syscallno == call->no); if (!supported_open(pathname, flags)) { - return traced_raw_syscall(call); + return 0; } ptr = prep_syscall(); if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, pathname, flags, mode); struct check_open_state state = capture_check_open_state(); - ret = commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) + return 0; return check_file_open_ok(call, ret, state); } #endif static long sys_openat(struct syscall_info* call) { + (void)call; if (force_traced_syscall_for_chaos_mode()) { /* Opening a FIFO could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_openat; @@ -2433,17 +2526,18 @@ static long sys_openat(struct syscall_info* call) { assert(syscallno == call->no); if (!supported_open(pathname, flags)) { - return traced_raw_syscall(call); + return 0; } ptr = prep_syscall(); if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall4(syscallno, dirfd, pathname, flags, mode); struct check_open_state state = capture_check_open_state(); - ret = commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) + return 0; return check_file_open_ok(call, ret, state); } @@ -2476,7 +2570,7 @@ static long sys_poll(struct syscall_info* call) { ptr += nfds * sizeof(*fds2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } if (fds2) { memcpy_input_parameter(fds2, fds, nfds * sizeof(*fds2)); @@ -2502,14 +2596,15 @@ static long sys_poll(struct syscall_info* call) { * incorrectly trashing 'fds'. */ local_memcpy(fds, fds2, nfds * sizeof(*fds)); } - commit_raw_syscall(syscallno, ptr, ret); + long ok = commit_raw_syscall(syscallno, ptr, ret); if (ret != 0 || timeout == 0) { - return ret; + set_return_value(call, ret); + return ok; } /* The syscall didn't return anything, and we should have blocked. Just perform a raw syscall now since we're almost certain to block. */ - return traced_raw_syscall(call); + return 0; } #endif @@ -2533,7 +2628,7 @@ static long sys_ppoll(struct syscall_info* call) { ptr += nfds * sizeof(*fds2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } if (fds2) { memcpy_input_parameter(fds2, fds, nfds * sizeof(*fds2)); @@ -2563,11 +2658,12 @@ static long sys_ppoll(struct syscall_info* call) { commit_raw_syscall(syscallno, ptr, ret); if (ret != 0 || (tmo_p && tmo_p->tv_sec == 0 && tmo_p->tv_nsec == 0)) { - return ret; + set_return_value(call, ret); + return 1; } /* The syscall didn't return anything, and we should have blocked. Just perform a raw syscall now since we're almost certain to block. 
*/ - return traced_raw_syscall(call); + return 0; } #endif @@ -2594,7 +2690,7 @@ static long sys_epoll_wait(struct syscall_info* call) { ptr += max_events * sizeof(*events2); } if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } /* Try a no-timeout version of the syscall first. If this doesn't return @@ -2608,7 +2704,9 @@ static long sys_epoll_wait(struct syscall_info* call) { call->args[4], call->args[5]); ptr = copy_output_buffer(ret * sizeof(*events2), ptr, events, events2); - ret = commit_raw_syscall(call->no, ptr, ret); + if (!commit_raw_syscall(call->no, ptr, ret)) { + return 0; + } if (timeout == 0 || (ret != EINTR && ret != 0)) { /* If we got some real results, or a non-EINTR error, we can just return it directly. @@ -2620,7 +2718,8 @@ static long sys_epoll_wait(struct syscall_info* call) { returned had it run traced. (We didn't enable the desched signal so no extra signals could have affected our untraced syscall that could not have been delivered to a traced syscall.) */ - return ret; + set_return_value(call, ret); + return 1; } /* Some timeout was requested and either we got no results or we got EINTR. @@ -2635,7 +2734,7 @@ static long sys_epoll_wait(struct syscall_info* call) { itself interrupt the syscall and cause it to return EINTR just as would happen without rr. */ - return traced_raw_syscall(call); + return 0; } #define CLONE_SIZE_THRESHOLD 0x10000 @@ -2643,7 +2742,7 @@ static long sys_epoll_wait(struct syscall_info* call) { static long sys_read(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Reading from a pipe could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_read; @@ -2706,12 +2805,10 @@ static long sys_read(struct syscall_info* call) { ioctl_ret = privileged_untraced_syscall3(SYS_ioctl, thread_locals->cloned_file_data_fd, BTRFS_IOC_CLONE_RANGE, &ioctl_args); - ioctl_ret = commit_raw_syscall(SYS_ioctl, ioctl_ptr, ioctl_ret); + commit_raw_syscall(SYS_ioctl, ioctl_ptr, ioctl_ret); } if (ioctl_ret >= 0) { - struct syscall_info read_call = { SYS_read, - { fd, (long)buf, count, 0, 0, 0 } }; thread_locals->cloned_file_data_offset += count; replay_only_syscall3(SYS_dup3, thread_locals->cloned_file_data_fd, fd, 0); @@ -2719,12 +2816,12 @@ static long sys_read(struct syscall_info* call) { ptr = prep_syscall(); if (count > thread_locals->usable_scratch_size) { if (!start_commit_buffered_syscall(SYS_read, ptr, WONT_BLOCK)) { - return traced_raw_syscall(&read_call); + return 0; } ret = untraced_replayed_syscall3(SYS_read, fd, buf, count); } else { if (!start_commit_buffered_syscall(SYS_read, ptr, MAY_BLOCK)) { - return traced_raw_syscall(&read_call); + return 0; } ret = untraced_replayed_syscall3(SYS_read, fd, thread_locals->scratch_buf, count); @@ -2735,8 +2832,10 @@ static long sys_read(struct syscall_info* call) { // ReplaySession::flush_syscallbuf instead of // ReplaySession::enter_syscall or something similar. 
replay_only_syscall1(SYS_close, fd); - ret = commit_raw_syscall(SYS_read, ptr, ret); - return ret; + if (!commit_raw_syscall(SYS_read, ptr, ret)) + return 0; + set_return_value(call, ret); + return 1; } } } @@ -2750,12 +2849,15 @@ static long sys_read(struct syscall_info* call) { ptr += count; } if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, fd, buf2, count); ptr = copy_output_buffer(ret, ptr, buf, buf2); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) + return 0; + set_return_value(call, ret); + return 1; } /* On x86-32, pread/pwrite take the offset in two registers. We don't bother @@ -2782,12 +2884,14 @@ static long sys_pread64(struct syscall_info* call) { ptr += count; } if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall4(syscallno, fd, buf2, count, offset); ptr = copy_output_buffer(ret, ptr, buf, buf2); - return commit_raw_syscall(syscallno, ptr, ret); + commit_raw_syscall(syscallno, ptr, ret); + set_return_value(call, ret); + return 1; } #endif @@ -2809,12 +2913,14 @@ static long sys_readlink(struct syscall_info* call) { ptr += bufsiz; } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, path, buf2, bufsiz); ptr = copy_output_buffer(ret, ptr, buf, buf2); - return commit_raw_syscall(syscallno, ptr, ret); + commit_raw_syscall(syscallno, ptr, ret); + set_return_value(call, ret); + return 1; } #endif @@ -2839,7 +2945,7 @@ static long sys_readlinkat(struct syscall_info* call, int privileged) { if (privileged) { return privileged_traced_raw_syscall(call); } - return traced_raw_syscall(call); + return 0; } if (privileged) { @@ -2848,14 +2954,22 @@ static long sys_readlinkat(struct syscall_info* call, int privileged) { ret = untraced_syscall4(syscallno, dirfd, path, buf2, bufsiz); } ptr = copy_output_buffer(ret, ptr, buf, buf2); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + if (privileged) { + return ret; + } else { + return 1; + } } #if defined(SYS_socketcall) static long sys_socketcall_recv(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Reading from a socket could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_socketcall; @@ -2877,7 +2991,7 @@ static long sys_socketcall_recv(struct syscall_info* call) { ptr += len; } if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } new_args[0] = sockfd; @@ -2887,7 +3001,11 @@ static long sys_socketcall_recv(struct syscall_info* call) { ret = untraced_syscall2(SYS_socketcall, SYS_RECV, new_args); /* Account for MSG_TRUNC */ ptr = copy_output_buffer(ret_buf_len(ret, len), ptr, buf, buf2); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_socketcall(struct syscall_info* call) { @@ -2895,7 +3013,7 @@ static long sys_socketcall(struct syscall_info* call) { case SYS_RECV: return sys_socketcall_recv(call); default: - return traced_raw_syscall(call); + return 0; } } #endif @@ -2904,7 +3022,7 @@ static long sys_socketcall(struct syscall_info* call) { 
static long sys_recvfrom(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Reading from a socket could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_recvfrom; @@ -2942,7 +3060,7 @@ static long sys_recvfrom(struct syscall_info* call) { ptr += len; } if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } if (addrlen) { memcpy_input_parameter(addrlen2, addrlen, sizeof(*addrlen2)); @@ -2963,7 +3081,11 @@ static long sys_recvfrom(struct syscall_info* call) { } } ptr = copy_output_buffer(ret_buf_len(ret, len), ptr, buf, buf2); - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3012,7 +3134,7 @@ static int msg_received_file_descriptors(struct msghdr* msg) { static long sys_recvmsg(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Reading from a socket could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_recvmsg; @@ -3047,7 +3169,7 @@ static long sys_recvmsg(struct syscall_info* call) { ptr += msg->msg_iov[i].iov_len; } if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } /** @@ -3116,7 +3238,11 @@ static long sys_recvmsg(struct syscall_info* call) { */ ptr_end = ptr_overwritten_end; } - return commit_raw_syscall(syscallno, ptr_end, ret); + if (!commit_raw_syscall(syscallno, ptr_end, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3124,7 +3250,7 @@ static long sys_recvmsg(struct syscall_info* call) { static long sys_sendmsg(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Sending to a socket could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_sendmsg; @@ -3138,12 +3264,15 @@ static long sys_sendmsg(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, sockfd, msg, flags); - - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3151,7 +3280,7 @@ static long sys_sendmsg(struct syscall_info* call) { static long sys_sendto(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Sending to a socket could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_sendto; @@ -3168,13 +3297,16 @@ static long sys_sendto(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall6(syscallno, sockfd, buf, len, flags, dest_addr, addrlen); - - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3190,12 +3322,12 @@ static long sys_setsockopt(struct syscall_info* call) { if (level == SOL_PACKET && (optname == PACKET_RX_RING || optname == PACKET_TX_RING)) { // Let rr intercept this (and probably disable it) - return traced_raw_syscall(call); + return 0; } if (level == SOL_NETLINK && (optname 
== NETLINK_RX_RING || optname == NETLINK_TX_RING)) { // Let rr intercept this (and probably disable it) - return traced_raw_syscall(call); + return 0; } void* ptr = prep_syscall_for_fd(sockfd); @@ -3204,12 +3336,15 @@ static long sys_setsockopt(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall5(syscallno, sockfd, level, optname, optval, optlen); - - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3225,7 +3360,7 @@ static long sys_getsockopt(struct syscall_info* call) { void* optval2; if (!optlen || !optval) { - return traced_raw_syscall(call); + return 0; } void* ptr = prep_syscall_for_fd(sockfd); @@ -3239,7 +3374,7 @@ static long sys_getsockopt(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } memcpy_input_parameter(optlen2, optlen, sizeof(*optlen2)); @@ -3261,7 +3396,11 @@ static long sys_getsockopt(struct syscall_info* call) { local_memcpy(optlen, optlen2, sizeof(*optlen)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3291,7 +3430,7 @@ static long sys_getsockname(struct syscall_info* call) { } if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, sockfd, addr2, addrlen2); @@ -3304,7 +3443,11 @@ static long sys_getsockname(struct syscall_info* call) { local_memcpy(addrlen, addrlen2, sizeof(*addrlen)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3326,13 +3469,17 @@ static long sys_socketpair(struct syscall_info* call) { sv2 = ptr; ptr += sizeof(*sv2); if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall4(syscallno, domain, type, protocol, sv2); if (ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(sv, sv2, sizeof(*sv)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3347,14 +3494,18 @@ static long sys_time(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall1(syscallno, NULL); if (tp) { /* No error is possible here. 
*/ *tp = ret; } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3382,13 +3533,17 @@ static long sys_xstat64(struct syscall_info* call) { ptr += sizeof(*buf2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall2(syscallno, what, buf2); if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(buf, buf2, sizeof(*buf)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #ifdef SYS_statx @@ -3406,7 +3561,7 @@ static long sys_statx(struct syscall_info* call) { ptr += sizeof(*buf2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall5(syscallno, call->args[0], call->args[1], call->args[2], call->args[3], @@ -3414,7 +3569,11 @@ static long sys_statx(struct syscall_info* call) { if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(buf, buf2, sizeof(*buf)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } #endif @@ -3426,7 +3585,7 @@ static long sys_quotactl(struct syscall_info* call) { void* addr = (void*)call->args[3]; if ((cmd >> SUBCMDSHIFT) != Q_GETQUOTA) { - return traced_raw_syscall(call); + return 0; } void* ptr = prep_syscall(); @@ -3438,13 +3597,17 @@ static long sys_quotactl(struct syscall_info* call) { ptr += sizeof(*buf2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall4(syscallno, cmd, special, id, buf2); if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(addr, buf2, sizeof(*buf2)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_statfs(struct syscall_info* call) { @@ -3466,19 +3629,22 @@ static long sys_statfs(struct syscall_info* call) { ptr += sizeof(*buf2); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall2(syscallno, what, buf2); if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(buf, buf2, sizeof(*buf)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) + return 0; + set_return_value(call, ret); + return 1; } static long sys_write(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Writing to a pipe or FIFO could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } const int syscallno = SYS_write; @@ -3492,12 +3658,14 @@ static long sys_write(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, fd, buf, count); - - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) + return 0; + set_return_value(call, ret); + return 1; } /* On x86-32, pread/pwrite take the offset in two registers. 
We don't bother @@ -3513,13 +3681,13 @@ static long sys_pwrite64(struct syscall_info* call) { enum syscallbuf_fd_classes cls = fd_class(fd); if (cls == FD_CLASS_TRACED) { - return traced_raw_syscall(call); + return 0; } void* ptr = prep_syscall(); assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) { - return traced_raw_syscall(call); + return 0; } long ret; @@ -3529,14 +3697,17 @@ static long sys_pwrite64(struct syscall_info* call) { ret = untraced_syscall4(syscallno, fd, buf, count, offset); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) + return 0; + set_return_value(call, ret); + return 1; } #endif static long sys_writev(struct syscall_info* call) { if (force_traced_syscall_for_chaos_mode()) { /* Writing to a pipe or FIFO could unblock a higher priority task */ - return traced_raw_syscall(call); + return 0; } int syscallno = SYS_writev; @@ -3550,12 +3721,14 @@ static long sys_writev(struct syscall_info* call) { assert(syscallno == call->no); if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall3(syscallno, fd, iov, iovcnt); - - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) + return 0; + set_return_value(call, ret); + return 1; } static long sys_ptrace(struct syscall_info* call) { @@ -3566,7 +3739,7 @@ static long sys_ptrace(struct syscall_info* call) { void* data = (void*)call->args[3]; if (request != PTRACE_PEEKDATA || !data) { - return traced_raw_syscall(call); + return 0; } /* We try to emulate PTRACE_PEEKDATA using process_vm_readv. That might not @@ -3589,7 +3762,7 @@ static long sys_ptrace(struct syscall_info* call) { ptr += sizeof(long); if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } struct iovec local_iov = { data2, sizeof(long) }; @@ -3598,12 +3771,15 @@ static long sys_ptrace(struct syscall_info* call) { if (ret > 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(data, data2, ret); } - commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } if (ret != sizeof(long)) { - return traced_raw_syscall(call); + return 0; } - return ret; + set_return_value(call, ret); + return 1; } static long sys_getrusage(struct syscall_info* call) { @@ -3621,14 +3797,18 @@ static long sys_getrusage(struct syscall_info* call) { ptr += sizeof(struct rusage); } if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } ret = untraced_syscall2(syscallno, who, buf2); if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { local_memcpy(buf, buf2, sizeof(*buf)); } - return commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } + set_return_value(call, ret); + return 1; } static long sys_rt_sigprocmask(struct syscall_info* call) { @@ -3640,7 +3820,7 @@ static long sys_rt_sigprocmask(struct syscall_info* call) { if (call->args[3] != sizeof(kernel_sigset_t)) { // Unusual sigset size. Bail. 
- return traced_raw_syscall(call); + return 0; } void* ptr = prep_syscall(); @@ -3653,7 +3833,7 @@ static long sys_rt_sigprocmask(struct syscall_info* call) { ptr += sizeof(kernel_sigset_t); if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } if (set && (how == SIG_BLOCK || how == SIG_SETMASK)) { @@ -3697,14 +3877,17 @@ static long sys_rt_sigprocmask(struct syscall_info* call) { } hdr->in_sigprocmask_critical_section = 0; - commit_raw_syscall(syscallno, ptr, ret); + if (!commit_raw_syscall(syscallno, ptr, ret)) { + return 0; + } if (ret == -EAGAIN) { // The rr supervisor emulated EAGAIN because there was a pending signal. // Retry using a traced syscall so the pending signal(s) can be delivered. - return traced_raw_syscall(call); + return 0; } - return ret; + set_return_value(call, ret); + return 1; } static long sys_rrcall_rdtsc(struct syscall_info* call) { @@ -3715,7 +3898,7 @@ static long sys_rrcall_rdtsc(struct syscall_info* call) { void* buf = ptr; ptr += 8; if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { - return traced_raw_syscall(call); + return 0; } // Do an RDTSC without context-switching to rr. This is still a lot slower @@ -3727,7 +3910,9 @@ static long sys_rrcall_rdtsc(struct syscall_info* call) { local_memcpy(tsc, buf, sizeof(tsc)); // Overwrite RDX (syscall arg 3) with our TSC value. call->args[2] = tsc[1]; - return commit_raw_syscall(syscallno, ptr, tsc[0]); + commit_raw_syscall(syscallno, ptr, tsc[0]); + call->no = tsc[0]; + return 1; #else (void)call; fatal("RDTSC not supported in this architecture"); @@ -3905,7 +4090,7 @@ case SYS_epoll_pwait: #undef CASE_GENERIC_NONBLOCKING #undef CASE_GENERIC_NONBLOCKING_FD default: - return traced_raw_syscall(call); + return 0; } } @@ -3930,7 +4115,7 @@ RR_HIDDEN long syscall_hook(struct syscall_info* call) { if (!thread_locals->buffer || buffer_hdr()->locked) { /* We may be reentering via a signal handler. Bail. */ - return traced_raw_syscall(call); + return 0; } thread_locals->original_syscall_parameters = call; @@ -3939,6 +4124,7 @@ RR_HIDDEN long syscall_hook(struct syscall_info* call) { do_delay(); } + long callno = call->no; long result = syscall_hook_internal(call); if (buffer_hdr() && buffer_hdr()->notify_on_syscall_hook_exit) { // Sometimes a signal is delivered to interrupt an untraced syscall in @@ -3979,10 +4165,10 @@ RR_HIDDEN long syscall_hook(struct syscall_info* call) { // syscall_hook_internal generates either a traced syscall or a syscallbuf // record that would be flushed by SYSCALLBUF_FLUSH, so that can't // happen. 
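The hunk below is subtle: on non-aarch64 targets set_return_value() stores results into call->no, so the original syscall number has to be snapshotted into callno before syscall_hook_internal() can overwrite it; the notify syscall then receives both the result slot and the saved number. A hedged restatement of that flow, mirroring the patch rather than adding behavior:

```
static long syscall_hook_sketch(struct syscall_info* call) {
  /* Snapshot the syscall number: on non-aarch64, call->no doubles as the
   * result slot written by set_return_value(). */
  long callno = call->no;
  long result = syscall_hook_internal(call);
  if (buffer_hdr() && buffer_hdr()->notify_on_syscall_hook_exit) {
    /* Report the hook exit to rr, passing along the stored result
     * (call->no) and the original syscall number (callno). */
    set_return_value(call,
                     _raw_syscall(SYS_rrcall_notify_syscall_hook_exit,
                                  call->args[0], call->args[1],
                                  call->args[2], call->args[3],
                                  call->args[4], call->args[5],
                                  RR_PAGE_SYSCALL_PRIVILEGED_TRACED,
                                  call->no, callno));
  }
  return result;
}
```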
- result = _raw_syscall(SYS_rrcall_notify_syscall_hook_exit, call->args[0], + set_return_value(call, _raw_syscall(SYS_rrcall_notify_syscall_hook_exit, call->args[0], call->args[1], call->args[2], call->args[3], call->args[4], call->args[5], - RR_PAGE_SYSCALL_PRIVILEGED_TRACED, result, call->no); + RR_PAGE_SYSCALL_PRIVILEGED_TRACED, call->no, callno)); } // Do work that can only be safely done after syscallbuf can be flushed if (thread_locals->notify_control_msg) { diff --git a/src/record_signal.cc b/src/record_signal.cc index 24c0837e637..7f451546813 100644 --- a/src/record_signal.cc +++ b/src/record_signal.cc @@ -245,18 +245,6 @@ static remote_code_ptr get_stub_scratch_1(RecordTask* t) { RR_ARCH_FUNCTION(get_stub_scratch_1_arch, t->arch(), t); } -template -static void get_stub_scratch_2_arch(RecordTask* t, void *buff, size_t sz) { - auto remote_locals = AddressSpace::preload_thread_locals_start() - .cast>(); - auto remote_stub_scratch_2 = REMOTE_PTR_FIELD(remote_locals, stub_scratch_2); - t->read_bytes_helper(remote_stub_scratch_2, sz, buff); -} - -static void get_stub_scratch_2(RecordTask* t, void *buff, size_t sz) { - RR_ARCH_FUNCTION(get_stub_scratch_2_arch, t->arch(), t, buff, sz); -} - /** * This function is responsible for handling breakpoints we set in syscallbuf * code to detect sigprocmask calls and syscallbuf exit. It's called when we @@ -280,14 +268,25 @@ bool handle_syscallbuf_breakpoint(RecordTask* t) { // The address in stub_scratch_1 is already the correct address for this. if (t->arch() == aarch64) { uint64_t x15_x30[2]; - get_stub_scratch_2(t, x15_x30, 16); Registers r = t->regs(); + t->read_bytes_helper(r.x15(), sizeof(x15_x30), (void*)x15_x30); + r.set_ip(r.xlr()+4); r.set_x15(x15_x30[0]); r.set_xlr(x15_x30[1]); + // There are two possibilities here. Either we're in the bail path, in which + // case we're at a syscall instruction and are out of the critical region, + // or we're at a jump instruction to get us there, in which case we should + // evaluate it. TODO: Would it be better to instead make the jump instruction + // part of the safe region? + if (!is_at_syscall_instruction(t, r.ip())) { + r.set_ip(r.ip()+12); + t->count_direct_jump(); + } t->set_regs(r); t->count_direct_jump(); + } else { + t->emulate_jump(get_stub_scratch_1(t)); } - t->emulate_jump(get_stub_scratch_1(t)); restore_sighandler_if_not_default(t, SIGTRAP); // Now we're back in application code so any pending stashed signals @@ -366,6 +365,78 @@ bool handle_syscallbuf_breakpoint(RecordTask* t) { return true; } +/** + * Pre-condition: We're at a syscall-entry or seccomp-trap event inside the + * syscallbuf. + * + * This function will abort the current syscall and move us to the + * syscall-entry trap of the bail syscall. + */ +void leave_syscallbuf(RecordTask *t) { + // On aarch64, the syscallbuf final instruction breakpoint is on the + // bail path, so remove that breakpoint. + t->break_at_syscallbuf_final_instruction = false; + + remote_ptr desched_rec = t->desched_rec(); + if (!desched_rec) { + LOG(debug) << "Desched initiated"; + + /* The tracee is (re-)entering the buffered syscall. Stash + * away this breadcrumb so that we can figure out what syscall + * the tracee was in, and how much "scratch" space it carved + * off the syscallbuf, if needed.
*/
+ desched_rec = t->next_syscallbuf_record();
+ //t->push_event(DeschedEvent(desched_rec));
+ //int call = t->read_mem(REMOTE_PTR_FIELD(desched_rec, syscallno));
+
+ /* The descheduled syscall was interrupted by a signal, like
+ * all other may-restart syscalls, with the exception that
+ * this one has already been restarted (which we'll detect
+ * back in the main loop). */
+ //t->push_event(Event(interrupted, SyscallEvent(call, t->arch())));
+ //ev.desched_rec = desched_rec;
+ }
+
+ int call = t->read_mem(REMOTE_PTR_FIELD(desched_rec, syscallno));
+
+ t->exit_syscall();
+ t->write_mem(REMOTE_PTR_FIELD(desched_rec, aborted), (uint8_t)1);
+
+ Registers regs = t->regs();
+ regs.set_syscall_result((uintptr_t)-EINTR);
+ t->set_regs(regs);
+
+ LOG(debug) << " resuming (and probably switching out) blocked `"
+ << syscall_name(call, t->arch()) << "'";
+
+ // Advance until we hit the syscall entry event outside the syscallbuf,
+ // since that's the state we expect to be in.
+ while (true) {
+ t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_UNLIMITED_TICKS);
+ if (t->status().is_syscall()) {
+ if (t->is_in_syscallbuf()) {
+ continue;
+ }
+ break;
+ }
+ if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
+ LOG(debug)
+ << " (got exit, bailing out)";
+ t->push_event(Event::noop());
+ return;
+ }
+ int sig = t->stop_sig();
+ if (t->session().syscallbuf_desched_sig() == sig ||
+ PerfCounters::TIME_SLICE_SIGNAL == sig || t->is_sig_ignored(sig)) {
+ LOG(debug) << " dropping ignored " << signal_name(sig);
+ continue;
+ }
+
+ LOG(debug) << " stashing " << signal_name(sig);
+ t->stash_sig();
+ }
+}
+
/**
* Return the event needing to be processed after this desched of |t|.
* The tracee's execution may be advanced, and if so |regs| is updated
@@ -572,53 +643,12 @@ static void handle_desched_event(RecordTask* t) {
return;
}

- if (t->desched_rec()) {
- // We're already processing a desched. We probably reexecuted the
- // system call (e.g. because a signal was processed) and the syscall
- // blocked again. Carry on with the current desched.
- } else {
- /* This prevents the syscallbuf record counter from being
- * reset until we've finished guiding the tracee through this
- * interrupted call. We use the record counter for
- * assertions. */
- ASSERT(t, !t->delay_syscallbuf_reset_for_desched);
- t->delay_syscallbuf_reset_for_desched = true;
- LOG(debug) << "Desched initiated";
-
- /* The tracee is (re-)entering the buffered syscall. Stash
- * away this breadcrumb so that we can figure out what syscall
- * the tracee was in, and how much "scratch" space it carved
- * off the syscallbuf, if needed. */
- remote_ptr<const struct syscallbuf_record> desched_rec =
- t->next_syscallbuf_record();
- t->push_event(DeschedEvent(desched_rec));
- int call = t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), syscallno));
-
- /* The descheduled syscall was interrupted by a signal, like
- * all other may-restart syscalls, with the exception that
- * this one has already been restarted (which we'll detect
- * back in the main loop). */
- t->push_event(Event(interrupted, SyscallEvent(call, t->arch())));
- SyscallEvent& ev = t->ev().Syscall();
- ev.desched_rec = desched_rec;
- }
-
- SyscallEvent& ev = t->ev().Syscall();
- ev.regs = t->regs();
- /* For some syscalls (at least poll) but not all (at least not read),
- * repeated cont_syscall()s above of the same interrupted syscall
- * can set $orig_eax to 0 ... for unclear reasons. Fix that up here
- * otherwise we'll get a divergence during replay, which will not
- * encounter this problem.
- */ - int call = t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), syscallno)); - ev.regs.set_original_syscallno(call); - t->set_regs(ev.regs); - // runnable_state_changed will observe us entering this syscall and change - // state to ENTERING_SYSCALL + // Get us out of this syscall so we can unwind the buffer and resume. + Registers regs = t->regs(); + regs.set_original_syscallno((uintptr_t)-1); + t->set_regs(regs); - LOG(debug) << " resuming (and probably switching out) blocked `" - << syscall_name(call, ev.arch()) << "'"; + leave_syscallbuf(t); } static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si) { @@ -632,71 +662,7 @@ static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si) { << " because not in syscallbuf"; return true; } - - // Note that this will never fire on aarch64 in a signal stop - // since the ip has been moved to the syscall entry. - // We will catch it in the traced_syscall_entry case below. - // We will miss the exit for rrcall_notify_syscall_hook_exit - // but that should not be a big problem. - if (t->is_in_traced_syscall()) { - LOG(debug) << "Safe to deliver signal at " << t->ip() - << " because in traced syscall"; - return true; - } - - // Don't deliver signals just before entering rrcall_notify_syscall_hook_exit. - // At that point, notify_on_syscall_hook_exit will be set, but we have - // passed the point at which syscallbuf code has checked that flag. - // Replay will set notify_on_syscall_hook_exit when we replay towards the - // rrcall_notify_syscall_hook_exit *after* handling this signal, but - // that will be too late for syscallbuf to notice. - // It's OK to delay signal delivery until after rrcall_notify_syscall_hook_exit - // anyway. - if (t->is_at_traced_syscall_entry() && - !is_rrcall_notify_syscall_hook_exit_syscall(t->regs().syscallno(), t->arch())) { - LOG(debug) << "Safe to deliver signal at " << t->ip() - << " because at entry to traced syscall"; - return true; - } - - // On aarch64, the untraced syscall here include both entry and exit - // if we are at a signal stop. - if (t->is_in_untraced_syscall() && t->desched_rec()) { - // Untraced syscalls always use the architecture of the process - LOG(debug) << "Safe to deliver signal at " << t->ip() - << " because tracee interrupted by desched of " - << syscall_name(t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), - syscallno)), - t->arch()); - return true; - } - - if (t->is_in_untraced_syscall() && si->si_signo == SIGSYS && - si->si_code == SYS_SECCOMP) { - LOG(debug) << "Safe to deliver signal at " << t->ip() - << " because signal is seccomp trap."; - return true; - } - - // If the syscallbuf buffer hasn't been created yet, just delay the signal - // with no need to set notify_on_syscall_hook_exit; the signal will be - // delivered when rrcall_init_buffers is called. - if (t->syscallbuf_child) { - if (t->read_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, locked)) & 2) { - LOG(debug) << "Safe to deliver signal at " << t->ip() - << " because the syscallbuf is locked"; - return true; - } - - // A signal (e.g. seccomp SIGSYS) interrupted a untraced syscall in a - // non-restartable way. Defer it until SYS_rrcall_notify_syscall_hook_exit. - if (t->is_in_untraced_syscall()) { - // Our emulation of SYS_rrcall_notify_syscall_hook_exit clears this flag. 
- t->write_mem(
- REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit),
- (uint8_t)1);
- }
- }
+ (void)si;

LOG(debug) << "Not safe to deliver signal at " << t->ip();
return false;
diff --git a/src/record_syscall.cc b/src/record_syscall.cc
index 13b79cbf136..4d1dd251a9e 100644
--- a/src/record_syscall.cc
+++ b/src/record_syscall.cc
@@ -4151,7 +4151,8 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
case Arch::close_range:
case Arch::clone3:
case Arch::io_uring_setup:
- case Arch::io_setup: {
+ case Arch::io_setup:
+ case Arch::io_destroy: {
// Prevent the various syscalls that we don't support from being used by
// applications and fake an ENOSYS return.
Registers r = regs;
@@ -5039,11 +5040,11 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
// the syscallbuf as a fake rrcall_rdtsc, but we then can't buffer it
// because the buffer is full or disabled.
case SYS_rrcall_rdtsc: {
- syscall_state.emulate_result(0);
uint64_t tsc = rdtsc();
- remote_ptr<uint64_t> addr(t->regs().arg1());
- t->write_mem(addr, tsc);
- t->record_local(addr, &tsc);
+ syscall_state.emulate_result((uint32_t)tsc);
+ Registers regs = t->regs();
+ regs.set_dx(tsc >> 32);
+ t->set_regs(regs);
return PREVENT_SWITCH;
}
diff --git a/src/syscalls.py b/src/syscalls.py
index 5cbabf69120..503d647c16d 100644
--- a/src/syscalls.py
+++ b/src/syscalls.py
@@ -1242,7 +1242,9 @@ def __init__(self, **kwargs):
get_thread_area = IrregularEmulatedSyscall(x86=244, x64=211)
io_setup = IrregularEmulatedSyscall(x86=245, x64=206, generic=0)
-io_destroy = UnsupportedSyscall(x86=246, x64=207, generic=1)
+# Syscall 1 is a common result of register corruption. We just return ENOSYS
+# for it rather than have rr assert on this kind of corruption.
io_destroy = IrregularEmulatedSyscall(x86=246, x64=207, generic=1)
io_getevents = UnsupportedSyscall(x86=247, x64=208, generic=4)
io_submit = UnsupportedSyscall(x86=248, x64=209, generic=2)
io_cancel = UnsupportedSyscall(x86=249, x64=210, generic=3)
diff --git a/src/test/doublesegv.c b/src/test/doublesegv.c
index 10a9d4d2e55..4eacda70afb 100644
--- a/src/test/doublesegv.c
+++ b/src/test/doublesegv.c
@@ -33,9 +33,9 @@ int main(void) {
act.sa_sigaction = fault_handler;
act.sa_flags = SA_ONSTACK | SA_SIGINFO;
sigemptyset(&act.sa_mask);
- sigaction(SIGSEGV, &act, NULL);
+ test_assert(0 == sigaction(SIGSEGV, &act, NULL));

- pthread_create(&thread, NULL, do_thread, NULL);
+ test_assert(0 == pthread_create(&thread, NULL, do_thread, NULL));
test_assert(0 == sched_yield());
sleep(1000);
test_assert(0 && "Should not reach here");
diff --git a/src/test/execve_loop.c b/src/test/execve_loop.c
index 171ada58724..d91d2543fb6 100644
--- a/src/test/execve_loop.c
+++ b/src/test/execve_loop.c
@@ -7,6 +7,11 @@
what we want to test here. */
int main(__attribute__((unused)) int argc, char* argv[], char* envp[]) {
+ if (argc < 2) {
+ atomic_printf("Usage: %s <count>\n", argv[0]);
+ exit(1);
+ }
+
int count = atoi(argv[1]);

if (count > 0) {
diff --git a/src/test/expect_in_atomic_printf.py b/src/test/expect_in_atomic_printf.py
index 10338698653..b6fb6235aae 100644
--- a/src/test/expect_in_atomic_printf.py
+++ b/src/test/expect_in_atomic_printf.py
@@ -1,6 +1,12 @@
from util import *
import re

+# Advance a bit; we may be in the jump stub.
+# TODO: It would be nice to just teach gdb about this.
+for i in range(0,7):
+ send_gdb('stepi')
+ expect_gdb('(rr)')
+
send_gdb('bt')
expect_gdb('atomic_printf')
diff --git a/src/test/expect_in_exit.py b/src/test/expect_in_exit.py
index 921fde5dbc0..22cb08ceb55 100644
--- a/src/test/expect_in_exit.py
+++ b/src/test/expect_in_exit.py
@@ -1,7 +1,11 @@
from util import *
import re

-send_gdb('reverse-stepi')
+# Step out of the extended syscall jump patch.
+for i in range(0,3):
+ send_gdb('reverse-stepi')
+ expect_gdb('(rr)')
+
send_gdb('bt')
expect_gdb('_exit')
diff --git a/src/test/get_thread_list.py b/src/test/get_thread_list.py
index ccae5cc4139..d508a4a3c51 100644
--- a/src/test/get_thread_list.py
+++ b/src/test/get_thread_list.py
@@ -26,11 +26,17 @@
'(0x[0-9a-f]+ in )?pthread_barrier_wait',
'(0x[0-9a-f]+ in )?futex_wait',
'0x0*70000002 in \?\?',
+ # This is the extended jump page. We hide it from the application,
+ # but not from GDB. Eventually we may want to supply some additional
+ # debug info to GDB to teach it about this, but for now we just let it be.
+ '0x[0-9a-f]+ in \?\?',
'(0x[0-9a-f]+ in )?syscall_traced',
'(0x[0-9a-f]+ in )?rr_page_start'],
'aarch64': ['(0x[0-9a-f]+ in )?syscall_traced',
'(0x[0-9a-f]+ in )?pthread_barrier_wait',
- '(0x[0-9a-f]+ in )?futex_wait']
+ '(0x[0-9a-f]+ in )?futex_wait',
+ # Extended jump page
+ '0x[0-9a-f]+ in \?\?']
}

for i in range(NUM_THREADS + 1, 1, -1):
diff --git a/src/test/step_thread.py b/src/test/step_thread.py
index 672c351d751..23c7a600f78 100644
--- a/src/test/step_thread.py
+++ b/src/test/step_thread.py
@@ -68,12 +68,18 @@
'(0x[0-9a-f]+ in )?pthread_barrier_wait',
'(0x[0-9a-f]+ in )?futex_wait',
'0x0*70000002 in \?\?',
+ # This is the extended jump page. We hide it from the application,
+ # but not from GDB. Eventually we may want to supply some additional
+ # debug info to GDB to teach it about this, but for now we just let it be.
+ '0x[0-9a-f]+ in \?\?',
'(0x[0-9a-f]+ in )?syscall_traced',
'(0x[0-9a-f]+ in )?rr_page_start'
],
'aarch64': ['(0x[0-9a-f]+ in )?syscall_traced',
'(0x[0-9a-f]+ in )?pthread_barrier_wait',
- '(0x[0-9a-f]+ in )?futex_wait'],
+ '(0x[0-9a-f]+ in )?futex_wait',
+ # Extended jump page
+ '0x[0-9a-f]+ in \?\?',],
}

location_regex = '|'.join(stopped_locations[arch])
diff --git a/src/test/vdso_stack.py b/src/test/vdso_stack.py
index a6c060f2e47..95a3a331fba 100644
--- a/src/test/vdso_stack.py
+++ b/src/test/vdso_stack.py
@@ -5,7 +5,10 @@
send_gdb('c')
expect_gdb('Breakpoint 1')

-send_gdb('break traced_raw_syscall')
+# This was supposed to check the unwinding in the bail path of the vdso,
+# but we now unwind the syscallbuf before performing the bail syscall,
+# so just check the stack at the main syscall_hook entry.
+send_gdb('break syscall_hook')
expect_gdb('Breakpoint 2')

send_gdb('c')
expect_gdb('Breakpoint 2')

From 618715cdb783f97564367acb9629738cb03967b8 Mon Sep 17 00:00:00 2001
From: Keno Fischer
Date: Thu, 23 Jun 2022 20:12:44 -0400
Subject: [PATCH 2/4] Add test case for switching out of desched syscall

This adds a test case modeling #3285: it pokes the sigframe to force
sigreturn to resume in a different function than the one that incurred
the signal.
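The essence of the poke, sketched here for the x86_64 case (the test below
also covers i386 and aarch64; `target` stands in for the function being
diverted to):

```
#define _GNU_SOURCE
#include <signal.h>
#include <ucontext.h>

/* Stand-in for the diversion target; the test uses sigproc_and_hang()
 * and print_and_exit(). */
extern void target(void);

static void usr2_handler(int sig, siginfo_t* info, void* ucontext_ptr) {
  (void)sig;
  (void)info;
  ucontext_t* ctx = (ucontext_t*)ucontext_ptr;
  /* Rewrite the saved program counter in the sigframe so that sigreturn
   * resumes in target() rather than at the interrupted syscall site. */
  ctx->uc_mcontext.gregs[REG_RIP] = (long long)&target;
}
```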
--- CMakeLists.txt | 1 + src/test/desched_sigreturn.c | 85 ++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 src/test/desched_sigreturn.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e83ebd4380..ee63050bd6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -912,6 +912,7 @@ set(BASIC_TESTS daemon desched_blocking_poll desched_sigkill + desched_sigreturn detach_state detach_threads detach_sigkill diff --git a/src/test/desched_sigreturn.c b/src/test/desched_sigreturn.c new file mode 100644 index 00000000000..b46be2acfba --- /dev/null +++ b/src/test/desched_sigreturn.c @@ -0,0 +1,85 @@ +/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#include "util.h" + +pid_t main_thread_tid = 0; +int fds[2]; +char zeros[8192]; + +void sigproc_and_hang(void) +{ + sigset_t sigs; + sigfillset(&sigs); + sigdelset(&sigs, SIGUSR2); + sigprocmask(SIG_SETMASK, &sigs, NULL); + write(fds[1], &zeros, 8192); + write(fds[1], &zeros, 8192); + test_assert(0); +} + +void print_and_exit(void) +{ + atomic_printf("EXIT-SUCCESS\n"); + exit(0); +} + +volatile int counter = 0; +void usr2_handler(__attribute__((unused)) int signum, + __attribute__((unused)) siginfo_t* siginfo_ptr, + void* ucontext_ptr) { + uintptr_t target = counter == 0 ? (uintptr_t)&sigproc_and_hang : (uintptr_t)&print_and_exit; + counter += 1; +#if defined(__i386__) + ucontext_t* ctx = (ucontext_t*)ucontext_ptr; + ctx->uc_mcontext.gregs[REG_EIP] = (uint32_t)target; +#elif defined(__x86_64__) + ucontext_t* ctx = (ucontext_t*)ucontext_ptr; + ctx->uc_mcontext.gregs[REG_RIP] = (long long)target; +#elif defined(__aarch64__) + ucontext_t* ctx = (ucontext_t*)ucontext_ptr; + ctx->uc_mcontext.pc = (long)target; +#else + #error "Unsupported architecture" +#endif +} + +static void* signaler_thread(__attribute__((unused)) void* p) { + for (int i = 0; i < 10; ++i) + sched_yield(); + syscall(SYS_tgkill, getpid(), main_thread_tid, SIGUSR2); + // Technically should use atomics, but volatile is good enough for this test, + // since we're doing a syscall in the loop. + while (counter == 0) + sched_yield(); + for (int i = 0; i < 10; ++i) + sched_yield(); + syscall(SYS_tgkill, getpid(), main_thread_tid, SIGUSR2); + return NULL; +} + +int main(void) { + int err = pipe(fds); + test_assert(err == 0); + + err = fcntl(fds[1], F_SETPIPE_SZ, 4096); + test_assert(err > 0); + + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = usr2_handler; + sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTART; + err = sigaction(SIGUSR2, &sa, NULL); + test_assert(err == 0); + + main_thread_tid = sys_gettid(); + + pthread_t thread; + pthread_create(&thread, NULL, signaler_thread, NULL); + + // Block on pipe read. 
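+ // Expected flow: the read() below blocks (and deschedules). The first
+ // SIGUSR2 diverts its sigreturn into sigproc_and_hang(), which blocks
+ // again in write() with every signal except SIGUSR2 masked. The second
+ // SIGUSR2 then diverts that sigreturn into print_and_exit(), so on
+ // success execution never falls out of read() into the asserts below.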
+ int ch = 0;
+ err = read(fds[0], &ch, sizeof(int));
+ test_assert(0);
+
+ return 0;
+}

From df8d55c290a172fdcb7b929f4cc03566e771b54e Mon Sep 17 00:00:00 2001
From: Keno Fischer
Date: Sat, 9 Jul 2022 23:27:50 +0000
Subject: [PATCH 3/4] Allow deterministic signals during syscallbuf, but give a loud error

---
 src/record_signal.cc | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/record_signal.cc b/src/record_signal.cc
index 7f451546813..c9a24a81e9f 100644
--- a/src/record_signal.cc
+++ b/src/record_signal.cc
@@ -651,7 +651,7 @@ static void handle_desched_event(RecordTask* t) {
leave_syscallbuf(t);
}

-static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si) {
+static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si, SignalDeterministic deterministic) {
if (!t->is_in_syscallbuf()) {
/* The tracee is outside the syscallbuf code,
* so in most cases can't possibly affect
@@ -662,7 +662,13 @@ static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si) {
<< " because not in syscallbuf";
return true;
}
- (void)si;
+
+ if (deterministic == DETERMINISTIC_SIG) {
+ LOG(error) << "Received deterministic signal " << signal_name(si->si_signo)
+ << " while in syscallbuf code.\nOrdinarily this should never happen.\n"
+ << "Recording will proceed, but additional errors or a corrupted trace may follow.";
+ return true;
+ }

LOG(debug) << "Not safe to deliver signal at " << t->ip();
return false;
@@ -714,7 +720,7 @@ SignalHandled handle_signal(RecordTask* t, siginfo_t* si,
return SIGNAL_HANDLED;
}

- if (!is_safe_to_deliver_signal(t, si)) {
+ if (!is_safe_to_deliver_signal(t, si, deterministic)) {
return DEFER_SIGNAL;
}

From d6dc8590c9861d63831773b8431c8aabbb252f45 Mon Sep 17 00:00:00 2001
From: Keno Fischer
Date: Sun, 17 Jul 2022 02:55:27 +0000
Subject: [PATCH 4/4] Address Yichao's review

---
 src/Monkeypatcher.cc | 18 +++++++++---------
 src/preload/syscall_hook.S | 5 ++++-
 src/record_signal.cc | 2 +-
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/Monkeypatcher.cc b/src/Monkeypatcher.cc
index 23115cc823a..1efa5e233b3 100644
--- a/src/Monkeypatcher.cc
+++ b/src/Monkeypatcher.cc
@@ -500,8 +500,8 @@ remote_code_ptr Monkeypatcher::get_jump_stub_exit_breakpoint(remote_code_ptr ip,
--it;
patched_syscall *ps = &syscall_stub_list[it->second];
auto bp = it->first + ps->size - ps->safe_suffix;
- if (pp == bp - 4 || pp == bp - 8) {
- return remote_code_ptr((it->first + ps->size - 4).as_int());
+ if (pp == bp - 4 || pp == bp - 8 || pp == bp - 12) {
+ return remote_code_ptr((it->first + ps->size - 12).as_int());
}
return nullptr;
}
@@ -717,13 +717,13 @@ bool patch_syscall_with_hook_arch(Monkeypatcher& patcher,
2 * 4,
/**
* safe_suffix:
- * We've returned from syscallbuf and continue execution
- * won't hit syscallbuf breakpoint
- * (this also include the 8 bytes that stores the return address)
- * Note that stack restore instruction also belongs to the syscallbuf return path
- * However, since it is still using the scratch memory,
- * it doesn't belong to the safe area.
- * The caller needs to have special handling for that instruction.
+ * The safe suffix consists of all instructions that no longer use the
+ * syscallbuf's private stack memory. On aarch64, that is the bail-path svc
+ * instruction and the final jump (including the 8-byte return address).
+ * See the detailed extended jump patch assembly above for details.
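+ * (Concretely: the bail-path svc sits 16 bytes before the end of the stub,
+ * the final jump 12 bytes before the end, which is where
+ * get_jump_stub_exit_breakpoint above places the exit breakpoint, and the
+ * stored return address occupies the last 8 bytes.)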
+ * Note that the stack restore instructions also occur on the syscallbuf
+ * return path, but are not considered part of the safe suffix, since they
+ * still rely on the syscallbuf stack memory to function properly.
*/
2 * 4 + 8 });
diff --git a/src/preload/syscall_hook.S b/src/preload/syscall_hook.S
index 9191f5f9a6d..a07a40cbaf0 100644
--- a/src/preload/syscall_hook.S
+++ b/src/preload/syscall_hook.S
@@ -891,7 +891,9 @@ retq
_syscallbuf_code_start:

_syscall_hook_trampoline:
- // stack frame:
+ // parent frame:
+ // 0 (688): lr from the extended jump patch [this gets rewritten here in the bail path]
+ // this stack frame:
// 208-688: q2 - q31
// 128-200: x10 - x18
// 112-128: x7, x9
@@ -952,6 +954,7 @@ _syscall_hook_trampoline:
cbnz x0, 1f

// If the function requested the bail path, rewrite the return address
+ // N.B.: This modifies the return address saved in the parent frame.
ldr x0, [sp, 688]
add x0, x0, 8
str x0, [sp, 688]
diff --git a/src/record_signal.cc b/src/record_signal.cc
index c9a24a81e9f..c6a2d7015ab 100644
--- a/src/record_signal.cc
+++ b/src/record_signal.cc
@@ -298,7 +298,7 @@ bool handle_syscallbuf_breakpoint(RecordTask* t) {
LOG(debug) << "Reached syscallstub exit instruction, singlestepping to "
"enable signal dispatch";
ASSERT(t, t->arch() == aarch64 && t->syscallstub_exit_breakpoint);
- auto retaddr_addr = t->syscallstub_exit_breakpoint.to_data_ptr<uint8_t>() + 3 * 4;
+ auto retaddr_addr = t->syscallstub_exit_breakpoint.to_data_ptr<uint8_t>() + 4;
uint64_t retaddr;
t->read_bytes_helper(retaddr_addr, sizeof(retaddr), &retaddr);
Registers r = t->regs();