diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d5d7f60057..bdd35e67449 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -556,17 +556,26 @@ endforeach(generated_file) add_custom_target(Generated DEPENDS ${GENERATED_FILES}) -add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.c++" - "${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.h" - COMMAND capnp compile - "--src-prefix=${CMAKE_CURRENT_SOURCE_DIR}/src" - "-oc++:${CMAKE_CURRENT_BINARY_DIR}" - "${CMAKE_CURRENT_SOURCE_DIR}/src/rr_trace.capnp" - DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/rr_trace.capnp") -set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.c++" - PROPERTIES GENERATED true) -set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.h" - PROPERTIES GENERATED true HEADER_FILE_ONLY true) + +set(CAPNP_FILES + rr_trace + rr_pcp +) + +# Compile capnproto files +foreach(capnp_file ${CAPNP_FILES}) +add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${capnp_file}.capnp.c++" + "${CMAKE_CURRENT_BINARY_DIR}/${capnp_file}.capnp.h" + COMMAND capnp compile + "--src-prefix=${CMAKE_CURRENT_SOURCE_DIR}/src" + "-oc++:${CMAKE_CURRENT_BINARY_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/src/${capnp_file}.capnp" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/${capnp_file}.capnp") + set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/${capnp_file}.capnp.c++" + PROPERTIES GENERATED true) + set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/${capnp_file}.capnp.h" + PROPERTIES GENERATED true HEADER_FILE_ONLY true) +endforeach() if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") set(BLAKE_ARCH_DIR third-party/blake2/neon) @@ -578,12 +587,14 @@ set(RR_SOURCES src/AddressSpace.cc src/AutoRemoteSyscalls.cc src/BuildidCommand.cc + src/CheckpointInfo.cc src/Command.cc src/CompressedReader.cc src/CompressedWriter.cc src/ContextSwitchEvent.cc src/CPUFeaturesCommand.cc src/CPUIDBugDetector.cc + src/CreateCheckpointsCommand.cc src/DiversionSession.cc src/DumpCommand.cc src/Dwarf.cc @@ 
-621,6 +632,7 @@ set(RR_SOURCES src/PackCommand.cc src/PerfCounters.cc src/PerfCounterBuffers.cc + src/PersistentCheckpointing.cc src/PidFdMonitor.cc src/processor_trace_check.cc src/ProcFdDirMonitor.cc @@ -660,6 +672,7 @@ set(RR_SOURCES src/WaitManager.cc src/WaitStatus.cc ${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.c++ + ${CMAKE_CURRENT_BINARY_DIR}/rr_pcp.capnp.c++ ${BLAKE_ARCH_DIR}/blake2b.c ) @@ -1499,6 +1512,7 @@ set(TESTS_WITH_PROGRAM # check_session_leaks checkpoint_dying_threads checkpoint_mixed_mode + checkpoint_persistent_shmem checksum_sanity check_lost_interrupts clone_file_range diff --git a/src/AddressSpace.cc b/src/AddressSpace.cc index 883a09e7b3e..3243520c07b 100644 --- a/src/AddressSpace.cc +++ b/src/AddressSpace.cc @@ -554,6 +554,11 @@ void AddressSpace::save_auxv(Task* t) { save_interpreter_base(t, saved_auxv()); } +void AddressSpace::restore_auxv(Task* t, std::vector&& auxv) { + saved_auxv_ = std::move(auxv); + save_interpreter_base(t, saved_auxv()); +} + void AddressSpace::save_interpreter_base(Task* t, std::vector auxv) { saved_interpreter_base_ = read_interpreter_base(auxv); save_ld_path(t, saved_interpreter_base()); diff --git a/src/AddressSpace.h b/src/AddressSpace.h index 6f6338df607..46a1bac3a91 100644 --- a/src/AddressSpace.h +++ b/src/AddressSpace.h @@ -662,6 +662,14 @@ class AddressSpace : public HasTaskSet { * Dies if no shm size is registered for the address. */ size_t get_shm_size(remote_ptr addr) { return shm_sizes[addr]; } + + /** + * Check if `map` is shared memory + */ + bool has_shm_at(const KernelMapping& map) const { + return shm_sizes.find(map.start()) != std::cend(shm_sizes); + } + void remove_shm_size(remote_ptr addr) { shm_sizes.erase(addr); } /** @@ -795,6 +803,9 @@ class AddressSpace : public HasTaskSet { const std::vector& saved_auxv() { return saved_auxv_; } void save_auxv(Task* t); + /* Used when restoring persistent checkpoints. 
*/ + void restore_auxv(Task* t, std::vector&& auxv); + remote_ptr saved_interpreter_base() { return saved_interpreter_base_; } void save_interpreter_base(Task* t, std::vector auxv); @@ -873,6 +884,15 @@ class AddressSpace : public HasTaskSet { bool legacy_breakpoint_mode() { return stopping_breakpoint_table_ != nullptr; } remote_code_ptr do_breakpoint_fault_addr() { return do_breakpoint_fault_addr_; } + + void set_breakpoint_fault_addr(remote_code_ptr addr) { + do_breakpoint_fault_addr_ = addr; + } + + void set_uses_syscall_buffer(bool uses_syscall_buffer = true) { + syscallbuf_enabled_ = uses_syscall_buffer; + } + remote_code_ptr stopping_breakpoint_table() { return stopping_breakpoint_table_; } int stopping_breakpoint_table_entry_size() { return stopping_breakpoint_table_entry_size_; } diff --git a/src/BpfMapMonitor.h b/src/BpfMapMonitor.h index 9ee1e1c67f5..6095d26eb6d 100644 --- a/src/BpfMapMonitor.h +++ b/src/BpfMapMonitor.h @@ -14,12 +14,19 @@ class BpfMapMonitor : public FileMonitor { public: BpfMapMonitor(uint64_t key_size, uint64_t value_size) : key_size_(key_size), value_size_(value_size) {} - virtual Type type() override { return BpfMap; } + virtual Type type() const override { return BpfMap; } uint64_t key_size() const { return key_size_; } uint64_t value_size() const { return value_size_; } private: + virtual void serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept override { + auto bpf = builder.initBpf(); + bpf.setKeySize(key_size_); + bpf.setValueSize(value_size_); + } + uint64_t key_size_; uint64_t value_size_; }; diff --git a/src/CheckpointInfo.cc b/src/CheckpointInfo.cc new file mode 100644 index 00000000000..1652280beaf --- /dev/null +++ b/src/CheckpointInfo.cc @@ -0,0 +1,333 @@ +#include "CheckpointInfo.h" +#include "GdbServerConnection.h" +#include "ReplayTimeline.h" +#include "ScopedFd.h" +#include "rr_pcp.capnp.h" +#include "util.h" +#include +#include +#include +#include +#include +#include +#include + +namespace rr { + 
+MarkData::MarkData(const ReplayTimeline::Mark& m) + : time(m.get_key().trace_time), + ticks(m.get_key().ticks), + step_key(m.get_key().step_key.as_int()), + ticks_at_event_start(m.get_internal()->ticks_at_event_start), + regs(m.regs()), + extra_regs(m.extra_regs()), + return_addresses(m.get_internal()->proto.return_addresses), + singlestep_to_next_mark_no_signal( + m.get_internal()->singlestep_to_next_mark_no_signal), + arch(m.get_internal()->extra_regs.arch()) {} + +MarkData::MarkData(rr::pcp::MarkData::Reader reader, + const CPUIDRecords& cpuid_recs) + : time(reader.getTime()), + ticks(reader.getTicks()), + step_key(reader.getStepKey()), + ticks_at_event_start(reader.getTicksAtEventStart()), + regs(), + extra_regs(), + return_addresses(), + singlestep_to_next_mark_no_signal( + reader.getSinglestepToNextMarkNoSignal()), + arch(from_trace_arch(reader.getArch())) { + regs.set_arch(arch); + regs.set_from_trace(arch, reader.getRegs().getRaw().begin(), + reader.getRegs().getRaw().size()); + auto eregs = reader.getExtraRegs().getRaw(); + set_extra_regs_from_raw(arch, cpuid_recs, eregs, extra_regs); + auto i = 0; + for (auto rs : reader.getReturnAddresses()) { + return_addresses.addresses[i++] = rs; + } +} + +static std::vector checkpoint_directories(const string& trace_dir) { + std::vector result; + auto tracedir = opendir(trace_dir.c_str()); + if (!tracedir) { + return {}; + } + for (dirent* e = readdir(tracedir); e; e = readdir(tracedir)) { + std::string_view filename{ e->d_name }; + if (filename.find("checkpoint-") == 0) { + stringstream metadata_file{}; + metadata_file << trace_dir << '/' << filename; + auto f = metadata_file.str(); + struct stat buffer; + if (stat(f.c_str(), &buffer) == 0) { + result.push_back(std::move(f)); + } + } + } + closedir(tracedir); + return result; +} + +std::vector get_checkpoint_infos( + const std::string& trace_dir, const CPUIDRecords& cpuid_recs) { + + std::vector checkpoints; + for (auto checkpoint_dir : 
checkpoint_directories(trace_dir)) { + auto metadata_file = checkpoint_dir + "/metadata"; + ScopedFd fd(metadata_file.c_str(), O_RDONLY); + if (!fd.is_open()) { + continue; + } + capnp::PackedFdMessageReader reader(fd); + auto checkpointsInfoReader = reader.getRoot(); + auto info = + CheckpointInfo{ checkpoint_dir, checkpointsInfoReader, cpuid_recs }; + checkpoints.push_back(info); + } + + std::sort(checkpoints.begin(), checkpoints.end(), + [](CheckpointInfo& a, CheckpointInfo& b) { + return a.clone_data.time <= b.clone_data.time; + }); + return checkpoints; +} + +bool CheckpointInfo::serialize(ReplaySession& session) { + // and write a new one + auto fd = open_for_write(); + if (!fd.is_open() && errno == EEXIST) { + // we just nope out, if it exists already. 1 checkpoint per FrameTime + // allowed currently. + std::cout << "path already exists: " << capnp_directory << std::endl; + return false; + } else if (!fd.is_open()) { + FATAL() << "failed to open file " << capnp_directory; + } + + capnp::MallocMessageBuilder message; + pcp::CheckpointInfo::Builder cp_entry = + message.initRoot(); + auto clone_writer = cp_entry.getCloneCompletion(); + session.serialize_checkpoint(clone_writer, *this); + cp_entry.setId(unique_id); + auto tuid = cp_entry.initLastContinueTask(); + tuid.setGroupId(last_continue_task.tguid.tid()); + tuid.setGroupSerial(last_continue_task.tguid.serial()); + tuid.setTaskId(last_continue_task.tuid.tid()); + tuid.setTaskSerial(last_continue_task.tuid.serial()); + + cp_entry.setWhere(str_to_data(where)); + cp_entry.setNextSerial(next_serial); + auto statsWriter = cp_entry.getStatistics(); + statsWriter.setBytesWritten(stats.bytes_written); + statsWriter.setSyscallsPerformed(stats.syscalls_performed); + statsWriter.setTicksProcessed(stats.ticks_processed); + + const auto mark_data_serializer = [](const MarkData& mark_data, + auto& builder) { + builder.setTime(mark_data.time); + builder.setStepKey(mark_data.step_key); + builder.setTicks(mark_data.ticks); 
+ builder.initRegs().setRaw(regs_to_raw(mark_data.regs)); + auto ras = builder.initReturnAddresses(8); + for (auto i = 0; i < 8; i++) { + ras.set(i, mark_data.return_addresses.addresses[i].as_int()); + } + builder.initExtraRegs().setRaw(extra_regs_to_raw(mark_data.extra_regs)); + builder.setTicksAtEventStart(mark_data.ticks_at_event_start); + builder.setSinglestepToNextMarkNoSignal( + mark_data.singlestep_to_next_mark_no_signal); + builder.setArch(to_trace_arch(mark_data.arch)); + }; + + if (is_explicit()) { + auto explicit_builder = cp_entry.initExplicit(); + mark_data_serializer(clone_data, explicit_builder); + } else { + auto non_explicit = cp_entry.initNonExplicit(); + // mark that holds _actual_ session clone + auto mark_with_clone = non_explicit.initCloneMark(); + mark_data_serializer(clone_data, mark_with_clone); + // mark that only holds a mark. It gets very messy quickly, wrt to Marks, + // Clones, Checkpoints. + auto mark_with_gdb_checkpoint = non_explicit.initCheckpointMark(); + mark_data_serializer(*non_explicit_mark_data, mark_with_gdb_checkpoint); + } + capnp::writePackedMessageToFd(fd, message); + return true; +} + +bool CheckpointInfo::exists_on_disk() const { + struct stat buf; + return stat(capnp_file_path().c_str(), &buf) == 0; +} + +void CheckpointInfo::set_capnp_directory(const ReplayTimeline::Mark& mark) { + + capnp_directory = mark.get_checkpoint()->trace_reader().dir() + + "/checkpoint-" + std::to_string(mark.time()); +} + +CheckpointInfo::CheckpointInfo(const Checkpoint& c) + : unique_id(CheckpointInfo::generate_unique_id(c.unique_id)), + last_continue_task(c.last_continue_task), + where(c.where), + clone_data(c.mark), + non_explicit_mark_data(nullptr) { + DEBUG_ASSERT(c.is_explicit == Checkpoint::EXPLICIT && + c.mark.has_rr_checkpoint()); + // can't assert before ctor, set these values here. 
+ next_serial = c.mark.get_checkpoint()->current_task_serial(); + stats = c.mark.get_checkpoint()->statistics(); + LOG(debug) << "checkpoint clone at " << clone_data.time + << "; GDB checkpoint at " << clone_data.time; + set_capnp_directory(c.mark); +} + +CheckpointInfo::CheckpointInfo(ExtendedTaskId last_continue, + const ReplayTimeline::Mark& mark_with_checkpoint) + : unique_id(CheckpointInfo::generate_unique_id()), + last_continue_task(last_continue), + where("Unknown"), + next_serial(mark_with_checkpoint.get_checkpoint()->current_task_serial()), + clone_data(mark_with_checkpoint), + non_explicit_mark_data(nullptr), + stats(mark_with_checkpoint.get_checkpoint()->statistics()) { + LOG(debug) << "checkpoint clone at " << clone_data.time + << "; GDB checkpoint at " << clone_data.time; + set_capnp_directory(mark_with_checkpoint); +} + +CheckpointInfo::CheckpointInfo(const Checkpoint& non_explicit_cp, + const ReplayTimeline::Mark& mark_with_clone) + : unique_id(CheckpointInfo::generate_unique_id(non_explicit_cp.unique_id)), + last_continue_task(non_explicit_cp.last_continue_task), + where(non_explicit_cp.where), + next_serial(mark_with_clone.get_checkpoint()->current_task_serial()), + clone_data(mark_with_clone), + non_explicit_mark_data(new MarkData{ non_explicit_cp.mark }), + stats(mark_with_clone.get_checkpoint()->statistics()) { + DEBUG_ASSERT(non_explicit_cp.is_explicit == Checkpoint::NOT_EXPLICIT && + !non_explicit_cp.mark.has_rr_checkpoint() && + "Constructor meant for non explicit checkpoints"); + // XXX we give this checkpoint the id (and name/path) of the actual cloned + // session data, so that multiple non explicit checkpoints later on, can + // reference the same clone data (not yet implemented) + LOG(debug) << "checkpoint clone at " << clone_data.time + << "; GDB checkpoint at " << non_explicit_mark_data->time; + set_capnp_directory(mark_with_clone); +} + +CheckpointInfo::CheckpointInfo(std::string metadata_file, + rr::pcp::CheckpointInfo::Reader 
reader, + const CPUIDRecords& cpuid_recs) + : capnp_directory(std::move(metadata_file)), + unique_id(reader.getId()), + where(data_to_str(reader.getWhere())), + next_serial(reader.getNextSerial()), + clone_data(reader.isExplicit() ? reader.getExplicit() + : reader.getNonExplicit().getCloneMark(), + cpuid_recs), + non_explicit_mark_data( + reader.isNonExplicit() + ? new MarkData{ reader.getNonExplicit().getCheckpointMark(), + cpuid_recs } + : nullptr), + stats() { + auto t = reader.getLastContinueTask(); + last_continue_task = ExtendedTaskId{ { t.getGroupId(), t.getGroupSerial() }, + { t.getTaskId(), t.getTaskSerial() } }; + auto s = reader.getStatistics(); + stats.bytes_written = s.getBytesWritten(); + stats.syscalls_performed = s.getSyscallsPerformed(); + stats.ticks_processed = s.getTicksProcessed(); +} + +void CheckpointInfo::delete_from_disk() { + const auto remove_file = [](auto path_data) { + const auto path = data_to_str(path_data); + if (remove(path.c_str()) != 0) { + LOG(error) << "Failed to remove " << path; + } + }; + ScopedFd fd(capnp_directory.c_str(), O_RDONLY); + capnp::PackedFdMessageReader datum(fd); + pcp::CloneCompletionInfo::Reader cc_reader = + datum.getRoot(); + const auto addr_spaces = cc_reader.getAddressSpaces(); + for (const auto& as : addr_spaces) { + const auto mappings_data = as.getProcessSpace().getVirtualAddressSpace(); + for (const auto& m : mappings_data) { + switch (m.getMapType().which()) { + case pcp::KernelMapping::MapType::FILE: + remove_file(m.getMapType().getFile().getContentsPath()); + break; + case pcp::KernelMapping::MapType::SHARED_ANON: + remove_file(m.getMapType().getSharedAnon().getContentsPath()); + break; + case pcp::KernelMapping::MapType::PRIVATE_ANON: + remove_file(m.getMapType().getPrivateAnon().getContentsPath()); + break; + case pcp::KernelMapping::MapType::GUARD_SEGMENT: + break; + case pcp::KernelMapping::MapType::SYSCALL_BUFFER: + remove_file(m.getMapType().getSyscallBuffer().getContentsPath()); + break; + 
case pcp::KernelMapping::MapType::RR_PAGE: + remove_file(m.getMapType().getRrPage().getContentsPath()); + break; + } + } + } + + remove(capnp_directory.c_str()); + remove(data_directory().c_str()); + if (exists_on_disk()) { + LOG(error) << "Couldn't remove persistent checkpoint data (or directory)"; + } +} + +ScopedFd CheckpointInfo::open_for_read() const { + DEBUG_ASSERT(exists_on_disk() && "This checkpoint has not been serialized; " + "or the index file has been removed."); + auto file = ScopedFd(capnp_file_path().c_str(), O_RDONLY); + if (!file.is_open()) + FATAL() << "Couldn't open checkpoint data " << file; + return file; +} + +ScopedFd CheckpointInfo::open_for_write() const { + DEBUG_ASSERT(!exists_on_disk() && + "Already serialized checkpoints shouldn't be re-written"); + auto file = + ScopedFd(capnp_file_path().c_str(), O_EXCL | O_CREAT | O_RDWR, 0700); + if (!file.is_open()) + FATAL() << "Couldn't open checkpoint file for writing " + << capnp_file_path(); + return file; +} + +std::string CheckpointInfo::capnp_file_path() const { + return capnp_directory + "/metadata"; +} + +const std::string& CheckpointInfo::data_directory() const { + return capnp_directory; +} + +/*static*/ size_t CheckpointInfo::generate_unique_id(size_t id) { + // if we haven't been set already, generate a unique "random" id + if (id == 0) { + timeval t; + gettimeofday(&t, nullptr); + auto cp_id = (t.tv_sec * 1000 + t.tv_usec / 1000); + return cp_id; + } else { + return id; + } +} + +} // namespace rr \ No newline at end of file diff --git a/src/CheckpointInfo.h b/src/CheckpointInfo.h new file mode 100644 index 00000000000..efeb16df362 --- /dev/null +++ b/src/CheckpointInfo.h @@ -0,0 +1,122 @@ +#pragma once + +#include "ExtraRegisters.h" +#include "GdbServer.h" +#include "GdbServerConnection.h" +#include "ReplayTimeline.h" +#include "ReturnAddressList.h" +#include "kernel_abi.h" +#include "rr_pcp.capnp.h" +#include "util.h" +#include +#include + +namespace rr { + +using CPUIDRecords = 
/**
 * Serializable description of one persistent checkpoint: where it lives on
 * disk plus the mark data needed to rebuild the corresponding timeline marks.
 */
class CheckpointInfo {
  // Sets `capnp_directory` to "<trace dir>/checkpoint-<mark.time()>", where
  // the trace dir is taken from the session cloned at `mark`.
  void set_capnp_directory(const ReplayTimeline::Mark& mark);

public:
  /**
   * For `GDBServer` users of explicit checkpoints.
   */
  CheckpointInfo(const Checkpoint& checkpoint);

  /**
   * For `GDBServer` users where a non explicit checkpoint was set.
   * `mark_with_clone` is the mark which holds the actual checkpoint / clone,
   * which is some arbitrary event time before actual GDB checkpoint.
   */
  CheckpointInfo(const Checkpoint& checkpoint,
                 const ReplayTimeline::Mark& mark_with_clone);

  /* For `CreateCheckpointsCommand` users (rr create-checkpoints command) */
  CheckpointInfo(ExtendedTaskId last_continue_task,
                 const ReplayTimeline::Mark& mark_with_checkpoint);
  // When deserializing from capnproto stream. NOTE: despite its name,
  // `metadata_file` is stored as `capnp_directory`, i.e. it is the checkpoint
  // directory, not the path of the metadata file inside it.
  CheckpointInfo(std::string metadata_file,
                 rr::pcp::CheckpointInfo::Reader reader,
                 const CPUIDRecords& cpuid_recs);

  // Writes this checkpoint (including the clone data of `session`) as a
  // packed capnproto message to the metadata file. Returns true on success.
  bool serialize(ReplaySession& session);
  // True if the metadata file (`capnp_file_path()`) can be stat'ed.
  bool exists_on_disk() const;
  // Removes the checkpoint's files from disk; failures are logged.
  void delete_from_disk();

  // Open the metadata file for reading / create it for writing. Both are
  // fatal on failure.
  ScopedFd open_for_read() const;
  ScopedFd open_for_write() const;
  // `capnp_directory` + "/metadata".
  std::string capnp_file_path() const;
  // Returns `capnp_directory`.
  const std::string& data_directory() const;

  /**
   * Returns event time for this checkpoint
   */
  FrameTime event_time() const { return clone_data.time; }

  // Returns `id` if it is non-zero, otherwise a new id derived from the
  // current time in milliseconds.
  static size_t generate_unique_id(size_t id = 0);

  // Two infos are equal when they refer to the same checkpoint directory.
  friend bool operator==(const CheckpointInfo& lhs, const CheckpointInfo& rhs) {
    return lhs.capnp_directory == rhs.capnp_directory;
  }

  // Explicit checkpoints carry no separate non-explicit mark.
  bool is_explicit() const { return non_explicit_mark_data == nullptr; }

  // Directory of this checkpoint ("<trace dir>/checkpoint-<event time>"). The
  // filled out capnproto schema is the file "metadata" inside it.
  std::string capnp_directory;
  size_t unique_id;
  ExtendedTaskId last_continue_task;
  std::string where;
  uint32_t next_serial;
  // MarkData collected from a Mark with a clone (either an explicit checkpoint,
  // or the first found clone before a non-explicit checkpoint)
  MarkData clone_data;
  // (optional) MarkData collected from a Mark without a clone (in the case of
  // non explicit checkpoints)
  std::shared_ptr<MarkData> non_explicit_mark_data;
  Session::Statistics stats;
};
+ */ +std::vector get_checkpoint_infos( + const std::string& trace_dir, const CPUIDRecords& cpuid_recs); + +} // namespace rr \ No newline at end of file diff --git a/src/CreateCheckpointsCommand.cc b/src/CreateCheckpointsCommand.cc new file mode 100644 index 00000000000..d1e28fb57a5 --- /dev/null +++ b/src/CreateCheckpointsCommand.cc @@ -0,0 +1,191 @@ +#include "CreateCheckpointsCommand.h" +#include "CheckpointInfo.h" +#include "Command.h" +#include "GdbServerConnection.h" +#include "ReplayTimeline.h" +#include "TraceStream.h" +#include "log.h" +#include "main.h" +#include + +namespace rr { + +CreateCheckpointsCommand CreateCheckpointsCommand::singleton( + "create-checkpoints", + " rr create-checkpoints [OPTION]... []\n" + " -i, --interval= Create persistent checkpoints on an interval " + "of \n" + " events.\n" + " -s, --start= Start setting checkpoints at event \n" + " -e, --end= Stop setting checkpoints at event \n" + "\n" + "Creates a checkpoint at an interval of N events. " + "The command will attempt to\n" + "honor this interval as closely as possible.\n"); + +static bool parse_options(std::vector& args, + CreateCheckpointsFlags& options) { + if (parse_global_option(args)) { + return true; + } + static const OptionSpec op_spec[] = { { 'i', "--interval", HAS_PARAMETER }, + { 's', "--start", HAS_PARAMETER }, + { 'e', "--end", HAS_PARAMETER } }; + + ParsedOption opt; + if (!Command::parse_option(args, op_spec, &opt)) { + return false; + } + switch (opt.short_name) { + case 'i': + options.events_interval = static_cast(std::abs(opt.int_value)); + break; + case 's': + options.start_event = static_cast(std::abs(opt.int_value)); + break; + case 'e': + options.end_event = static_cast(std::abs(opt.int_value)); + break; + default: + DEBUG_ASSERT(0 && "Unknown option"); + return false; + } + return true; +} + +bool CreateCheckpointsCommand::verify_params_ok( + const CreateCheckpointsFlags& flags) { + if (flags.events_interval == 0) { + std::cout << "You need to provide 
an interval to set checkpoints at.\n"; + return false; + } + if (flags.end_event < flags.start_event) { + std::cout << "start & end has invalid values"; + return false; + } + if ((flags.end_event == UINT64_MAX && flags.start_event == 0) || + (flags.start_event != 0 && flags.end_event == UINT64_MAX)) { + return true; + } + + if ((flags.end_event - flags.start_event) < flags.events_interval) { + std::cout << "interval too large, can't fit between start & end"; + return false; + } + return true; +} + +int CreateCheckpointsCommand::run(std::vector& args) { + CreateCheckpointsFlags flags; + bool found_dir = false; + std::string trace_dir{}; + while (!args.empty()) { + if (parse_options(args, flags)) { + continue; + } + if (!found_dir && parse_optional_trace_dir(args, &trace_dir)) { + found_dir = true; + continue; + } + print_help(stderr); + return 1; + } + + if (!verify_params_ok(flags)) { + print_help(stderr); + return 1; + } + + auto verified_frames_to_checkpoint_at = + CreateCheckpointsCommand::find_events_to_checkpoint(trace_dir, flags); + if (verified_frames_to_checkpoint_at.empty()) { + std::cout << "No checkpointable events found.\n"; + return 2; + } + return run_main(trace_dir, verified_frames_to_checkpoint_at); +} + +static bool create_persistent_checkpoint_dir(const string& trace_dir, + FrameTime time) { + string checkpoint_dir = trace_dir + "/checkpoint-" + std::to_string(time); + if (mkdir(checkpoint_dir.c_str(), 0755) == 0) { + return true; + } + return false; +} + +int CreateCheckpointsCommand::run_main( + const std::string& trace_dir, + const std::vector& verified_events) { + DEBUG_ASSERT(!verified_events.empty() && + "No events provided to checkpoint at."); + ReplaySession::Flags session_flags{}; + ReplayTimeline timeline{ ReplaySession::create(trace_dir, session_flags) }; + auto& reader = timeline.current_session().trace_reader(); + auto serializedCheckpoints = 0; + for (const auto evt : verified_events) { + RunCommand cmd = RUN_CONTINUE; + while 
(reader.time() < evt) { + auto r = timeline.replay_step_forward(cmd); + } + auto& session = timeline.current_session(); + if (session.trace_reader().time() == evt && + create_persistent_checkpoint_dir(trace_dir, evt)) { + ASSERT(session.current_task(), session.can_clone()) + << "could not clone at frame " << evt; + auto mark = timeline.add_explicit_checkpoint(); + CheckpointInfo cp_info{ extended_task_id(session.current_task()), mark }; + if (cp_info.serialize(*mark.get_checkpoint())) { + serializedCheckpoints++; + } + timeline.remove_explicit_checkpoint(mark); + LOG(debug) << "Serialized checkpoint at event " << evt; + } else { + FATAL() << "Stopped at wrong event"; + } + } + + std::cout << "Create checkpoints run successfully finished: " + << serializedCheckpoints << " checkpoints created." << std::endl; + return 0; +} + +std::vector CreateCheckpointsCommand::find_events_to_checkpoint( + const std::string& trace_dir, const CreateCheckpointsFlags& flags) { + TraceReader reader{ trace_dir }; + std::vector events; + auto total = 0ul; + + if (flags.start_event != 0) { + while (!reader.at_end()) { + total++; + const auto f = reader.read_frame(); + if (f.time() >= static_cast(flags.start_event) && + f.event().can_checkpoint_at()) { + LOG(debug) << "Checkpointable event: " << f.event() << " " << f.time(); + events.push_back(f.time()); + break; + } + } + if (reader.at_end()) { + std::cout << "Trace is shorter than " << flags.start_event + << " (total trace events: " << total << ")" + << "Aborting." << std::endl; + return {}; + } + } + const auto add = flags.start_event == 0 ? 
1 : 0; + while (!reader.at_end() && total <= flags.end_event) { + const auto f = reader.read_frame(); + const auto next = + (events.size() + add) * flags.events_interval + flags.start_event; + if (f.time() >= static_cast(next) && f.event().can_checkpoint_at()) { + LOG(debug) << "Checkpointable event: " << f.event() << " " << f.time(); + events.push_back(f.time()); + } + total++; + } + return events; +} + +}; // namespace rr \ No newline at end of file diff --git a/src/CreateCheckpointsCommand.h b/src/CreateCheckpointsCommand.h new file mode 100644 index 00000000000..0b3dda98c2c --- /dev/null +++ b/src/CreateCheckpointsCommand.h @@ -0,0 +1,41 @@ +#pragma once + +#include "Command.h" +#include + +namespace rr { + +using FrameTime = int64_t; + +struct CreateCheckpointsFlags { + uint64_t events_interval = 0; + uint64_t start_event = 0; + uint64_t end_event = UINT64_MAX; +}; + +class CreateCheckpointsCommand : Command { +public: + virtual int run(std::vector& args) override; + + static CreateCheckpointsCommand* get() { return &singleton; } + +protected: + CreateCheckpointsCommand(const char* name, const char* help) + : Command(name, help) {} + + static CreateCheckpointsCommand singleton; + +private: + /* Runs the actual replay, creating checkpoints at events + * `frames_to_checkpoint_at`. */ + int run_main(const std::string& trace_dir, + const std::vector& frames_to_checkpoint_at); + + /* Returns events to checkpoint at given an `interval`. If `report_total` as + * an out parameter, will report total event count of trace. 
*/ + static std::vector find_events_to_checkpoint( + const std::string& trace_dir, const CreateCheckpointsFlags& interval); + bool verify_params_ok(const CreateCheckpointsFlags& cp); +}; + +} // namespace rr diff --git a/src/DebuggerExtensionCommand.cc b/src/DebuggerExtensionCommand.cc index ef2ecc552d3..c47311d39f9 100644 --- a/src/DebuggerExtensionCommand.cc +++ b/src/DebuggerExtensionCommand.cc @@ -1,9 +1,11 @@ /* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "DebuggerExtensionCommand.h" +#include "CheckpointInfo.h" #include "ReplayTask.h" #include "log.h" +#include using namespace std; @@ -76,6 +78,7 @@ static SimpleDebuggerExtensionCommand rr_history_push( forward_stack.clear(); return string(); }); + static SimpleDebuggerExtensionCommand back( "back", "Go back one entry in the rr history.", [](GdbServer& gdb_server, Task* t, const vector&) { @@ -124,22 +127,20 @@ string invoke_checkpoint(GdbServer& gdb_server, Task*, if (gdb_server.in_debuggee_end_state) { return string("The program is not being run."); } + auto& timeline = *gdb_server.timeline(); int checkpoint_id = ++gNextCheckpointId; - GdbServer::Checkpoint::Explicit e; - if (gdb_server.timeline()->can_add_checkpoint()) { - e = GdbServer::Checkpoint::EXPLICIT; - } else { - e = GdbServer::Checkpoint::NOT_EXPLICIT; - } - gdb_server.checkpoints[checkpoint_id] = GdbServer::Checkpoint( - *gdb_server.timeline(), gdb_server.last_continue_task, e, where); + const Checkpoint::Explicit e = timeline.can_add_checkpoint() + ? 
Checkpoint::EXPLICIT + : Checkpoint::NOT_EXPLICIT; + gdb_server.checkpoints[checkpoint_id] = + Checkpoint(timeline, gdb_server.last_continue_task, e, where); return string("Checkpoint ") + to_string(checkpoint_id) + " at " + where; } static SimpleDebuggerExtensionCommand checkpoint( - "checkpoint", - "create a checkpoint representing a point in the execution\n" - "use the 'restart' command to return to the checkpoint", - invoke_checkpoint); + "checkpoint", + "create a checkpoint representing a point in the execution\n" + "use the 'restart' command to return to the checkpoint", + invoke_checkpoint); string invoke_delete_checkpoint(GdbServer& gdb_server, Task*, const vector& args) { @@ -156,7 +157,7 @@ string invoke_delete_checkpoint(GdbServer& gdb_server, Task*, } auto it = gdb_server.checkpoints.find(id); if (it != gdb_server.checkpoints.end()) { - if (it->second.is_explicit == GdbServer::Checkpoint::EXPLICIT) { + if (it->second.is_explicit == Checkpoint::EXPLICIT) { gdb_server.timeline()->remove_explicit_checkpoint(it->second.mark); } gdb_server.checkpoints.erase(it); @@ -166,9 +167,9 @@ string invoke_delete_checkpoint(GdbServer& gdb_server, Task*, } } static SimpleDebuggerExtensionCommand delete_checkpoint( - "delete checkpoint", - "remove a checkpoint created with the 'checkpoint' command", - invoke_delete_checkpoint); + "delete checkpoint", + "remove a checkpoint created with the 'checkpoint' command", + invoke_delete_checkpoint); string invoke_info_checkpoints(GdbServer& gdb_server, Task*, const vector&) { @@ -183,9 +184,100 @@ string invoke_info_checkpoints(GdbServer& gdb_server, Task*, return out; } static SimpleDebuggerExtensionCommand info_checkpoints( - "info checkpoints", - "list all checkpoints created with the 'checkpoint' command", - invoke_info_checkpoints); + "info checkpoints", + "list all checkpoints created with the 'checkpoint' command", + invoke_info_checkpoints); + +string invoke_load_checkpoint(GdbServer& server, Task*, const vector&) { + 
auto existing_checkpoints = + server.current_session().as_replay()->get_persistent_checkpoints(); + auto cp_deserialized = 0; + for (const auto& cp : existing_checkpoints) { + if (server.persistent_checkpoint_is_loaded(cp.unique_id)) { + LOG(debug) << "checkpoint at time " << cp.event_time() + << " already loaded"; + continue; + } + auto session = ReplaySession::create( + server.current_session().as_replay()->trace_reader().dir(), + server.timeline()->current_session().flags()); + int checkpoint_id = ++gNextCheckpointId; + session->load_checkpoint(cp); + + server.checkpoints[checkpoint_id] = + Checkpoint(*server.timeline(), cp, session); + cp_deserialized++; + } + return "loaded " + std::to_string(cp_deserialized) + " checkpoints from disk"; +} + +// we only allow for 1 checkpoint at any particular event. This function +// returns true if it succeeded in creating a new directory, thus also +// signalling that there previously was no checkpoint with that name +static bool create_persistent_checkpoint_dir(const std::string& dir) { + if (mkdir(dir.c_str(), 0755) == 0) { + return true; + } + return false; +} + +static SimpleDebuggerExtensionCommand load_checkpoint( + "load-checkpoints", "loads persistent checkpoints", invoke_load_checkpoint); + +string invoke_write_checkpoints(GdbServer& server, Task* t, + const vector&) { + auto checkpointsWritten = 0; + const auto& trace_dir = t->session().as_replay()->trace_reader().dir(); + std::vector existing_checkpoints; + + for (auto& kvp : server.checkpoints) { + auto& cp = kvp.second; + if (cp.mark.has_rr_checkpoint()) { + const auto checkpoint_dir = + trace_dir + "/checkpoint-" + std::to_string(cp.mark.time()); + + if (!cp.persistent() && + create_persistent_checkpoint_dir(checkpoint_dir)) { + // if it's already made persistent don't serialize. 
if failure to create + // directory, don't serialize + CheckpointInfo info{ cp }; + if (info.serialize(*cp.mark.get_checkpoint())) { + checkpointsWritten++; + // update checkpoint to have the newly persisted cp's id. + cp.unique_id = info.unique_id; + } + } + } else { + auto mark_with_clone = + server.timeline()->find_closest_mark_with_clone(cp.mark); + const auto checkpoint_dir = + trace_dir + "/checkpoint-" + std::to_string(mark_with_clone->time()); + if (!mark_with_clone) { + std::cout + << "Could not find a session clone to serialize for checkpoint " + << kvp.first << '\n'; + } else if (!cp.persistent() && + create_persistent_checkpoint_dir(checkpoint_dir)) { + // if it's already made persistent don't serialize. if failure to create + // directory, don't serialize + CheckpointInfo info{ cp, *mark_with_clone }; + if (info.serialize(*mark_with_clone->get_checkpoint())) { + checkpointsWritten++; + // update checkpoint to have the newly persisted cp's id. + cp.unique_id = info.unique_id; + } + } + } + } + + return std::to_string(checkpointsWritten) + + " new checkpoints serialized. (total: " + + std::to_string(existing_checkpoints.size()) + ")"; +} + +static SimpleDebuggerExtensionCommand write_checkpoints( + "write-checkpoints", "make checkpoints persist on disk.", + invoke_write_checkpoints); void DebuggerExtensionCommand::init_auto_args() { static __attribute__((unused)) int dummy = []() { diff --git a/src/Event.cc b/src/Event.cc index a855533f7aa..a9c3be02f24 100644 --- a/src/Event.cc +++ b/src/Event.cc @@ -243,4 +243,24 @@ const char* state_name(SyscallState state) { } } +bool Event::can_checkpoint_at() const { + if (has_ticks_slop()) { + return false; + } + switch (type()) { + case EV_EXIT: + // At exits, we can't clone the exiting tasks, so + // don't event bother trying to checkpoint. + case EV_SYSCALLBUF_RESET: + // RESETs are usually inserted in between syscall + // entry/exit. Do not attempting to checkpoint at + // RESETs. 
/**
 * Return true if a checkpoint may be taken at this event.
 *
 * Events with ticks slop are rejected first. Exit, syscallbuf-reset and
 * trace-termination events are rejected as well; every other event type is
 * accepted.
 */
bool Event::can_checkpoint_at() const {
  if (has_ticks_slop()) {
    return false;
  }
  switch (type()) {
    case EV_EXIT:
      // At exits, we can't clone the exiting tasks, so
      // don't even bother trying to checkpoint.
    case EV_SYSCALLBUF_RESET:
      // RESETs are usually inserted in between syscall
      // entry/exit. Do not attempt to checkpoint at
      // RESETs. Users would never want to do that anyway.
    case EV_TRACE_TERMINATION:
      // There's nothing to checkpoint at the end of a trace.
      return false;
    default:
      return true;
  }
}
m.getMmap(); + add_monitor(leader, fd, + new MmappedFileMonitor(mmap.getDead(), mmap.getDevice(), + mmap.getInode())); + } break; + case FileMonitor::Preserve: + add_monitor(leader, fd, new PreserveFileMonitor()); + break; + case FileMonitor::ProcFd: { + const auto p_fd = m.getProcFd(); + const auto tuid = TaskUid(p_fd.getTid(), p_fd.getSerial()); + add_monitor(leader, fd, new ProcFdDirMonitor(tuid)); + break; + } + case FileMonitor::ProcMem: { + const auto pmem = m.getProcMem(); + add_monitor(leader, fd, + new ProcMemMonitor(AddressSpaceUid(pmem.getTid(), + pmem.getSerial(), + pmem.getExecCount()))); + } break; + case FileMonitor::Stdio: + add_monitor(leader, fd, new StdioMonitor(m.getStdio())); + break; + case FileMonitor::VirtualPerfCounter: + FATAL() << "VirtualPerCounter Monitor deserializing unimplemented!\n"; + break; + case FileMonitor::NonvirtualPerfCounter: + add_monitor(leader, fd, new NonvirtualPerfCounterMonitor()); + break; + case FileMonitor::SysCpu: + add_monitor(leader, fd, new SysCpuMonitor(leader, "")); + break; + case FileMonitor::ProcStat: + add_monitor( + leader, fd, + new ProcStatMonitor(leader, data_to_str(m.getProcStat()))); + break; + case FileMonitor::RRPage: + add_monitor(leader, fd, new RRPageMonitor()); + break; + case FileMonitor::ODirect: + add_monitor(leader, fd, new ODirectFileMonitor()); + break; + case FileMonitor::BpfMap: + add_monitor(leader, fd, + new BpfMapMonitor(m.getBpf().getKeySize(), + m.getBpf().getValueSize())); + break; + default: + FATAL() << "unhandled FileMonitor: " << file_monitor_type_name(t); + } + } + } +} + +void FdTable::serialize(pcp::ProcessSpace::Builder& leader_builder) const { + auto serialized_fd_mons = leader_builder.initMonitors(fds.size()); + auto mon_index = 0; + for (const auto& mon : fds) { + const auto fd = mon.first; + const auto& monitor = mon.second; + auto builder = serialized_fd_mons[mon_index++]; + monitor->serialize(fd, builder); + } +} + } // namespace rr diff --git a/src/FdTable.h 
b/src/FdTable.h index 02b9f233cd8..212de6cb2a3 100644 --- a/src/FdTable.h +++ b/src/FdTable.h @@ -9,6 +9,7 @@ #include "FileMonitor.h" #include "HasTaskSet.h" +#include "rr_pcp.capnp.h" namespace rr { @@ -74,6 +75,10 @@ class FdTable final : public HasTaskSet { int last_free_fd() const { return last_free_fd_; } void set_last_free_fd(int last_free_fd) { last_free_fd_ = last_free_fd; } + void serialize(pcp::ProcessSpace::Builder& leader_builder) const; + void deserialize(Task* leader, + const pcp::ProcessSpace::Reader& leader_reader); + void insert_task(Task* t) override; void erase_task(Task* t) override; diff --git a/src/FileMonitor.cc b/src/FileMonitor.cc index 340d55b1a4c..2b2308e4a0c 100644 --- a/src/FileMonitor.cc +++ b/src/FileMonitor.cc @@ -114,4 +114,10 @@ std::string file_monitor_type_name(FileMonitor::Type t) { } } +void FileMonitor::serialize(int fd, + pcp::FileMonitor::Builder& builder) const noexcept { + builder.setFd(fd); + builder.setType(type()); + serialize_type(builder); +} } diff --git a/src/FileMonitor.h b/src/FileMonitor.h index a0ed3dfbdba..81789ebec86 100644 --- a/src/FileMonitor.h +++ b/src/FileMonitor.h @@ -13,6 +13,7 @@ class Task; #include "preload/preload_interface.h" #include "util.h" +#include "rr_pcp.capnp.h" namespace rr { @@ -43,7 +44,7 @@ class FileMonitor { PidFd, }; - virtual Type type() { return Base; } + virtual Type type() const { return Base; } /** * Overriding this to return true will cause close() (and related fd-smashing @@ -129,6 +130,13 @@ class FileMonitor { virtual enum syscallbuf_fd_classes get_syscallbuf_class() { return FD_CLASS_TRACED; } + + /** Serialize this file monitor for persistent checkpoints. 
*/ + void serialize(int fd, pcp::FileMonitor::Builder& builder) const noexcept; + +private: + // default serialize_type does nothing + virtual void serialize_type(pcp::FileMonitor::Builder&) const noexcept {} }; std::string file_monitor_type_name(FileMonitor::Type t); diff --git a/src/GdbServer.cc b/src/GdbServer.cc index 261c3523d35..c5d1356fac7 100644 --- a/src/GdbServer.cc +++ b/src/GdbServer.cc @@ -18,9 +18,11 @@ #include #include "BreakpointCondition.h" +#include "CheckpointInfo.h" #include "ElfReader.h" #include "Event.h" #include "DebuggerExtensionCommandHandler.h" +#include "GdbServerConnection.h" #include "GdbServerExpression.h" #include "ReplaySession.h" #include "ReplayTask.h" @@ -50,10 +52,6 @@ GdbServer::ConnectionFlags::ConnectionFlags() serve_files(false), debugger_params_write_pipe(nullptr) {} -static ExtendedTaskId extended_task_id(Task* t) { - return ExtendedTaskId(t->thread_group()->tguid(), t->tuid()); -} - GdbServer::GdbServer(std::unique_ptr& connection, Task* t, ReplayTimeline* timeline, const Target& target) @@ -1764,8 +1762,33 @@ void GdbServer::restart_session(const GdbRequest& req) { if (checkpoint_to_restore.mark) { timeline_->seek_to_mark(checkpoint_to_restore.mark); - last_query_task = last_continue_task = - checkpoint_to_restore.last_continue_task; + const auto at_followed_process = [&](const auto& target) { + return timeline()->current_session().current_task()->tgid() == target.pid; + }; + if (at_followed_process(target)) { + // normal checkpoint restart branch, because checkpoint was created via + // GDB. last_continue_tuid is therefore serialized, so we can set it from + // that. 
+ DEBUG_ASSERT(timeline()->current_session().current_task()->tuid() == + checkpoint_to_restore.last_continue_task.tuid); + last_query_task = last_continue_task = + checkpoint_to_restore.last_continue_task; + } else { + // Persistent checkpoints might have been created during another process' + // execution which GDB is not "following" thus, we need to tell + // ReplayTimeline to play until it reaches |Target.pid|. + while (!at_followed_process(target)) { + ReplayResult result = timeline()->replay_step_forward(RUN_CONTINUE); + // We should never reach the end of the trace without hitting the stop + // condition below. + DEBUG_ASSERT(result.status != REPLAY_EXITED); + } + auto t = timeline()->current_session().current_task(); + ASSERT(t, t != nullptr) + << "Could not find current task at checkpoint restore"; + last_query_task = last_continue_task = extended_task_id(t); + } + if (debugger_restart_checkpoint.is_explicit == Checkpoint::EXPLICIT) { timeline_->remove_explicit_checkpoint(debugger_restart_checkpoint.mark); } @@ -2116,6 +2139,46 @@ static void remove_trailing_guard_pages(ReplaySession::MemoryRanges& ranges) { } } +bool GdbServer::persistent_checkpoint_is_loaded(size_t unique_id) { + for (const auto& cp : checkpoints) { + if (cp.second.unique_id == unique_id) + return true; + } + return false; +} + +Checkpoint::Checkpoint(ReplayTimeline& timeline, + ExtendedTaskId last_continue_task, Explicit e, + const std::string& where) + : last_continue_task(last_continue_task), is_explicit(e), where(where) { + if (e == EXPLICIT) { + mark = timeline.add_explicit_checkpoint(); + } else { + mark = timeline.mark(); + const auto prior = timeline.find_closest_mark_with_clone(mark); + if (prior) { + prior->get_internal()->inc_refcount(); + } + } +} + +// Used when deserializing persistent checkpoints +Checkpoint::Checkpoint(ReplayTimeline& timeline, const CheckpointInfo& cp, + ReplaySession::shr_ptr session) + : last_continue_task(cp.last_continue_task), + 
/**
 * A debugger-visible checkpoint: a timeline mark plus the bookkeeping needed
 * to restart from it.
 */
struct Checkpoint {
  // Whether the checkpoint was registered with the timeline as an explicit
  // checkpoint or is only a plain mark.
  enum Explicit { EXPLICIT, NOT_EXPLICIT };
  // Creates a checkpoint at the timeline's current position.
  Checkpoint(ReplayTimeline& timeline, ExtendedTaskId last_continue_task,
             Explicit e, const std::string& where);
  // Default-constructed checkpoints are non-explicit and non-persistent.
  Checkpoint() : is_explicit(NOT_EXPLICIT) {}
  // Used when creating deserialized checkpoints
  Checkpoint(ReplayTimeline& timeline, const CheckpointInfo& cp,
             ReplaySession::shr_ptr session);

  // True once the checkpoint has been given a (non-zero) persistent id.
  bool persistent() const { return unique_id != 0; }

  // Position in the replay timeline this checkpoint refers to.
  ReplayTimeline::Mark mark;
  // Task the debugger was continuing when the checkpoint was taken.
  ExtendedTaskId last_continue_task;
  Explicit is_explicit;
  // Description of the checkpoint's location, shown to the user.
  std::string where;
  // Only persistent checkpoints have unique id's.
  size_t unique_id = 0;
};
Checkpoint debugger_restart_checkpoint; diff --git a/src/GdbServerConnection.cc b/src/GdbServerConnection.cc index 8668cad9ebb..dc69bc36933 100644 --- a/src/GdbServerConnection.cc +++ b/src/GdbServerConnection.cc @@ -59,8 +59,13 @@ static bool request_needs_immediate_response(const GdbRequest* req) { } #endif +ExtendedTaskId extended_task_id(Task* t) { + return ExtendedTaskId(t->thread_group()->tguid(), t->tuid()); +} + GdbServerConnection::GdbServerConnection(ThreadGroupUid tguid, - DebuggerType debugger_type, const Features& features) + DebuggerType debugger_type, + const Features& features) : tguid(tguid), cpu_features_(0), debugger_type(debugger_type), diff --git a/src/GdbServerConnection.h b/src/GdbServerConnection.h index 6496f33798a..5459bae16de 100644 --- a/src/GdbServerConnection.h +++ b/src/GdbServerConnection.h @@ -73,6 +73,8 @@ struct ExtendedTaskId { } }; +ExtendedTaskId extended_task_id(Task* t); + inline std::ostream& operator<<(std::ostream& o, const ExtendedTaskId& t) { o << t.tguid.tid() << "." 
<< t.tuid.tid(); return o; diff --git a/src/MagicSaveDataMonitor.h b/src/MagicSaveDataMonitor.h index 6ac2c1bf2c6..1fd99c37192 100644 --- a/src/MagicSaveDataMonitor.h +++ b/src/MagicSaveDataMonitor.h @@ -14,7 +14,7 @@ class MagicSaveDataMonitor : public FileMonitor { public: MagicSaveDataMonitor() {} - virtual Type type() override { return MagicSaveData; } + virtual Type type() const override { return MagicSaveData; } virtual void did_write(Task* t, const std::vector& ranges, LazyOffset& offset) override; diff --git a/src/MmappedFileMonitor.cc b/src/MmappedFileMonitor.cc index 5a28a4739b4..772e2e70203 100644 --- a/src/MmappedFileMonitor.cc +++ b/src/MmappedFileMonitor.cc @@ -27,6 +27,10 @@ MmappedFileMonitor::MmappedFileMonitor(Task* t, EmuFile::shr_ptr f) { inode_ = f->inode(); } +MmappedFileMonitor::MmappedFileMonitor(bool dead, dev_t device, + ino_t inode) noexcept + : dead_(dead), device_(device), inode_(inode) {} + void MmappedFileMonitor::did_write(Task* t, const std::vector& ranges, LazyOffset& offset) { // If there are no remaining mappings that we care about, those can't reappear @@ -108,4 +112,11 @@ void MmappedFileMonitor::did_write(Task* t, const std::vector& ranges, } } +void MmappedFileMonitor::serialize_type(pcp::FileMonitor::Builder& builder) const noexcept { + auto mmap = builder.initMmap(); + mmap.setDead(dead_); + mmap.setDevice(device_); + mmap.setInode(inode_); +} + } // namespace rr diff --git a/src/MmappedFileMonitor.h b/src/MmappedFileMonitor.h index 3f19f004407..41d3d05b4e0 100644 --- a/src/MmappedFileMonitor.h +++ b/src/MmappedFileMonitor.h @@ -18,8 +18,9 @@ class MmappedFileMonitor : public FileMonitor { public: MmappedFileMonitor(Task* t, int fd); MmappedFileMonitor(Task* t, EmuFile::shr_ptr f); + MmappedFileMonitor(bool dead, dev_t device, ino_t inode) noexcept; - virtual Type type() override { return Mmapped; } + virtual Type type() const override { return Mmapped; } void revive() { dead_ = false; } // If this write could potentially 
affect memory we need to PREVENT_SWITCH, // since the timing of the write is otherwise unpredictable from our @@ -35,6 +36,8 @@ class MmappedFileMonitor : public FileMonitor { LazyOffset& offset) override; private: + void serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept override; // Whether this monitor is still actively monitoring bool dead_; dev_t device_; diff --git a/src/NonvirtualPerfCounterMonitor.h b/src/NonvirtualPerfCounterMonitor.h index 92282e4329d..8f0adb30a8d 100644 --- a/src/NonvirtualPerfCounterMonitor.h +++ b/src/NonvirtualPerfCounterMonitor.h @@ -15,7 +15,7 @@ class NonvirtualPerfCounterMonitor : public FileMonitor { public: NonvirtualPerfCounterMonitor() {} - virtual Type type() override { return NonvirtualPerfCounter; } + virtual Type type() const override { return NonvirtualPerfCounter; } }; } // namespace rr diff --git a/src/ODirectFileMonitor.h b/src/ODirectFileMonitor.h index 2532e02f81e..be53d780070 100644 --- a/src/ODirectFileMonitor.h +++ b/src/ODirectFileMonitor.h @@ -17,7 +17,7 @@ class ODirectFileMonitor : public FileMonitor { public: ODirectFileMonitor() : FileMonitor() {}; - virtual Type type() override { return ODirect; } + virtual Type type() const override { return ODirect; } }; } // namespace rr diff --git a/src/PersistentCheckpointing.cc b/src/PersistentCheckpointing.cc new file mode 100644 index 00000000000..9c44984613a --- /dev/null +++ b/src/PersistentCheckpointing.cc @@ -0,0 +1,510 @@ +#include "PersistentCheckpointing.h" +#include "AutoRemoteSyscalls.h" +#include "BpfMapMonitor.h" +#include "CheckpointInfo.h" +#include "EmuFs.h" +#include "FileMonitor.h" +#include "MagicSaveDataMonitor.h" +#include "MmappedFileMonitor.h" +#include "NonvirtualPerfCounterMonitor.h" +#include "ODirectFileMonitor.h" +#include "PidFdMonitor.h" +#include "PreserveFileMonitor.h" +#include "ProcFdDirMonitor.h" +#include "ProcMemMonitor.h" +#include "ProcStatMonitor.h" +#include "RRPageMonitor.h" +#include "ReplayTask.h" +#include 
// Bit tests on a 64-bit /proc/<pid>/pagemap entry. The argument and the
// whole expansion are parenthesized so the macros compose safely with other
// operators (e.g. `!PAGE_PRESENT(e)`), and the mask is built from a 64-bit
// literal so the shift is well-defined where `long` is 32 bits wide.
#define PAGE_PRESENT(page_map_entry) (((page_map_entry) & (1ull << 63)) != 0)
#define PAGE_SWAPPED(page_map_entry) (((page_map_entry) & (1ull << 62)) != 0)
#define PAGE_FILE_OR_SHARED_ANON(page_map_entry)                              \
  (((page_map_entry) & (1ull << 61)) != 0)
// Starts a FATAL() message for a failed operation on |file|, a smart pointer
// to a NUL-terminated path. Callers append details with `<<`.
#define FILE_OP_FATAL(file)                                                    \
  FATAL() << "write_map failed for " << std::string{ (file).get() } << " "
constexpr auto PRIVATE_ANON = MAP_ANONYMOUS | MAP_PRIVATE;

/**
 * Return the last path component of |path|.
 *
 * A |path| without any '/' (including the empty string) is returned
 * unchanged. A |path| ending in '/' yields an empty string.
 */
static std::string file_name_of(const std::string& path) {
  const auto pos = path.rfind('/');
  // No separator: |path| is already a bare file name (or empty).
  if (pos == std::string::npos) {
    return path;
  }
  return path.substr(pos + 1);
}
Couldn't open " << proc_pagemap_fd; + buffer = { .ptr = + (uint8_t*)::mmap(nullptr, buffer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), + .size = buffer_size }; + ASSERT(clone_leader, buffer.ptr != MAP_FAILED) + << "Failed to mmap buffer with capacity " << buffer_size; +} + +ssize_t WriteVmConfig::pread(ssize_t bytes_read, + const KernelMapping& km) const { + DEBUG_ASSERT(bytes_read != 1 && + "you've passed in the 'invalid pread result' as bytes_read"); + const auto current_read = + ::pread(proc_mem_fd, buffer.ptr + bytes_read, km.size() - bytes_read, + km.start().as_int() + bytes_read); + if (current_read == -1) + return current_read; + return bytes_read + current_read; +} + +std::string checkpoints_index_file(const std::string& trace_dir) { + return trace_dir + "/checkpoints"; +} + +static void write_map(const WriteVmConfig& cfg, + rr::pcp::KernelMapping::Builder builder, + const AddressSpace::Mapping& map) { + LOG(debug) << "serializing " << map.map.str(); + builder.setStart(map.map.start().as_int()); + builder.setEnd(map.map.end().as_int()); + builder.setFsname(str_to_data(map.recorded_map.fsname())); + builder.setDevice(map.map.device()); + builder.setInode(map.recorded_map.inode()); + builder.setProtection(map.map.prot()); + builder.setFlags(map.map.flags()); + // This will be interpreted as 0 on restore, since we create files for + // individual mappings. 
+ builder.setOffset(map.map.file_offset_bytes()); + + std::vector pagemap_entries{}; + + const auto page_count = map.map.size() / page_size(); + pagemap_entries.resize(page_count); + + const auto read_idx_start = (map.map.start().as_int() / page_size()) * 8; + DEBUG_ASSERT(read_idx_start % 8 == 0); + + // walk the page map entries for mapping and determine on how we represent (or + // not represent) it's data in the capnproto file + auto entries_read_sz = ::pread(cfg.proc_pagemap_fd, pagemap_entries.data(), + page_count * sizeof(uint64_t), read_idx_start); + if (entries_read_sz == -1) + FATAL() << "Failed to read page map"; + auto pages_present = 0; + bool all_not_file_or_shared = true; + for (auto pme : pagemap_entries) { + if (PAGE_PRESENT(pme)) + pages_present++; + // probably don't have to check _all_ of the mappings for this, since we + // know the entire segment up front. + if (PAGE_FILE_OR_SHARED_ANON(pme)) + all_not_file_or_shared = false; + } + + // "guard segment": untouched, uninitialized memory, we don't write it's + // contents + if ((map.map.flags() & PRIVATE_ANON) == PRIVATE_ANON && pages_present == 0 && + map.map.prot() == PROT_NONE && all_not_file_or_shared) { + builder.initMapType().setGuardSegment(); + } else { + auto map_type = builder.initMapType(); + + const auto pid = cfg.clone_leader->tid; + const auto fname = file_name_of(map.map.fsname()); + // XXX when/if RR moves to c++20, use std::format. 
+ const auto len = std::snprintf( + nullptr, 0, "%s/%d-%s-%p-%p", cfg.cp_data_dir, pid, fname.c_str(), + (void*)map.map.start().as_int(), (void*)map.map.end().as_int()); + auto file = std::make_unique(len + 1); + if (map.map.fsname().empty()) { + std::snprintf(file.get(), len, "%s/%d-%p-%p", cfg.cp_data_dir, pid, + (void*)map.map.start().as_int(), + (void*)map.map.end().as_int()); + } else { + std::snprintf(file.get(), len, "%s/%d-%s-%p-%p", cfg.cp_data_dir, pid, + fname.c_str(), (void*)map.map.start().as_int(), + (void*)map.map.end().as_int()); + } + ScopedFd f{ file.get(), O_EXCL | O_CREAT | O_RDWR, 0777 }; + if (!f.is_open()) + FILE_OP_FATAL(file) << "Couldn't open file"; + + const auto sz = ::ftruncate(f, map.map.size()); + if (sz == -1) + FILE_OP_FATAL(file) << "couldn't truncate file to size " + << map.map.size(); + + auto bytes_read = 0ull; + while (static_cast(bytes_read) < map.map.size()) { + const auto current_read = cfg.pread(bytes_read, map.map); + if (current_read == -1) + FILE_OP_FATAL(file) << " couldn't read contents of " << map.map.str(); + bytes_read = current_read; + } + + ASSERT(cfg.clone_leader, + static_cast(bytes_read) == map.map.size()) + << " data read from /proc/" << cfg.clone_leader->tid + << "/mem did not match kernel mapping metadata" + << " read " << bytes_read << " expected: " << map.map.size() << " of " + << map.map.str(); + + const auto written_bytes = ::write(f, cfg.buffer.ptr, map.map.size()); + if (written_bytes == -1) + FILE_OP_FATAL(file) << " couldn't write contents of " << map.map.str(); + + const std::string data_fname{ file.get() }; + const auto contents_path = str_to_data(data_fname); + if (map.flags == AddressSpace::Mapping::IS_RR_PAGE || + map.flags == AddressSpace::Mapping::IS_THREAD_LOCALS) { + map_type.initRrPage().setContentsPath(contents_path); + } else if (map.flags == AddressSpace::Mapping::IS_SYSCALLBUF) { + map_type.initSyscallBuffer().setContentsPath(contents_path); + } else if (map.emu_file) { + // XXX 
simon(optimization): we should not need to write to shared + // memory multiple times (once for each leader - just once?). + auto shared_anon = map_type.initSharedAnon(); + const auto isSysVSegment = + cfg.clone_leader->vm()->has_shm_at(map.map) || + cfg.clone_leader->vm()->has_shm_at(map.recorded_map); + shared_anon.setContentsPath(contents_path); + shared_anon.setIsSysVSegment(isSysVSegment); + } else { + if (map.map.fsname().empty() || map.map.is_stack() || map.map.is_heap()) { + map_type.initPrivateAnon().setContentsPath(contents_path); + } else { + map_type.initFile().setContentsPath(contents_path); + } + } + } +} + +void write_vm(Task* clone_leader, rr::pcp::ProcessSpace::Builder builder, + const std::string& checkpoint_data_dir) { + LOG(debug) << "writing VM for " << clone_leader->rec_tid << " to " + << checkpoint_data_dir; + if (::mkdir(checkpoint_data_dir.c_str(), 0700) != 0) { + LOG(info) << " directory " << checkpoint_data_dir << " already exists."; + } + + std::vector mappings; + auto copy_buffer_size = 0ul; + // any stack mapping will do. It has to be mapped first, mimicking + // `process_execve` at restore + const AddressSpace::Mapping* stack_mapping = nullptr; + for (const auto& m : clone_leader->vm()->maps()) { + // linux has exclusive control over this mapping. 
+ if (m.map.is_vsyscall()) { + continue; + } + if (m.recorded_map.is_stack() && stack_mapping == nullptr) { + stack_mapping = &m; + } else { + mappings.push_back(&m); + } + // largest mapping in the vm; use that as buffer size + copy_buffer_size = std::max(copy_buffer_size, m.map.size()); + } + + ASSERT(clone_leader, !mappings.empty()) << "No mappings found to serialize"; + copy_buffer_size = ceil_page_size(copy_buffer_size); + WriteVmConfig cfg{ clone_leader, checkpoint_data_dir.c_str(), + copy_buffer_size }; + + auto kernel_mappings = builder.initVirtualAddressSpace(mappings.size() + 1); + builder.setBreakpointFaultAddress( + clone_leader->vm()->do_breakpoint_fault_addr().register_value()); + auto idx = 0; + // write the/a stack mapping first. We're mimicking process_execve, therefore + // we need a stack segment first + + write_map(cfg, kernel_mappings[idx++], *stack_mapping); + for (auto m : mappings) { + write_map(cfg, kernel_mappings[idx++], *m); + } +} + +// reads serialized map contents from |path|, mmaps a read buffer in the +// supervisor, then write its contents to mappping |km| in ReplayTask |t|. +void restore_map_contents(ReplayTask* t, const std::string& path, + const KernelMapping& km) { + LOG(debug) << "restoring contents of " << km << " from " << path + << " for task " << t->rec_tid; + auto fd = ScopedFd(path.c_str(), O_RDONLY); + ASSERT(t, fd.is_open()) << "Failed to open mapping contents file for " + << km.str() << " at " << path; + + auto addr = ::mmap(nullptr, km.size(), PROT_READ, MAP_PRIVATE, fd, 0); + ASSERT(t, addr != MAP_FAILED) + << "Could not load mapping contents of " << km.str() << " from " << path; + + bool write_ok = true; + auto bytes_written = t->write_bytes_helper_no_notifications( + km.start(), km.size(), addr, &write_ok); + ASSERT(t, write_ok) << "Failed to restore contents of mapping from file for " + << km.str(); + ASSERT(t, static_cast(bytes_written) == km.size()) + << "Failed to restore contents of mapping from file. 
Wrote " + << bytes_written << "; expected " << km.size(); + if (::munmap(addr, km.size()) == -1) { + FATAL() << "munmap failed for temporary buffer"; + } +} + +void map_region_file(AutoRemoteSyscalls& remote, const KernelMapping& km, + const std::string& file_path) { + struct stat real_file; + std::string real_file_name; + LOG(debug) << "directly mmap'ing " << km.size() << " bytes of " << file_path + << " at offset " << HEX(km.file_offset_bytes()) << "(" << km.str() + << ")"; + remote.finish_direct_mmap(km.start(), km.size(), km.prot(), + ((km.flags() & ~MAP_GROWSDOWN) | MAP_PRIVATE), + file_path.c_str(), O_RDONLY, 0, real_file, + real_file_name); + remote.task()->vm()->map(remote.task(), km.start(), km.size(), km.prot(), + km.flags(), km.file_offset_bytes(), km.fsname(), + km.device(), km.inode(), nullptr, &km); +} + +void map_private_anonymous(AutoRemoteSyscalls& remote, + const KernelMapping& km) { + LOG(debug) << "map region no file: " << km.str(); + remote.infallible_mmap_syscall_if_alive( + km.start(), km.size(), km.prot(), + (km.flags() & ~MAP_GROWSDOWN) | MAP_FIXED | MAP_ANONYMOUS, -1, 0); + remote.task()->vm()->map(remote.task(), km.start(), km.size(), km.prot(), + km.flags(), km.file_offset_bytes(), km.fsname(), + km.device(), km.inode(), nullptr, &km); +} + +Task::CapturedState reconstitute_captured_state( + SupportedArch arch, const std::vector& cpuid_records, + pcp::CapturedState::Reader reader) { + Task::CapturedState res; + res.ticks = reader.getTicks(); + auto register_raw = reader.getRegs().getRaw(); + res.regs = Registers{ arch }; + res.regs.restore_from_persistent_checkpoint(arch, register_raw.begin(), + register_raw.size()); + + auto raw = reader.getExtraRegs().getRaw(); + set_extra_regs_from_raw(arch, cpuid_records, raw, res.extra_regs); + + res.prname = data_to_str(reader.getPrname()); + res.fdtable_identity = reader.getFdtableIdentity(); + res.syscallbuf_child = reader.getSyscallbufChild(); + res.syscallbuf_size = reader.getSyscallbufSize(); 
+ res.num_syscallbuf_bytes = reader.getNumSyscallbufBytes(); + res.preload_globals = reader.getPreloadGlobals(); + res.scratch_ptr = reader.getScratchPtr(); + res.scratch_size = reader.getScratchSize(); + res.top_of_stack = reader.getTopOfStack(); + auto rs = reader.getRseqState(); + res.rseq_state = std::make_unique(remote_ptr(rs.getPtr()), + rs.getAbortPrefixSignature()); + res.cloned_file_data_offset = reader.getClonedFileDataOffset(); + memcpy(res.thread_locals, reader.getThreadLocals().asBytes().begin(), + PRELOAD_THREAD_LOCALS_SIZE); + + res.rec_tid = reader.getRecTid(); + res.own_namespace_rec_tid = reader.getOwnNamespaceRecTid(); + res.serial = reader.getSerial(); + res.tguid = ThreadGroupUid{ reader.getTguid().getTid(), + reader.getTguid().getSerial() }; + res.desched_fd_child = reader.getDeschedFdChild(); + res.cloned_file_data_fd_child = reader.getClonedFileDataFdChild(); + res.cloned_file_data_fname = data_to_str(reader.getClonedFileDataFname()); + res.wait_status = WaitStatus{ reader.getWaitStatus() }; + res.tls_register = reader.getTlsRegister(); + + res.thread_areas = {}; + for (const auto& ta : reader.getThreadAreas()) { + const X86Arch::user_desc item = *(X86Arch::user_desc*)ta.begin(); + res.thread_areas.push_back(item); + } + + return res; +} + +void init_scratch_memory(ReplayTask* t, const KernelMapping& km) { + + t->scratch_ptr = km.start(); + t->scratch_size = km.size(); + size_t sz = t->scratch_size; + + ASSERT(t, (km.prot() & (PROT_READ | PROT_WRITE)) == (PROT_READ | PROT_WRITE)); + ASSERT(t, (km.flags() & (MAP_PRIVATE | MAP_ANONYMOUS)) == + (MAP_PRIVATE | MAP_ANONYMOUS)); + + { + AutoRemoteSyscalls remote(t); + remote.infallible_mmap_syscall_if_alive(t->scratch_ptr, sz, km.prot(), + km.flags() | MAP_FIXED, -1, 0); + t->vm()->map(t, t->scratch_ptr, sz, km.prot(), km.flags(), 0, std::string(), + KernelMapping::NO_DEVICE, KernelMapping::NO_INODE, nullptr, + &km); + } +} + +kj::Array prepare_user_desc(const X86Arch::user_desc& desc) { + 
kj::Array data = + kj::heapArray(sizeof(X86Arch::user_desc)); + memcpy(data.begin(), &desc, sizeof(X86Arch::user_desc)); // Copy raw bytes + return data; +} + +void write_capture_state(pcp::CapturedState::Builder& sb, + const Task::CapturedState& state) { + sb.setTicks(state.ticks); + sb.initRegs().setRaw(regs_to_raw(state.regs)); + sb.initExtraRegs().setRaw(extra_regs_to_raw(state.extra_regs)); + sb.setPrname(str_to_data(state.prname)); + sb.setFdtableIdentity(state.fdtable_identity); + sb.setSyscallbufChild(state.syscallbuf_child.as_int()); + sb.setSyscallbufSize(state.syscallbuf_size); + sb.setNumSyscallbufBytes(state.num_syscallbuf_bytes); + sb.setPreloadGlobals(state.preload_globals.as_int()); + sb.setScratchPtr(state.scratch_ptr.as_int()); + sb.setScratchSize(state.scratch_size); + sb.setTopOfStack(state.top_of_stack.as_int()); + auto rseq = sb.initRseqState(); + if (state.rseq_state) { + rseq.setPtr(state.rseq_state->ptr.as_int()); + rseq.setAbortPrefixSignature(state.rseq_state->abort_prefix_signature); + } else { + rseq.setPtr(0); + rseq.setAbortPrefixSignature(0); + } + + sb.setClonedFileDataOffset(state.cloned_file_data_offset); + auto tl = kj::ArrayPtr( + reinterpret_cast(state.thread_locals), 104); + sb.setThreadLocals(tl); + sb.setRecTid(state.rec_tid); + sb.setOwnNamespaceRecTid(state.own_namespace_rec_tid); + sb.setSerial(state.serial); + auto tguid = sb.initTguid(); + tguid.setTid(state.tguid.tid()); + tguid.setSerial(state.tguid.serial()); + sb.setDeschedFdChild(state.desched_fd_child); + sb.setClonedFileDataFdChild(state.cloned_file_data_fd_child); + sb.setClonedFileDataFname(str_to_data(state.cloned_file_data_fname)); + sb.setWaitStatus(state.wait_status.get()); + sb.setTlsRegister(state.tls_register); + auto thread_areas = sb.initThreadAreas(state.thread_areas.size()); + auto i = 0; + for (const auto& ta : state.thread_areas) { + thread_areas.set( + i++, kj::ArrayPtr( + reinterpret_cast(&ta), sizeof(ta))); + } +} + +void deserialize_fdtable( + 
Task* leader, const rr::pcp::ProcessSpace::Reader& clone_leader_reader) { + auto table = leader->fd_table(); + auto monitors = clone_leader_reader.getMonitors(); + for (auto m : monitors) { + FileMonitor::Type t = (FileMonitor::Type)m.getType(); + auto fd = m.getFd(); + if (!table->is_monitoring(m.getFd())) { + switch (t) { + case FileMonitor::Base: + FATAL() << "Can't add abstract type"; + break; + case FileMonitor::MagicSaveData: + table->add_monitor(leader, fd, new MagicSaveDataMonitor()); + break; + case FileMonitor::Mmapped: { + const auto mmap = m.getMmap(); + table->add_monitor(leader, fd, + new MmappedFileMonitor(mmap.getDead(), + mmap.getDevice(), + mmap.getInode())); + } break; + case FileMonitor::Preserve: + table->add_monitor(leader, fd, new PreserveFileMonitor()); + break; + case FileMonitor::ProcFd: { + const auto p_fd = m.getProcFd(); + const auto tuid = TaskUid(p_fd.getTid(), p_fd.getSerial()); + table->add_monitor(leader, fd, new ProcFdDirMonitor(tuid)); + break; + } + case FileMonitor::ProcMem: { + const auto pmem = m.getProcMem(); + table->add_monitor( + leader, fd, + new ProcMemMonitor(AddressSpaceUid( + pmem.getTid(), pmem.getSerial(), pmem.getExecCount()))); + break; + } + case FileMonitor::Stdio: + table->add_monitor(leader, fd, new StdioMonitor(m.getStdio())); + break; + case FileMonitor::VirtualPerfCounter: + FATAL() << "VirtualPerCounter Monitor deserializing unimplemented!\n"; + break; + case FileMonitor::NonvirtualPerfCounter: + table->add_monitor(leader, fd, new NonvirtualPerfCounterMonitor()); + break; + case FileMonitor::SysCpu: + table->add_monitor(leader, fd, new SysCpuMonitor(leader, "")); + break; + case FileMonitor::ProcStat: + table->add_monitor( + leader, fd, + new ProcStatMonitor(leader, data_to_str(m.getProcStat()))); + break; + case FileMonitor::RRPage: + table->add_monitor(leader, fd, new RRPageMonitor()); + break; + case FileMonitor::ODirect: + table->add_monitor(leader, fd, new ODirectFileMonitor()); + break; + case 
FileMonitor::BpfMap: + table->add_monitor(leader, fd, + new BpfMapMonitor(m.getBpf().getKeySize(), + m.getBpf().getValueSize())); + break; + case FileMonitor::PidFd: + FATAL() << "PidFd not supported to be serialized yet"; + break; + } + } + } +} + +} // namespace rr \ No newline at end of file diff --git a/src/PersistentCheckpointing.h b/src/PersistentCheckpointing.h new file mode 100644 index 00000000000..f927c525457 --- /dev/null +++ b/src/PersistentCheckpointing.h @@ -0,0 +1,88 @@ +#pragma once + +#include "AddressSpace.h" +#include "CheckpointInfo.h" +#include "kernel_abi.h" +#include "log.h" +#include "rr_pcp.capnp.h" +#include +#include +#include +#include +namespace rr { + +using FrameTime = int64_t; + +// Persistent checkpointing related utilities + +/** Passed from write_vm to each write_map call. Configures buffer for copying + * mappings into as well as opening relevant proc fs files */ +class WriteVmConfig { +public: + WriteVmConfig(Task* clone_leader, const char* data_dir, + size_t buffer_size) noexcept; + ~WriteVmConfig() { ::munmap(buffer.ptr, buffer.size); } + + Task* clone_leader; + ScopedFd proc_mem_fd; + ScopedFd proc_pagemap_fd; + const char* cp_data_dir; + + struct { + uint8_t* ptr; + size_t size; + } buffer; + + ssize_t pread(ssize_t bytes_read, const KernelMapping& km) const; +}; + +/* Writes capture `state` to state builder `sb`. */ +void write_capture_state(pcp::CapturedState::Builder& sb, + const Task::CapturedState& state); + +/** + * Writes the VM of |clone_leader| using the Capnproto |builder|. Checkpoint + * specific data, like the serialized segments are stored in + * |checkpoint_data_dir| + */ +void write_vm(Task* clone_leader, rr::pcp::ProcessSpace::Builder builder, + const std::string& checkpoint_data_dir); + +/** + * Write file |monitor| information to capnproto |builder| + */ +void write_monitor(rr::pcp::FileMonitor::Builder& builder, int fd, + FileMonitor* monitor); + +/** + * Restores Task::CapturedState from capnproto data. 
+ */ +Task::CapturedState reconstitute_captured_state( + SupportedArch arch, const std::vector& cpuid_records, + pcp::CapturedState::Reader reader); + +void map_private_anonymous(AutoRemoteSyscalls& remote, const KernelMapping& km); + +/** + * Restores contents of `km` by copying contents from a file at `path` into it. + */ +void restore_map_contents(ReplayTask* t, const std::string& path, + const KernelMapping& km); + +/** + * Maps a file-backed (read only) segment in `remote.task()`. + */ +void map_region_file(AutoRemoteSyscalls& remote, const KernelMapping& km, + const std::string& file_path); + +// XXX re-factor this from `replay_syscall.cc` so that we don't duplicate code +// like this. It's identical, but without assertion. Need input from maintainers +// on where to put this. +void init_scratch_memory(ReplayTask* t, const KernelMapping& km); + +using CapturedMemory = + std::vector, std::vector>>; + +void deserialize_fdtable(Task* t, const rr::pcp::ProcessSpace::Reader& reader); + +} // namespace rr \ No newline at end of file diff --git a/src/PidFdMonitor.h b/src/PidFdMonitor.h index 20c2019220e..794dad55898 100644 --- a/src/PidFdMonitor.h +++ b/src/PidFdMonitor.h @@ -21,7 +21,7 @@ class PidFdMonitor : public FileMonitor { PidFdMonitor(TaskUid tuid) : tuid(tuid) {} - virtual Type type() override { return PidFd; } + virtual Type type() const override { return PidFd; } static PidFdMonitor* get(FdTable* fd_table, int fd); diff --git a/src/PreserveFileMonitor.h b/src/PreserveFileMonitor.h index f3b01da13ca..84bd16cd333 100644 --- a/src/PreserveFileMonitor.h +++ b/src/PreserveFileMonitor.h @@ -20,7 +20,7 @@ namespace rr { class PreserveFileMonitor : public FileMonitor { public: PreserveFileMonitor() {} - virtual Type type() override { return Preserve; } + virtual Type type() const override { return Preserve; } virtual bool is_rr_fd() override { return true; } }; diff --git a/src/ProcFdDirMonitor.cc b/src/ProcFdDirMonitor.cc index 02b847203ec..c189c0618c0 100644 
--- a/src/ProcFdDirMonitor.cc +++ b/src/ProcFdDirMonitor.cc @@ -26,6 +26,8 @@ ProcFdDirMonitor::ProcFdDirMonitor(Task* t, const string& pathname) { } } +ProcFdDirMonitor::ProcFdDirMonitor(TaskUid tuid) noexcept : tuid(tuid) {} + // returns the number of valid dirent structs left in the buffer template static int filter_dirent_structs(RecordTask* t, uint8_t* buf, size_t size) { @@ -124,4 +126,11 @@ void ProcFdDirMonitor::filter_getdents(RecordTask* t) { filter_dirents(t); } +void ProcFdDirMonitor::serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept { + auto pfd = builder.initProcFd(); + pfd.setTid(tuid.tid()); + pfd.setSerial(tuid.serial()); +} + } // namespace rr diff --git a/src/ProcFdDirMonitor.h b/src/ProcFdDirMonitor.h index 0757094c95d..29d620ecfc0 100644 --- a/src/ProcFdDirMonitor.h +++ b/src/ProcFdDirMonitor.h @@ -15,12 +15,15 @@ namespace rr { class ProcFdDirMonitor : public FileMonitor { public: ProcFdDirMonitor(Task* t, const std::string& pathname); + ProcFdDirMonitor(TaskUid tuid) noexcept; - virtual Type type() override { return ProcFd; } + virtual Type type() const override { return ProcFd; } virtual void filter_getdents(RecordTask* t) override; private: + void serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept override; // 0 if this doesn't object doesn't refer to a tracee's proc-mem. 
TaskUid tuid; }; diff --git a/src/ProcMemMonitor.cc b/src/ProcMemMonitor.cc index 27727e6c6a6..1a1153d86d5 100644 --- a/src/ProcMemMonitor.cc +++ b/src/ProcMemMonitor.cc @@ -8,6 +8,7 @@ #include "RecordSession.h" #include "ReplaySession.h" #include "ReplayTask.h" +#include "TaskishUid.h" #include "log.h" using namespace std; @@ -26,6 +27,8 @@ ProcMemMonitor::ProcMemMonitor(Task* t, const string& pathname) { } } +ProcMemMonitor::ProcMemMonitor(AddressSpaceUid auid) noexcept : auid(auid) {} + void ProcMemMonitor::did_write(Task* t, const std::vector& ranges, LazyOffset& lazy_offset) { if (ranges.empty()) { @@ -68,4 +71,12 @@ bool ProcMemMonitor::target_is_vm(AddressSpace *vm) { return auid == vm->uid(); } +void ProcMemMonitor::serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept { + auto pm = builder.initProcMem(); + pm.setExecCount(auid.exec_count()); + pm.setTid(auid.tid()); + pm.setSerial(auid.serial()); +} + } // namespace rr diff --git a/src/ProcMemMonitor.h b/src/ProcMemMonitor.h index e17b91571d9..8958d7a443c 100644 --- a/src/ProcMemMonitor.h +++ b/src/ProcMemMonitor.h @@ -15,8 +15,9 @@ namespace rr { class ProcMemMonitor : public FileMonitor { public: ProcMemMonitor(Task* t, const std::string& pathname); + ProcMemMonitor(AddressSpaceUid auid) noexcept; - virtual Type type() override { return ProcMem; } + virtual Type type() const override { return ProcMem; } // We need to PREVENT_SWITCH, since the timing of the write is otherwise // unpredictable from our perspective. @@ -32,6 +33,8 @@ class ProcMemMonitor : public FileMonitor { bool target_is_vm(AddressSpace *t); private: + void serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept override; // 0 if this doesn't object doesn't refer to a tracee's proc-mem. 
AddressSpaceUid auid; }; diff --git a/src/ProcStatMonitor.cc b/src/ProcStatMonitor.cc index 3aa4093fec7..64fb5af3f7a 100644 --- a/src/ProcStatMonitor.cc +++ b/src/ProcStatMonitor.cc @@ -61,4 +61,9 @@ bool ProcStatMonitor::emulate_read( return true; } +void ProcStatMonitor::serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept { + builder.setProcStat(str_to_data(data)); +} + } // namespace rr diff --git a/src/ProcStatMonitor.h b/src/ProcStatMonitor.h index 1ae97a0270e..849cb830c89 100644 --- a/src/ProcStatMonitor.h +++ b/src/ProcStatMonitor.h @@ -18,11 +18,14 @@ class ProcStatMonitor : public FileMonitor { public: ProcStatMonitor(Task* t, const std::string& pathname); - virtual Type type() override { return ProcStat; } + virtual Type type() const override { return ProcStat; } bool emulate_read(RecordTask* t, const std::vector& ranges, LazyOffset&, uint64_t* result) override; + private: + void serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept override; std::string data; }; diff --git a/src/RRPageMonitor.h b/src/RRPageMonitor.h index d2f7fc98502..74dc1008376 100644 --- a/src/RRPageMonitor.h +++ b/src/RRPageMonitor.h @@ -17,7 +17,7 @@ class RRPageMonitor : public FileMonitor { public: RRPageMonitor() : FileMonitor() {}; - virtual Type type() override { return RRPage; } + virtual Type type() const override { return RRPage; } }; static_assert(TraceReader::SpecialLibRRpage != 0, diff --git a/src/Registers.cc b/src/Registers.cc index 6843465002d..7f2481e8415 100644 --- a/src/Registers.cc +++ b/src/Registers.cc @@ -10,6 +10,7 @@ #include "ReplayTask.h" #include "core.h" +#include "kernel_abi.h" #include "log.h" using namespace std; @@ -625,6 +626,25 @@ void Registers::set_from_ptrace_for_arch(SupportedArch a, const void* data, memcpy(&u.x86regs, data, sizeof(u.x86regs)); } +void Registers::restore_from_persistent_checkpoint(SupportedArch arch, + const void* data, + size_t size) { + switch (arch) { + case x86: + 
DEBUG_ASSERT(sizeof(u.x86regs) == size); + memcpy(&u.x86regs, data, size); + break; + case x86_64: + DEBUG_ASSERT(sizeof(u.x64regs) == size); + memcpy(&u.x64regs, data, size); + break; + case aarch64: + DEBUG_ASSERT(sizeof(u.x64regs) == size); + memcpy(&u.arm64regs, data, size); + break; + } +} + void Registers::set_from_trace(SupportedArch a, const void* data, size_t size) { if (is_x86ish(a)) { diff --git a/src/Registers.h b/src/Registers.h index 2fd99cab4ca..64e38ce5cd6 100644 --- a/src/Registers.h +++ b/src/Registers.h @@ -124,6 +124,9 @@ class Registers { void set_from_trace(SupportedArch arch, const void* data, size_t size); + void restore_from_persistent_checkpoint(SupportedArch arch, const void* data, + size_t size); + #define ARCH_SWITCH_CASE(rettype, x86case, x64case, arm64case) \ (([=](void) -> rettype { \ switch (arch()) { \ diff --git a/src/ReplayCommand.cc b/src/ReplayCommand.cc index 1ee8d1b97eb..cb049c1f27e 100644 --- a/src/ReplayCommand.cc +++ b/src/ReplayCommand.cc @@ -7,14 +7,17 @@ #include #include +#include #include +#include "CheckpointInfo.h" #include "Command.h" #include "Flags.h" #include "GdbServer.h" #include "ReplaySession.h" #include "ScopedFd.h" #include "WaitManager.h" +#include "TraceTaskEvent.h" #include "core.h" #include "kernel_metadata.h" #include "launch_debugger.h" @@ -79,7 +82,8 @@ ReplayCommand ReplayCommand::singleton( " --serve-files Serve all files from the trace rather than\n" " assuming they exist on disk. Debugging will\n" " be slower, but be able to tolerate missing files\n" - " --tty Redirect tracee replay output to \n"); + " --tty Redirect tracee replay output to \n" + " --ignore-pcp Don't spawn session from persistent checkpoint\n"); struct ReplayFlags { // Start a debug server for the task scheduled at the first @@ -87,6 +91,9 @@ struct ReplayFlags { // been "created". 
FrameTime goto_event; + // Ignore persistent checkpoints and start session from beginning + bool ignore_persistent_cp; + FrameTime singlestep_to_event; pid_t target_process; @@ -146,6 +153,7 @@ struct ReplayFlags { ReplayFlags() : goto_event(0), + ignore_persistent_cp(false), singlestep_to_event(0), target_process(0), process_created_how(CREATED_NONE), @@ -189,7 +197,8 @@ static bool parse_replay_arg(vector& args, ReplayFlags& flags) { { 3, "serve-files", NO_PARAMETER }, { 4, "tty", HAS_PARAMETER }, { 5, "intel-pt-start-checking-event", HAS_PARAMETER }, - { 6, "retry-transient-errors", NO_PARAMETER } + { 6, "retry-transient-errors", NO_PARAMETER }, + { 7, "ignore-pcp", NO_PARAMETER }, }; ParsedOption opt; if (!Command::parse_option(args, options, &opt)) { @@ -295,12 +304,34 @@ static bool parse_replay_arg(vector& args, ReplayFlags& flags) { case 6: flags.retry_transient_errors = true; break; + case 7: + flags.ignore_persistent_cp = true; + break; + break; default: DEBUG_ASSERT(0 && "Unknown option"); } return true; } +/** + * Return event time when `pid` is created or first found in trace. 
+ */ +static FrameTime process_spawn_time(const string& trace_dir, pid_t pid) { + TraceReader trace(trace_dir); + FrameTime time = -1; + for (auto e = trace.read_task_event(&time); e.type() != TraceTaskEvent::NONE; + e = trace.read_task_event(&time)) { + if ((e.type() == TraceTaskEvent::CLONE || + e.type() == TraceTaskEvent::EXEC) && + e.tid() == pid) { + LOG(debug) << "Process " << pid << " created at " << time; + return time; + } + } + return -1; +} + static int find_pid_for_command(const string& trace_dir, const string& command) { TraceReader trace(trace_dir); @@ -512,6 +543,17 @@ static int replay(const string& trace_dir, const ReplayFlags& flags) { serve_replay_no_debugger(trace_dir, flags); } else { auto session = ReplaySession::create(trace_dir, session_flags(flags, false)); + if (!flags.ignore_persistent_cp) { + const auto cps = session->get_persistent_checkpoints(); + const auto cp = find_if(rbegin(cps), rend(cps), [&](const auto& cp) { + return target.event >= cp.event_time(); + }); + if (cp != rend(cps)) { + LOG(debug) << "Spawning from persistent checkpoint at " + << cp->event_time(); + session->load_checkpoint(*cp); + } + } GdbServer::ConnectionFlags conn_flags; conn_flags.dbg_port = flags.dbg_port; conn_flags.dbg_host = flags.dbg_host; @@ -544,6 +586,22 @@ static int replay(const string& trace_dir, const ReplayFlags& flags) { ScopedFd debugger_params_write_pipe(debugger_params_pipe[1]); auto session = ReplaySession::create(trace_dir, session_flags(flags, false)); + if (!flags.ignore_persistent_cp) { + const auto event_when_created = + process_spawn_time(trace_dir, flags.target_process); + const auto pcps = session->get_persistent_checkpoints(); + const auto cp = find_if(rbegin(pcps), rend(pcps), [&](const auto& cp) { + return target.event >= cp.event_time() || + event_when_created > cp.event_time(); + }); + + if (cp != rend(pcps)) { + LOG(debug) << "Spawning session from persistent checkpoint at " + << cp->event_time(); + 
session->load_checkpoint(*cp); + } + } + GdbServer::ConnectionFlags conn_flags; conn_flags.dbg_port = flags.dbg_port; conn_flags.dbg_host = flags.dbg_host; diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc index cb75a2c9842..f8bf795f947 100644 --- a/src/ReplaySession.cc +++ b/src/ReplaySession.cc @@ -27,6 +27,10 @@ #include "replay_syscall.h" #include "util.h" +#include "PersistentCheckpointing.h" +#include "PreserveFileMonitor.h" +#include + using namespace std; namespace rr { @@ -335,35 +339,12 @@ ReplaySession::shr_ptr ReplaySession::clone() { return session; } -/** - * Return true if it's possible/meaningful to make a checkpoint at the - * |frame| that |t| will replay. - */ -static bool can_checkpoint_at(const Event& ev) { - if (ev.has_ticks_slop()) { - return false; - } - switch (ev.type()) { - case EV_EXIT: - // At exits, we can't clone the exiting tasks, so - // don't event bother trying to checkpoint. - case EV_SYSCALLBUF_RESET: - // RESETs are usually inserted in between syscall - // entry/exit. Do not attempting to checkpoint at - // RESETs. Users would never want to do that anyway. - case EV_TRACE_TERMINATION: - // There's nothing to checkpoint at the end of a trace. 
- return false; - default: - return true; - } -} - bool ReplaySession::can_clone() { finish_initializing(); ReplayTask* t = current_task(); - return t && done_initial_exec() && can_checkpoint_at(current_trace_frame().event()); + return t && done_initial_exec() && + current_trace_frame().event().can_checkpoint_at(); } DiversionSession::shr_ptr ReplaySession::clone_diversion() { @@ -2264,4 +2245,343 @@ bool ReplaySession::echo_stdio() const { current_frame_time() >= suppress_stdio_before_event_; } +void ReplaySession::serialize_checkpoint( + pcp::CloneCompletionInfo::Builder& writer, CheckpointInfo& cp_info) { + DEBUG_ASSERT(clone_completion != nullptr); + + auto addr_space_count = clone_completion->address_spaces.size(); + auto& as_data = clone_completion->address_spaces; + auto addr_space_builders = writer.initAddressSpaces(addr_space_count); + + for (auto i = 0u; i < addr_space_count; i++) { + const auto& as = as_data[i]; + const auto leader = static_cast(as.clone_leader); + + auto addr_space_clone = addr_space_builders[i]; + addr_space_clone.setAuxv(kj::ArrayPtr{ + leader->vm()->saved_auxv().data(), leader->vm()->saved_auxv().size() }); + addr_space_clone.setArch(to_trace_arch(as.clone_leader->arch())); + auto cls = addr_space_clone.initCloneLeaderState(); + write_capture_state(cls, as.clone_leader_state); + auto pspace = addr_space_builders[i].initProcessSpace(); + pspace.setTaskFirstRunEvent(leader->tg->first_run_event()); + pspace.setVmFirstRunEvent(leader->vm()->first_run_event()); + pspace.setExe(str_to_data(leader->vm()->exe_image())); + const auto orig_exe = leader->original_exe(); + pspace.setOriginalExe(str_to_data(orig_exe)); + + write_vm(as.clone_leader, pspace, cp_info.data_directory()); + auto captured_mem_list = + addr_space_clone.initCapturedMemory(as.captured_memory.size()); + auto captured_idx = 0; + for (const auto& mem : as.captured_memory) { + auto cm = captured_mem_list[captured_idx++]; + cm.setStartAddress(mem.first.as_int()); + 
cm.setData(kj::ArrayPtr(mem.second.data(), + mem.second.size())); + } + + auto member_states = + addr_space_clone.initMemberState(as.member_states.size()); + auto cs_idx = 0; + for (const auto& state : as.member_states) { + auto ms = member_states[cs_idx++]; + write_capture_state(ms, state); + } + clone_completion->cloned_fd_tables[as.clone_leader_state.fdtable_identity] + ->serialize(pspace); + writer.setUsesSyscallBuffering(leader->vm()->syscallbuf_enabled()); + } + + auto step = capnp::Data::Reader{ (std::uint8_t*)¤t_step, + sizeof(ReplayTraceStep) }; + writer.setSessionCurrentStep(step); + + auto siginfo = + capnp::Data::Reader{ (std::uint8_t*)&last_siginfo_, sizeof(siginfo_t) }; + writer.setLastSigInfo(siginfo); +} + +void ReplaySession::load_checkpoint(const CheckpointInfo& cp_info) { + DEBUG_ASSERT(!partially_initialized()); + ScopedFd checkpoint_fd = cp_info.open_for_read(); + capnp::PackedFdMessageReader datum(checkpoint_fd); + + auto checkpointInfo = datum.getRoot(); + pcp::CloneCompletionInfo::Reader cc_reader = + checkpointInfo.getCloneCompletion(); + + const auto addr_spaces = cc_reader.getAddressSpaces(); + + std::vector partial_init_addr_spaces{}; + Task::ClonedFdTables cloned_fd_tables{}; + + std::vector cloned_leaders{}; + auto zygote = current_task(); + for (const auto& as : addr_spaces) { + const auto taskInfo = as.getCloneLeaderState(); + AutoRemoteSyscalls remote(zygote, + AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS); + Task* child = Task::os_clone(Task::SESSION_CLONE_LEADER, this, remote, + taskInfo.getRecTid(), taskInfo.getSerial(), + SIGCHLD, nullptr); + cloned_leaders.push_back(static_cast(child)); + } + + auto clone_leader_index = 0; + LOG(debug) << "Restoring " << addr_spaces.size() << " clone leaders"; + for (const auto& as : addr_spaces) { + ReplayTask* leader = cloned_leaders[clone_leader_index++]; + const auto proc_space = as.getProcessSpace(); + const auto cleader_captured_state = as.getCloneLeaderState(); + + SupportedArch 
address_space_arch = from_trace_arch(as.getArch()); + leader->is_stopped_ = true; + leader->os_exec_stub(address_space_arch); + std::string exe_name = data_to_str(proc_space.getExe()); + std::string original_exe_name = data_to_str(proc_space.getOriginalExe()); + leader->post_exec(original_exe_name); + static_cast(leader)->post_exec_syscall(original_exe_name); + + // set up the/a stack mapping in which we can make remote syscalls in + // afterwards + auto mappings_data = proc_space.getVirtualAddressSpace(); + auto mappings_it = mappings_data.begin(); + + // Map an executable mapping first, so we can use that memory for remote sys + // calls + { + AutoRemoteSyscalls remote(leader, + AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS); + leader->vm()->unmap_all_but_rr_mappings(remote); + DEBUG_ASSERT(mappings_it->getMapType().isPrivateAnon() && + (mappings_it->getProtection() & (PROT_READ | PROT_WRITE)) == + (PROT_READ | PROT_WRITE)); + KernelMapping stack_mapping{ mappings_it->getStart(), + mappings_it->getEnd(), + data_to_str(mappings_it->getFsname()), + mappings_it->getDevice(), + mappings_it->getInode(), + mappings_it->getProtection(), + mappings_it->getFlags(), + static_cast( + mappings_it->getOffset()) }; + map_private_anonymous(remote, stack_mapping); + restore_map_contents( + leader, + data_to_str( + mappings_it->getMapType().getPrivateAnon().getContentsPath()), + stack_mapping); + mappings_it++; + } + + auto scratchPointer = + remote_ptr(cleader_captured_state.getScratchPtr()); + ASSERT(leader, scratchPointer != nullptr) << "No scratch pointer found!"; + if (proc_space.getBreakpointFaultAddress() != 0) { + leader->vm()->set_breakpoint_fault_addr( + proc_space.getBreakpointFaultAddress()); + } + + leader->thread_group()->set_first_run_event( + proc_space.getTaskFirstRunEvent()); + leader->vm()->set_first_run_event(proc_space.getVmFirstRunEvent()); + + std::vector> syscallbuf_mappings; + std::unique_ptr> scratch_mem = + nullptr; + { + AutoRemoteSyscalls remote(leader); 
+ for (; mappings_it != std::end(mappings_data); mappings_it++) { + const auto& km_data = *mappings_it; + auto map = km_data.getMapType(); + KernelMapping km( + remote_ptr(km_data.getStart()), km_data.getEnd(), + km_data.hasFsname() ? data_to_str(km_data.getFsname()) : "", + km_data.getDevice(), km_data.getInode(), km_data.getProtection(), + km_data.getFlags(), km_data.getOffset()); + if (km.contains(scratchPointer)) { + scratch_mem = std::make_unique>( + std::make_pair( + km, data_to_str(map.getPrivateAnon().getContentsPath()))); + } else if (map.isGuardSegment()) { + // Guard segments: empty private anon mappings, where no data has been + // written. + map_private_anonymous(remote, km); + } else if (map.isFile()) { + auto p = data_to_str(map.getFile().getContentsPath()); + map_region_file(remote, km, p); + } else if (map.isSharedAnon()) { + auto sa = map.getSharedAnon(); + auto emufile = leader->session().emufs().get_or_create(km); + struct stat real_file; + std::string real_file_name; + remote.finish_direct_mmap( + km.start(), km.size(), km.prot(), + (km.flags() | MAP_FIXED) & ~MAP_ANONYMOUS, emufile->proc_path(), + O_RDWR, km.file_offset_bytes(), real_file, real_file_name); + leader->vm()->map(leader, km.start(), km.size(), km.prot(), + km.flags(), km.file_offset_bytes(), real_file_name, + real_file.st_dev, real_file.st_ino, nullptr, &km, + emufile); + restore_map_contents(leader, data_to_str(sa.getContentsPath()), km); + if (sa.getIsSysVSegment()) { + leader->vm()->set_shm_size(km.start(), km.size()); + } + } else if (map.isPrivateAnon()) { + auto f = map.getPrivateAnon(); + auto path = data_to_str(f.getContentsPath()); + map_private_anonymous(remote, km); + restore_map_contents(leader, path, km); + } else if (map.isRrPage()) { + const auto path = data_to_str(map.getRrPage().getContentsPath()); + restore_map_contents(leader, path, km); + } else if (map.isSyscallBuffer()) { + const auto path = + data_to_str(map.getSyscallBuffer().getContentsPath()); + 
syscallbuf_mappings.push_back(std::make_pair(km, path)); + } else { + FATAL() << "Unknown serialized map type"; + } + } + + auto index = original_exe_name.rfind('/'); + auto name = "rr:" + original_exe_name.substr( + index == std::string::npos ? 0 : index + 1); + leader->set_name(remote, name); + } + + ASSERT(leader, scratch_mem != nullptr) + << "Scratch memory mapping could not be restored."; + { + auto& km = scratch_mem->first; + auto& path = scratch_mem->second; + init_scratch_memory(leader, km); + restore_map_contents(leader, path, km); + } + + std::vector auxv{}; + auto auxv_data = as.getAuxv().asChars(); + std::copy(auxv_data.begin(), auxv_data.end(), std::back_inserter(auxv)); + + leader->vm()->restore_auxv(leader, std::move(auxv)); + syscall(SYS_rrcall_reload_auxv, leader->tid); + std::vector member_states; + for (const auto& member_state : as.getMemberState()) { + member_states.push_back(reconstitute_captured_state( + address_space_arch, trace_in.cpuid_records(), member_state)); + } + + CapturedMemory captured_memory; + for (const auto& captured_mem : as.getCapturedMemory()) { + std::vector mem; + auto mem_reader = captured_mem.getData(); + std::copy(mem_reader.begin(), mem_reader.end(), std::back_inserter(mem)); + captured_memory.push_back( + std::make_pair(captured_mem.getStartAddress(), std::move(mem))); + } + Task::CapturedState cloneLeaderCaptureState = reconstitute_captured_state( + address_space_arch, trace_in.cpuid_records(), as.getCloneLeaderState()); + auto fd_table_key = cloneLeaderCaptureState.fdtable_identity; + leader->preload_globals = cloneLeaderCaptureState.preload_globals; + partial_init_addr_spaces.push_back(CloneCompletion::AddressSpaceClone{ + .clone_leader = leader, + .clone_leader_state = std::move(cloneLeaderCaptureState), + .member_states = std::move(member_states), + .captured_memory = std::move(captured_memory) }); + on_create(leader); + deserialize_fdtable(leader, proc_space); + + if (cc_reader.getUsesSyscallBuffering()) { + 
leader->vm()->set_uses_syscall_buffer(); + for (const auto& sysbuf : syscallbuf_mappings) { + const auto& km = sysbuf.first; + const auto& path = sysbuf.second; + AutoRemoteSyscalls remote(leader); + if (km.contains(cleader_captured_state.getSyscallbufChild())) { + const auto map_hint = km.start(); + leader->syscallbuf_size = cleader_captured_state.getSyscallbufSize(); + leader->init_syscall_buffer(remote, map_hint); + leader->desched_fd_child = cleader_captured_state.getDeschedFdChild(); + if (!leader->fd_table()->get_monitor(leader->desched_fd_child)) { + leader->fd_table()->add_monitor(leader, leader->desched_fd_child, + new PreserveFileMonitor()); + } + if (cleader_captured_state.getClonedFileDataFdChild() >= 0) { + leader->cloned_file_data_fd_child = + cleader_captured_state.getClonedFileDataFdChild(); + leader->cloned_file_data_fname = + trace_reader().file_data_clone_file_name(leader->tuid()); + ScopedFd clone_file(leader->cloned_file_data_fname.c_str(), + O_RDONLY); + ASSERT(leader, clone_file.is_open()); + remote.infallible_send_fd_dup( + clone_file, leader->cloned_file_data_fd_child, O_CLOEXEC); + leader->fd_table()->replace_monitor( + leader, leader->cloned_file_data_fd_child, + new PreserveFileMonitor()); + } + for (const auto& mem : + partial_init_addr_spaces.back().captured_memory) { + if (km.contains(mem.first)) { + leader->write_bytes_helper(mem.first, mem.second.size(), + mem.second.data()); + } + } + restore_map_contents(leader, path, km); + } else { + // recreate shared map, i.e. some _other_ task's (A) syscall buffer + // for this task (B), the mappings that just "float" due to being + // inherited after a fork, but from what I understood, isn't ever + // actually used. It's just "there". To keep the process' address + // space identical with normal execution, it is therefore mapped here. 
+ char name[4096]; + strncpy(name, km.fsname().c_str(), sizeof(name) - 1); + name[sizeof(name) - 1] = 0; + create_shared_mmap(remote, km.size(), km.start(), + extract_name(name, sizeof(name)), km.prot(), 0, + nullptr); + remote.task()->vm()->mapping_flags_of(km.start()) |= + AddressSpace::Mapping::IS_SYSCALLBUF; + restore_map_contents(leader, path, km); + } + } + ASSERT(leader, leader->vm()->syscallbuf_enabled()) + << "syscall buffering should be enabled at this point"; + // Fool Task::copy_state that syscall buf has not been initialized. For + // pcp we need to since we never hit the events where syscall buffers get + // initialized like a normal executed tracee-replay would. + leader->syscallbuf_child = nullptr; + } + leader->ticks = cleader_captured_state.getTicks(); + + cloned_fd_tables[fd_table_key] = leader->fd_table(); + } // end of 1 clone leader setup iteration + + clone_completion = std::make_unique(); + clone_completion->address_spaces = std::move(partial_init_addr_spaces); + clone_completion->cloned_fd_tables = std::move(cloned_fd_tables); + + memcpy(&current_step, cc_reader.getSessionCurrentStep().begin(), + sizeof(ReplayTraceStep)); + + trace_reader().rewind(); + trace_reader().forward_to(cp_info.clone_data.time); + + trace_frame = trace_reader().read_frame(); + memcpy(&last_siginfo_, cc_reader.getLastSigInfo().begin(), sizeof(siginfo_t)); + restore_session_info(cp_info); +} + +std::vector ReplaySession::get_persistent_checkpoints() { + return rr::get_checkpoint_infos(resolve_trace_name(trace_reader().dir()), + trace_reader().cpuid_records()); +} + +void ReplaySession::restore_session_info(const CheckpointInfo& cp) { + ticks_at_start_of_event = cp.clone_data.ticks_at_event_start; + next_task_serial_ = cp.next_serial; + statistics_ = cp.stats; +} + } // namespace rr diff --git a/src/ReplaySession.h b/src/ReplaySession.h index 74d36787ab9..9dee00caef5 100644 --- a/src/ReplaySession.h +++ b/src/ReplaySession.h @@ -21,6 +21,7 @@ struct syscallbuf_hdr; namespace 
rr { class ReplayTask; +class CheckpointInfo; /** * ReplayFlushBufferedSyscallState is saved in Session and cloned with its @@ -185,6 +186,8 @@ class ReplaySession final : public Session { */ shr_ptr clone(); + bool partially_initialized() const { return clone_completion != nullptr; } + /** * Return true if we're in a state where it's OK to clone. For example, * we can't clone in some syscalls. @@ -376,6 +379,27 @@ class ReplaySession final : public Session { bool mark_stdio() const override; bool echo_stdio() const; + /** + * Serializes this session to disk and associates it with the + * checkpoint represented in |cp_info|, which represents the time in key, + * ticks, etc found in the `Mark` data type. Responsibility is on the caller + * that these actually belong together. + */ + void serialize_checkpoint(pcp::CloneCompletionInfo::Builder& writer, + CheckpointInfo& cp_info); + + /** + * Deserializes into `this` session the session data found described by + * CheckpointInfo `cp`, restoring the process from disk. + */ + void load_checkpoint(const CheckpointInfo& cp); + + /** + * Returns persistent checkpoints stored in this trace. Returned + * CheckpointInfo list is sorted, ordered by event. 
+ */ + std::vector get_persistent_checkpoints(); + private: ReplaySession(const std::string& dir, const Flags& flags); ReplaySession(const ReplaySession& other); @@ -422,6 +446,9 @@ class ReplaySession final : public Session { void clear_syscall_bp(); + // load `ReplaySession` session state from serialized checkpoint + void restore_session_info(const CheckpointInfo&); + std::shared_ptr emu_fs; std::shared_ptr tracee_output_fd_; TraceReader trace_in; diff --git a/src/ReplayTask.cc b/src/ReplayTask.cc index 5005828438c..1e18b486415 100644 --- a/src/ReplayTask.cc +++ b/src/ReplayTask.cc @@ -238,4 +238,27 @@ bool ReplayTask::post_vm_clone(CloneReason reason, int flags, Task* origin) { return false; } +std::string ReplayTask::original_exe() const { + TraceReader task_original_exe_reader = trace_reader(); + task_original_exe_reader.rewind(); + auto tid = rec_tid; + for (;;) { + auto tte = task_original_exe_reader.read_task_event(); + if (tte.type() == TraceTaskEvent::NONE) { + FATAL() + << "Could not find process of origin to grab original exe name from"; + } + if (tte.tid() == tid) { + if (tte.type() == TraceTaskEvent::CLONE) { + tid = tte.parent_tid(); + task_original_exe_reader.rewind(); + } else if (tte.type() == TraceTaskEvent::EXEC) { + return tte.file_name(); + } + } + } + FATAL() << "Could not find process of origin to grab original exe name from"; + return ""; +} + } // namespace rr diff --git a/src/ReplayTask.h b/src/ReplayTask.h index b127a36be47..8a3c1316db4 100644 --- a/src/ReplayTask.h +++ b/src/ReplayTask.h @@ -85,6 +85,9 @@ class ReplayTask final : public Task { seen_sched_in_syscallbuf_syscall_hook = true; } + /* Digs out the original executable image from the trace. 
*/ + std::string original_exe() const; + std::string name() const override { return name_; } diff --git a/src/ReplayTimeline.cc b/src/ReplayTimeline.cc index 108cd9691d2..753359f62a7 100644 --- a/src/ReplayTimeline.cc +++ b/src/ReplayTimeline.cc @@ -4,7 +4,11 @@ #include #include +#include +#include +#include "CheckpointInfo.h" +#include "EmuFs.h" #include "core.h" #include "fast_forward.h" #include "log.h" @@ -99,10 +103,31 @@ static bool equal_regs(const Registers& r1, const Registers& r2) { return r1.ip() == r2.ip() && r1.matches(r2); } +bool ReplayTimeline::InternalMark::equal_states( + const InternalMark& mark) const { + if (this == &mark) { + return true; + } + if (ticks_at_event_start != mark.ticks_at_event_start) { + return false; + } + + return proto.equal_states(mark.proto); +} + bool ReplayTimeline::InternalMark::equal_states(ReplaySession& session) const { return proto.equal_states(session); } +bool ReplayTimeline::ProtoMark::equal_states( + const ReplayTimeline::ProtoMark& proto) const { + if (key != proto.key) { + return false; + } + return equal_regs(regs, proto.regs) && + return_addresses == proto.return_addresses; +} + bool ReplayTimeline::ProtoMark::equal_states(ReplaySession& session) const { if (session_mark_key(session) != key) { return false; @@ -218,6 +243,39 @@ ReplayTimeline::Mark ReplayTimeline::mark() { return result; } +ReplayTimeline::Mark ReplayTimeline::recreate_mark_from_data( + const MarkData& mark_data, ReplaySession::shr_ptr session) { + Mark result; + auto m = make_shared(this, mark_data, session); + m->inc_refcount(); + auto& mark_vector = marks[m->proto.key]; + // If we are recreating a mark in the same debug session where we actually + // created the persistent checkpoint, there's a pretty high likelihood that + // the same `InternalMark` still exists for that original checkpoint. + + for (std::shared_ptr& internal_mark : mark_vector) { + if (internal_mark->equal_states(*m)) { + // swap out the existing mark with this one - 
they're the same, but `m` + // contains the deserialized session + internal_mark = m; + result.ptr = internal_mark; + return result; + } + } + + mark_vector.push_back(m); + result.ptr = mark_vector.back(); + return result; +} + +void ReplayTimeline::register_mark_as_checkpoint(Mark& m) { + DEBUG_ASSERT(m.ptr && m.ptr->checkpoint && + "Can't register mark as checkpoint if no checkpoint exists"); + auto key = m.ptr->proto.key; + increase_mark_with_checkpoints(key); + m.ptr->inc_refcount(); +} + void ReplayTimeline::mark_after_singlestep(const Mark& from, const ReplayResult& result) { DEBUG_ASSERT(result.break_status.singlestep_complete); @@ -288,6 +346,15 @@ ReplayTimeline::Mark ReplayTimeline::lazy_reverse_singlestep(const Mark& from, return Mark(); } +void ReplayTimeline::increase_mark_with_checkpoints( + const MarkKey& key) noexcept { + if (marks_with_checkpoints.find(key) == marks_with_checkpoints.end()) { + marks_with_checkpoints[key] = 1; + } else { + marks_with_checkpoints[key]++; + } +} + ReplayTimeline::Mark ReplayTimeline::add_explicit_checkpoint() { DEBUG_ASSERT(current->can_clone()); @@ -296,13 +363,9 @@ ReplayTimeline::Mark ReplayTimeline::add_explicit_checkpoint() { unapply_breakpoints_and_watchpoints(); m.ptr->checkpoint = current->clone(); auto key = m.ptr->proto.key; - if (marks_with_checkpoints.find(key) == marks_with_checkpoints.end()) { - marks_with_checkpoints[key] = 1; - } else { - marks_with_checkpoints[key]++; - } + increase_mark_with_checkpoints(key); } - ++m.ptr->checkpoint_refcount; + m.ptr->inc_refcount(); return m; } @@ -314,8 +377,7 @@ void ReplayTimeline::remove_mark_with_checkpoint(const MarkKey& key) { } void ReplayTimeline::remove_explicit_checkpoint(const Mark& mark) { - DEBUG_ASSERT(mark.ptr->checkpoint_refcount > 0); - if (--mark.ptr->checkpoint_refcount == 0) { + if (mark.ptr->dec_refcount() == 0) { mark.ptr->checkpoint = nullptr; remove_mark_with_checkpoint(mark.ptr->proto.key); } @@ -769,6 +831,20 @@ void 
ReplayTimeline::apply_breakpoints_and_watchpoints() { } } +ReplayTimeline::Mark ReplayTimeline::recreate_marks_for_non_explicit( + const CheckpointInfo& cp, std::shared_ptr clone) { + DEBUG_ASSERT(cp.non_explicit_mark_data && clone.get() != nullptr); + // first add the mark with an actual clone, this is not a GDB checkpoint, but + // an RR checkpoint + auto mark = recreate_mark_from_data(cp.clone_data, clone); + reverse_exec_checkpoints[mark] = estimate_progress_of(*clone); + register_mark_as_checkpoint(mark); + // then add the mark with no clone, the one that will be visible to GDB, i.e. + // non explicit checkpoint + Mark result = recreate_mark_from_data(*cp.non_explicit_mark_data, nullptr); + return result; +} + void ReplayTimeline::unapply_breakpoints_internal() { for (auto& bp : breakpoints) { AddressSpace* vm = current->find_address_space(get<0>(bp)); @@ -1493,7 +1569,13 @@ ReplayResult ReplayTimeline::reverse_singlestep( } ReplayTimeline::Progress ReplayTimeline::estimate_progress() { - Session::Statistics stats = current->statistics(); + return estimate_progress_of(*current); +} + +/* static */ +ReplayTimeline::Progress ReplayTimeline::estimate_progress_of( + ReplaySession& session) { + Session::Statistics stats = session.statistics(); // The following parameters were estimated by running Firefox startup // and shutdown in an opt build on a Lenovo W530 laptop, replaying with // DUMP_STATS_PERIOD set to 100 (twice, and using only values from the @@ -1668,4 +1750,55 @@ ReplayTimeline::Mark ReplayTimeline::set_short_checkpoint() { return reverse_exec_short_checkpoint; } +ReplayTimeline::ProtoMark ReplayTimeline::ProtoMark::from_serialized_markdata( + const MarkData& md) { + auto proto_mark = ProtoMark{ MarkKey{ + md.time, md.ticks, ReplayStepKey{ (ReplayTraceStepType)md.step_key } } }; + proto_mark.regs = md.regs; + proto_mark.return_addresses = md.return_addresses; + return proto_mark; +} + +std::shared_ptr +ReplayTimeline::find_closest_mark_with_clone(const 
Mark& mark) { + if (marks_with_checkpoints.empty()) + return nullptr; + + const MarkKey* k = &marks_with_checkpoints.begin()->first; + for (const auto& kvp : marks_with_checkpoints) { + if (kvp.first < mark.ptr->proto.key && kvp.first > *k) { + k = &kvp.first; + } + } + + auto marks_found = std::find_if(std::cbegin(marks), std::cend(marks), + [&](auto& kvp) { return kvp.first == *k; }); + + if (marks_found == std::end(marks)) + return nullptr; + + for (auto it = std::rbegin(marks_found->second); + it != std::rend(marks_found->second); it++) { + DEBUG_ASSERT(it->get() != nullptr); + if ((*it)->checkpoint) { + auto result = std::make_shared(); + result->ptr = *(it); + return result; + } + } + return nullptr; +} + +ReplayTimeline::InternalMark::InternalMark(ReplayTimeline* owner, + const MarkData& serialized, + ReplaySession::shr_ptr session) + : owner(owner), + proto(ProtoMark::from_serialized_markdata(serialized)), + extra_regs(serialized.extra_regs), + checkpoint(session), + ticks_at_event_start(serialized.ticks_at_event_start), + singlestep_to_next_mark_no_signal( + serialized.singlestep_to_next_mark_no_signal), + checkpoint_refcount(0) {} + } // namespace rr diff --git a/src/ReplayTimeline.h b/src/ReplayTimeline.h index f3961d7d7e5..b4cc403a90c 100644 --- a/src/ReplayTimeline.h +++ b/src/ReplayTimeline.h @@ -3,6 +3,7 @@ #ifndef RR_REPLAY_TIMELINE_H_ #define RR_REPLAY_TIMELINE_H_ +#include #include #include #include @@ -18,6 +19,9 @@ namespace rr { +class CheckpointInfo; +struct MarkData; + enum RunDirection { RUN_FORWARD, RUN_BACKWARD }; /** @@ -26,9 +30,8 @@ enum RunDirection { RUN_FORWARD, RUN_BACKWARD }; * checkpoints along this timeline and navigating to specific events. 
*/ class ReplayTimeline { -private: struct InternalMark; - + struct MarkKey; public: ReplayTimeline(std::shared_ptr session); ~ReplayTimeline(); @@ -69,6 +72,29 @@ class ReplayTimeline { const Registers& regs() const { return ptr->proto.regs; } const ExtraRegisters& extra_regs() const { return ptr->extra_regs; } + const MarkKey& get_key() const { + DEBUG_ASSERT(ptr != nullptr && "Mark has no data"); + return ptr->proto.key; + } + + // XXX refactor and possibly move Mark and its internal hierarchy + // into its own file, making them public, or something like that. + std::shared_ptr get_internal() const { + if (!ptr) + FATAL() << "Mark has no data"; + return ptr; + } + + bool has_rr_checkpoint() const { + return ptr != nullptr && ptr->checkpoint != nullptr; + } + + ReplaySession::shr_ptr get_checkpoint() const { + DEBUG_ASSERT(ptr && "Mark has no data"); + DEBUG_ASSERT(ptr->checkpoint && "Mark has no checkpoint"); + return ptr->checkpoint; + } + FrameTime time() const { return ptr->proto.key.trace_time; } private: @@ -121,6 +147,10 @@ class ReplayTimeline { */ void remove_explicit_checkpoint(const Mark& mark); + /** Find mark that has `key` and increase the checkpoint count for that mark. + */ + void increase_mark_with_checkpoints(const MarkKey& key) noexcept; + /** * Return true if we're currently at the given mark. */ @@ -256,6 +286,35 @@ class ReplayTimeline { */ void apply_breakpoints_and_watchpoints(); + /** + * Creates the two marks associated with a non-explicit GDB checkpoint. The + * returned `Mark` is the mark that the non-explicit checkpoint references, + * i.e. the one without an actual checkpoint / session. + */ + Mark recreate_marks_for_non_explicit(const CheckpointInfo& cp, + std::shared_ptr clone); + + /** + * Re-create a mark from serialized MarkData `cp` and associate that mark with + * `session` which can be null in the case of for instance non-explicit + * checkpoints. 
+ */ + Mark recreate_mark_from_data(const MarkData& cp, + ReplaySession::shr_ptr session); + + /* + * Registers a free-formed Mark with this ReplayTimeline. Used when + * deserializing checkpoints. + */ + void register_mark_as_checkpoint(Mark& m); + + /** + * Find a session clone before `mark`. + * Returns the mark associated with that clone or nullptr if not found. + */ + std::shared_ptr find_closest_mark_with_clone( + const Mark& mark); + private: /** * A MarkKey consists of FrameTime + Ticks + ReplayStepKey. These values @@ -316,10 +375,12 @@ class ReplayTimeline { ProtoMark(const MarkKey& key) : key(key) {} bool equal_states(ReplaySession& session) const; + bool equal_states(const ProtoMark& session) const; MarkKey key; Registers regs; ReturnAddressList return_addresses; + static ProtoMark from_serialized_markdata(const MarkData& md); }; /** @@ -328,13 +389,18 @@ class ReplayTimeline { * of two Marks. */ struct InternalMark { + // Construct InternalMark from serialized mark data, for `owner` timeline + // with deserialized `session` + InternalMark(ReplayTimeline* owner, const MarkData& serialized, + ReplaySession::shr_ptr session); + InternalMark(ReplayTimeline* owner, ReplaySession& session, const MarkKey& key) : owner(owner), proto(key), ticks_at_event_start(session.ticks_at_start_of_current_event()), - checkpoint_refcount(0), - singlestep_to_next_mark_no_signal(false) { + singlestep_to_next_mark_no_signal(false), + checkpoint_refcount(0) { ReplayTask* t = session.current_task(); if (t) { proto = ProtoMark(key, t); @@ -346,6 +412,7 @@ class ReplayTimeline { bool operator<(const std::shared_ptr other); bool equal_states(ReplaySession& session) const; + bool equal_states(const InternalMark& mark) const; void full_print(FILE* out) const; ReplayTimeline* owner; @@ -355,12 +422,25 @@ class ReplayTimeline { // Optional checkpoint for this Mark. ReplaySession::shr_ptr checkpoint; Ticks ticks_at_event_start; - // Number of users of `checkpoint`. 
- uint32_t checkpoint_refcount; // The next InternalMark in the ReplayTimeline's Mark vector is the result // of singlestepping from this mark *and* no signal is reported in the // break_status when doing such a singlestep. bool singlestep_to_next_mark_no_signal; + + // Increment refcount and return incremented value + [[maybe_unused]] uint32_t inc_refcount() noexcept { + return ++checkpoint_refcount; + } + + // Decrement refcount and return decremented value + [[maybe_unused]] uint32_t dec_refcount() noexcept { + DEBUG_ASSERT(checkpoint_refcount > 0); + return --checkpoint_refcount; + } + + private: + // Number of users of `checkpoint`. + uint32_t checkpoint_refcount; }; friend struct InternalMark; friend std::ostream& operator<<(std::ostream& s, const InternalMark& o); @@ -430,6 +510,8 @@ class ReplayTimeline { Progress estimate_progress(); + static Progress estimate_progress_of(ReplaySession& session); + /** * Called when the current session has moved forward to a new execution * point and we might want to make a checkpoint to support reverse-execution. diff --git a/src/RerunCommand.cc b/src/RerunCommand.cc index 7bc751118a9..cd919cd7d38 100644 --- a/src/RerunCommand.cc +++ b/src/RerunCommand.cc @@ -4,8 +4,10 @@ #include #include +#include #include +#include "CheckpointInfo.h" #include "Command.h" #include "ExportImportCheckpoints.h" #include "Flags.h" @@ -55,7 +57,9 @@ RerunCommand RerunCommand::singleton( " another rr instance exporting checkpoints at\n" " \n" " -r, --raw dump registers in raw format\n" - " -s, --trace-start= start tracing at \n" + " -s, --trace-start= start tracing at . If a persistent checkpoint exists\n" + " before the session will spawn from that point.\n" + " --ignore-pcp Ignore persistent checkpoints when running command.\n" " -u, --cpu-unbound allow replay to run on any CPU. 
Default is\n" " to run on the CPU stored in the trace.\n" " Note that this may diverge from the recording\n" @@ -107,6 +111,7 @@ struct RerunFlags { int export_checkpoints_count; bool raw; bool cpu_unbound; + bool ignore_pcp; RerunFlags() : trace_start(0), @@ -114,7 +119,8 @@ struct RerunFlags { export_checkpoints_event(0), export_checkpoints_count(0), raw(false), - cpu_unbound(false) {} + cpu_unbound(false), + ignore_pcp(false) {} }; #ifdef __x86_64__ @@ -491,11 +497,12 @@ static bool parse_rerun_arg(vector& args, RerunFlags& flags) { { 2, "event-regs", HAS_PARAMETER }, { 3, "export-checkpoints", HAS_PARAMETER }, { 4, "import-checkpoint", HAS_PARAMETER }, + { 5, "ignore-pcp", NO_PARAMETER }, { 'e', "trace-end", HAS_PARAMETER }, { 'f', "function", HAS_PARAMETER }, { 'r', "raw", NO_PARAMETER }, { 's', "trace-start", HAS_PARAMETER }, - { 'u', "cpu-unbound", NO_PARAMETER } + { 'u', "cpu-unbound", NO_PARAMETER }, }; ParsedOption opt; if (!Command::parse_option(args, options, &opt)) { @@ -521,6 +528,9 @@ static bool parse_rerun_arg(vector& args, RerunFlags& flags) { case 4: flags.import_checkpoint_socket = opt.value; break; + case 5: + flags.ignore_pcp = true; + break; case 'e': if (!opt.verify_valid_int(1, UINT32_MAX)) { return false; @@ -548,6 +558,7 @@ static bool parse_rerun_arg(vector& args, RerunFlags& flags) { case 'u': flags.cpu_unbound = true; break; + break; default: DEBUG_ASSERT(0 && "Unknown option"); } @@ -649,6 +660,18 @@ static int rerun(const string& trace_dir, const RerunFlags& flags, CommandForChe // Now that we've spawned the replay, raise our resource limits if // possible. 
raise_resource_limits(); + if (!flags.ignore_pcp) { + const auto pcps = replay_session->get_persistent_checkpoints(); + const auto cp = find_if(rbegin(pcps), rend(pcps), [&](const auto& cp) { + return flags.trace_start >= cp.event_time(); + }); + + if (cp != rend(pcps)) { + LOG(info) << "Spawning from persistent checkpoint at time " + << cp->event_time(); + replay_session->load_checkpoint(*cp); + } + } } else { vector fds; if (export_checkpoints_socket.is_open()) { diff --git a/src/Session.cc b/src/Session.cc index fc4723311a9..508d329e437 100644 --- a/src/Session.cc +++ b/src/Session.cc @@ -30,17 +30,6 @@ using namespace std; namespace rr { -struct Session::CloneCompletion { - struct AddressSpaceClone { - Task* clone_leader; - Task::CapturedState clone_leader_state; - vector member_states; - vector, vector>> captured_memory; - }; - vector address_spaces; - Task::ClonedFdTables cloned_fd_tables; -}; - Session::Session() : tracee_socket(make_shared()), tracee_socket_receiver(make_shared()), @@ -519,38 +508,6 @@ KernelMapping Session::create_shared_mmap( return km; } -static char* extract_name(char* name_buffer, size_t buffer_size) { - // Recover the name that was originally chosen by finding the part of the - // name between rr_mapping_prefix and the -%d-%d at the end. 
- char* path_start = strstr(name_buffer, Session::rr_mapping_prefix()); - DEBUG_ASSERT(path_start && - "Passed something to create_shared_mmap that" - " wasn't a mapping shared between rr and the tracee?"); - size_t prefix_len = path_start - name_buffer; - buffer_size -= prefix_len; - name_buffer += prefix_len; - - char* name_end = name_buffer + strnlen(name_buffer, buffer_size); - char* name_start = name_buffer + strlen(Session::rr_mapping_prefix()); - int hyphens_seen = 0; - while (name_end > name_start) { - --name_end; - if (*name_end == '-') { - ++hyphens_seen; - } else if (*name_end == '/') { - DEBUG_ASSERT(false && - "Passed something to create_shared_mmap that" - " wasn't a mapping shared between rr and the tracee?"); - } - if (hyphens_seen == 2) { - break; - } - } - DEBUG_ASSERT(hyphens_seen == 2); - *name_end = '\0'; - return name_start; -} - const AddressSpace::Mapping Session::recreate_shared_mmap( AutoRemoteSyscalls& remote, const AddressSpace::Mapping& m, PreserveContents preserve, MonitoredSharedMemory::shr_ptr monitored) { diff --git a/src/Session.h b/src/Session.h index 367d6c8c341..0993698d878 100644 --- a/src/Session.h +++ b/src/Session.h @@ -29,6 +29,18 @@ class Task; class ThreadGroup; class AutoRemoteSyscalls; +struct CloneCompletion { + struct AddressSpaceClone { + Task* clone_leader; + Task::CapturedState clone_leader_state; + std::vector member_states; + std::vector, std::vector>> + captured_memory; + }; + std::vector address_spaces; + Task::ClonedFdTables cloned_fd_tables; +}; + // The following types are used by step() APIs in Session subclasses. /** @@ -230,6 +242,8 @@ class Session { uint32_t next_task_serial() { return next_task_serial_++; } + uint32_t current_task_serial() const { return next_task_serial_; } + /** * Return the task created with |rec_tid|, or nullptr if no such * task exists. 
@@ -443,8 +457,6 @@ class Session { void copy_state_to(Session& dest, EmuFs& emu_fs, EmuFs& dest_emu_fs); - // XXX Move CloneCompletion/CaptureState etc to ReplayTask/ReplaySession - struct CloneCompletion; // Call this before doing anything that requires access to the full set // of tasks (i.e., almost anything!). Not really const! void finish_initializing() const; diff --git a/src/StdioMonitor.cc b/src/StdioMonitor.cc index 9c21720153e..ec480e18d3e 100644 --- a/src/StdioMonitor.cc +++ b/src/StdioMonitor.cc @@ -33,4 +33,9 @@ void StdioMonitor::did_write(Task* t, const std::vector& ranges, } } +void StdioMonitor::serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept { + builder.setStdio(original_fd); +} + } // namespace rr diff --git a/src/StdioMonitor.h b/src/StdioMonitor.h index 4f67fc9f3ae..15211ff8867 100644 --- a/src/StdioMonitor.h +++ b/src/StdioMonitor.h @@ -22,7 +22,7 @@ class StdioMonitor : public FileMonitor { */ StdioMonitor(int original_fd) : original_fd(original_fd) {} - virtual Type type() override { return Stdio; } + virtual Type type() const override { return Stdio; } /** * Make writes to stdout/stderr blocking, to avoid nondeterminism in the @@ -44,6 +44,8 @@ class StdioMonitor : public FileMonitor { LazyOffset&) override; private: + void serialize_type( + pcp::FileMonitor::Builder& builder) const noexcept override; int original_fd; }; diff --git a/src/SysCpuMonitor.h b/src/SysCpuMonitor.h index 98546aaaaa9..f780525bdd0 100644 --- a/src/SysCpuMonitor.h +++ b/src/SysCpuMonitor.h @@ -17,7 +17,7 @@ class SysCpuMonitor : public FileMonitor { public: SysCpuMonitor(Task* t, const std::string& pathname); - virtual Type type() override { return SysCpu; } + virtual Type type() const override { return SysCpu; } bool emulate_read(RecordTask* t, const std::vector& ranges, LazyOffset&, uint64_t* result) override; diff --git a/src/Task.cc b/src/Task.cc index dcc0000d6f2..32e395a1ef3 100644 --- a/src/Task.cc +++ b/src/Task.cc @@ -371,6 +371,12 @@ 
std::string Task::proc_exe_path() { return path; } +std::string Task::proc_mem_path() const { + char path[PATH_MAX]; + snprintf(path, sizeof(path) - 1, "/proc/%d/mem", tid); + return path; +} + std::string Task::exe_path() { char proc_exe[PATH_MAX]; snprintf(proc_exe, sizeof(proc_exe), "/proc/%d/exe", tid); diff --git a/src/Task.h b/src/Task.h index a3b90e35945..d23906b9e04 100644 --- a/src/Task.h +++ b/src/Task.h @@ -223,6 +223,11 @@ class Task { */ std::string proc_exe_path(); + /** + * Return the path of /proc//mem + */ + std::string proc_mem_path() const; + /** * Return the path of the executable (i.e. what * /proc//exe points to). @@ -457,6 +462,10 @@ class Task { */ virtual std::string name() const; + /** + * Sets the OS-name of this task by injecting system call for PR_SET_NAME. + * Also updates |prname| to |name|. + */ virtual void set_name(AutoRemoteSyscalls& remote, const std::string& name); /** diff --git a/src/TraceStream.cc b/src/TraceStream.cc index 32586802f65..8dc80f81aeb 100644 --- a/src/TraceStream.cc +++ b/src/TraceStream.cc @@ -80,78 +80,6 @@ static TraceStream::Substream operator++(TraceStream::Substream& s) { return s; } -static bool dir_exists(const string& dir) { - struct stat dummy; - return !dir.empty() && stat(dir.c_str(), &dummy) == 0; -} - -static string default_rr_trace_dir() { - static string cached_dir; - - if (!cached_dir.empty()) { - return cached_dir; - } - - string dot_dir; - const char* home = getenv("HOME"); - if (home) { - dot_dir = string(home) + "/.rr"; - } - string xdg_dir; - const char* xdg_data_home = getenv("XDG_DATA_HOME"); - if (xdg_data_home) { - xdg_dir = string(xdg_data_home) + "/rr"; - } else if (home) { - xdg_dir = string(home) + "/.local/share/rr"; - } - - // If XDG dir does not exist but ~/.rr does, prefer ~/.rr for backwards - // compatibility. 
- if (dir_exists(xdg_dir)) { - cached_dir = xdg_dir; - } else if (dir_exists(dot_dir)) { - cached_dir = dot_dir; - } else if (!xdg_dir.empty()) { - cached_dir = xdg_dir; - } else { - cached_dir = "/tmp/rr"; - } - - return cached_dir; -} - -string trace_save_dir() { - const char* output_dir = getenv("_RR_TRACE_DIR"); - return output_dir ? output_dir : default_rr_trace_dir(); -} - -string latest_trace_symlink() { - return trace_save_dir() + "/latest-trace"; -} - -string resolve_trace_name(const string& trace_name) -{ - if (trace_name.empty()) { - return latest_trace_symlink(); - } - - // Single-component paths are looked up first in the current directory, next - // in the default trace dir. - - if (trace_name.find('/') == string::npos) { - if (dir_exists(trace_name)) { - return trace_name; - } - - string resolved_trace_name = trace_save_dir() + "/" + trace_name; - if (dir_exists(resolved_trace_name)) { - return resolved_trace_name; - } - } - - return trace_name; -} - class CompressedWriterOutputStream : public kj::OutputStream { public: CompressedWriterOutputStream(CompressedWriter& writer) : writer(writer) {} @@ -230,53 +158,10 @@ bool TraceReader::good() const { return true; } -static kj::ArrayPtr str_to_data(const string& str) { - return kj::ArrayPtr( - reinterpret_cast(str.data()), str.size()); -} - -static string data_to_str(const kj::ArrayPtr& data) { - if (!data.begin()) { - return string(); - } - if (memchr(data.begin(), 0, data.size())) { - FATAL() << "Invalid string: contains null character"; - } - return string(reinterpret_cast(data.begin()), data.size()); -} - -static trace::Arch to_trace_arch(SupportedArch arch) { - switch (arch) { - case x86: - return trace::Arch::X86; - case x86_64: - return trace::Arch::X8664; - case aarch64: - return trace::Arch::AARCH64; - default: - FATAL() << "Unknown arch"; - return trace::Arch::X86; - } -} - static trace::CpuTriState to_tristate(bool value) { return value ? 
trace::CpuTriState::KNOWN_TRUE : trace::CpuTriState::KNOWN_FALSE; } -static SupportedArch from_trace_arch(trace::Arch arch) { - switch (arch) { - case trace::Arch::X86: - return x86; - case trace::Arch::X8664: - return x86_64; - case trace::Arch::AARCH64: - return aarch64; - default: - FATAL() << "Unknown arch"; - return x86; - } -} - static trace::SignalDisposition to_trace_disposition( SignalResolvedDisposition disposition) { switch (disposition) { @@ -1794,4 +1679,56 @@ uint64_t TraceReader::xcr0() const { return (uint64_t(record->out.edx) << 32) | record->out.eax; } +// the dump command repurposed to a `forward_to` API. +void TraceReader::forward_to(FrameTime next_event_to_start_consuming) { + const auto stop_at = next_event_to_start_consuming - 1; + while (!at_end()) { + const auto frame = read_frame(); + // means the EVENTS stream is at the correct time, now RAW_DATA and MMAPS + // must catch up 1 "step" + if (frame.time() == stop_at) { + auto& mmaps = reader(MMAPS); + auto mmaps_pos_found = false; + while (!mmaps.at_end() && !mmaps_pos_found) { + // save state, if we find the MMAP record _after_ the one we're looking + // for we need to restore it to this point. 
+ mmaps.save_state(); + CompressedReaderInputStream stream(mmaps); + PackedMessageReader map_msg(stream); + trace::MMap::Reader map = map_msg.getRoot(); + if (map.getFrameTime() > frame.time()) { + mmaps.restore_state(); + mmaps_pos_found = true; + } else { + mmaps.discard_state(); + } + } + + // consume RawData for frame (next_event_to_start_at - 1) + TraceReader::RawDataMetadata data; + TraceReader::RawData raw; + while (read_raw_data_metadata_for_frame(data)) { + read_raw_data_for_frame(raw); + } + return; + } else { + while (true) { + TraceReader::MappedData data; + bool found; + KernelMapping km = + read_mapped_region(&data, &found, TraceReader::DONT_VALIDATE, + TimeConstraint::CURRENT_TIME_ONLY); + if (!found) { + break; + } + } + TraceReader::RawDataMetadata data; + while (read_raw_data_metadata_for_frame(data)) { + } + } + } + FATAL() << "Could not forward stream(s) to event " + << next_event_to_start_consuming; +} + } // namespace rr diff --git a/src/TraceStream.h b/src/TraceStream.h index adcef26233f..1eab945631e 100644 --- a/src/TraceStream.h +++ b/src/TraceStream.h @@ -527,6 +527,14 @@ class TraceReader : public TraceStream { const TraceUtsName& uname() const { return uname_; } + /** + * Forwards this reader up until `event_number` (so that the next call to + * .read_frame() gives that event) This also forwards mmaps and raw_data + * streams, but leaves task event stream as is, as this can be read + * "arbitrarily" as it contains time information in each entry. 
+ */ + void forward_to(FrameTime event_number); + private: CompressedReader& reader(Substream s) { return *readers[s]; } const CompressedReader& reader(Substream s) const { return *readers[s]; } diff --git a/src/VirtualPerfCounterMonitor.cc b/src/VirtualPerfCounterMonitor.cc index 1364688f615..312e591633a 100644 --- a/src/VirtualPerfCounterMonitor.cc +++ b/src/VirtualPerfCounterMonitor.cc @@ -168,4 +168,9 @@ VirtualPerfCounterMonitor::interrupting_virtual_pmc_for_task(Task* t) { return found->second; } +void VirtualPerfCounterMonitor::serialize_type( + pcp::FileMonitor::Builder&) const noexcept { + FATAL() << "VirtualPerCounter not implemented or supported"; +} + } // namespace rr diff --git a/src/VirtualPerfCounterMonitor.h b/src/VirtualPerfCounterMonitor.h index cdd85e8d3f4..b167165a8f5 100644 --- a/src/VirtualPerfCounterMonitor.h +++ b/src/VirtualPerfCounterMonitor.h @@ -23,7 +23,7 @@ class VirtualPerfCounterMonitor : public FileMonitor { const struct perf_event_attr& attr); virtual ~VirtualPerfCounterMonitor(); - virtual Type type() override { return VirtualPerfCounter; } + virtual Type type() const override { return VirtualPerfCounter; } virtual bool emulate_ioctl(RecordTask* t, uint64_t* result) override; virtual bool emulate_fcntl(RecordTask* t, uint64_t* result) override; @@ -44,6 +44,8 @@ class VirtualPerfCounterMonitor : public FileMonitor { static VirtualPerfCounterMonitor* interrupting_virtual_pmc_for_task(Task* t); private: + virtual void serialize_type( + pcp::FileMonitor::Builder&) const noexcept override; void maybe_enable_interrupt(Task* t, uint64_t after); void disable_interrupt() const; diff --git a/src/rr_pcp.capnp b/src/rr_pcp.capnp new file mode 100644 index 00000000000..72b9fa957a5 --- /dev/null +++ b/src/rr_pcp.capnp @@ -0,0 +1,193 @@ +# rr ReplaySession schema + +@0xf55676ebd869d6c1; + +using Cxx = import "/capnp/c++.capnp"; +$Cxx.namespace("rr::pcp"); + +using import "rr_trace.capnp".Registers; +using import 
"rr_trace.capnp".ExtraRegisters; +using import "rr_trace.capnp".Arch; +using import "rr_trace.capnp".RemoteFd; +using import "rr_trace.capnp".CString; +using import "rr_trace.capnp".Device; +using import "rr_trace.capnp".Inode; +using import "rr_trace.capnp".RemotePtr; +using import "rr_trace.capnp".FrameTime; +using import "rr_trace.capnp".Tid; +using import "rr_trace.capnp".Fd; +using import "rr_trace.capnp".Path; +using import "rr_trace.capnp".Ticks; + +struct ExtendedTaskId { + groupId @0 :Tid; + groupSerial @1: UInt32; + taskId @2 :Tid; + taskSerial @3: UInt32; +} + +using FileMonitorType = Int32; +struct FileMonitor { + fd @0 :Fd; + type @1 :FileMonitorType; + union { + mmap :group { + dead @2 :Bool; + device @3 :Device; + inode @4 :Inode; + } + procFd :group { + tid @5 :Tid; + serial @6 :UInt32; + } + procMem :group { + tid @7 :Tid; + serial @8 :UInt32; + execCount @9 :UInt32; + } + stdio @10 :Fd; + procStat @11 :Data; + bpf :group { + keySize @12: UInt64; + valueSize @13 :UInt64; + } + } +} + +struct KernelMapping { + start @0 :RemotePtr; + end @1 :RemotePtr; + fsname @2 :CString; + device @3 :Device; + inode @4 :Inode; + protection @5 :Int32; + flags @6 :Int32; + offset @7 :UInt64; + mapType :union { + file :group { # mapping of a file + contentsPath @8 :Path; + } + guardSegment @9 :Void; # Empty map segment, PROT NONE, no pages in physical memory, no fsname + # Mapping types below can all be compressed, as they need to be copied into the mapping anyhow + sharedAnon :group { + contentsPath @10 :Path; + isSysVSegment @11 :Bool; # if we're a SysV, we need to set AddressSpace::shm_sizes[start] = size; + } + privateAnon :group { # e.g. stack, heap, etc + contentsPath @12 :Path; + } + syscallBuffer :group { + contentsPath @13 :Path; + } + rrPage :group { + contentsPath @14 :Path; + } + } +} + +# For lack of a better name. 
+struct ProcessSpace { + virtualAddressSpace @0 :List(KernelMapping); + breakpointFaultAddress @1 :RemotePtr; + exe @2 :Data; # actual binary image exec'ed. + originalExe @3 :Data; # original binary image executed during record + monitors @4 :List(FileMonitor); + taskFirstRunEvent @5 :FrameTime; + vmFirstRunEvent @6 :FrameTime; +} + +struct CapturedState { + ticks @0 :Ticks; + regs @1 :Registers; + extraRegs @2 :ExtraRegisters; + prname @3 :Data; + fdtableIdentity @4 :UInt64; + syscallbufChild @5 :RemotePtr; + syscallbufSize @6 :UInt64; + numSyscallbufBytes @7 :UInt64; + preloadGlobals @8 :RemotePtr; + scratchPtr @9 :RemotePtr; + scratchSize @10 :UInt64; + topOfStack @11 :RemotePtr; + rseqState :group { + ptr @12 :RemotePtr; + abortPrefixSignature @13 :UInt32; + } + clonedFileDataOffset @14 :UInt64; + threadLocals @15 :Data; + recTid @16 :Tid; + ownNamespaceRecTid @17 :Tid; + serial @18 :UInt32; + tguid :group { + tid @19 :Tid; + serial @20 :UInt32; + } + deschedFdChild @21 :Int32; + clonedFileDataFdChild @22 :Int32; + clonedFileDataFname @23 :Data; + waitStatus @24 :Int32; + tlsRegister @25 :UInt64; + threadAreas @26 :List(Data); # std::vector +} + +struct CapturedMemory { + startAddress @0 :RemotePtr; + data @1 :Data; +} + +struct AddressSpaceClone { + processSpace @0 :ProcessSpace; + cloneLeaderState @1 :CapturedState; + memberState @2 :List(CapturedState); + capturedMemory @3 :List(CapturedMemory); + auxv @4 :Data; + # We need to know how to reconstitute the Register/ExtraRegister's in CapturedState + arch @5 :Arch; +} + +struct CloneCompletionInfo { + addressSpaces @0 :List(AddressSpaceClone); + sessionCurrentStep @1: Data; + lastSigInfo @2 :Data; + usesSyscallBuffering @3 :Bool; +} + +# Marks are kind of tricky to represents as serialized data, but this amounts to +# a flattened Mark / InternalMark / ProtoMark +struct MarkData { + time @0 :FrameTime; + ticks @1 :Ticks; + ticksAtEventStart @2 :Ticks; + stepKey @3 :Int32; + regs @4: Registers; + returnAddresses 
@5 :List(RemotePtr); + extraRegs @6: ExtraRegisters; + singlestepToNextMarkNoSignal @7 :Bool; + # The arch required to configure regs and extraRegs with the peristed data + arch @8 :Arch; +} + + +# A serialized checkpoint +struct CheckpointInfo { + cloneCompletion @0 :CloneCompletionInfo; + id @1 :UInt64; + lastContinueTask @2 :ExtendedTaskId; + where @3 :Data; + nextSerial @4 :UInt32; # next_serial_ value in Session. + union { + nonExplicit :group { + # The mark which has the actual clone data we have serialized + cloneMark @5 :MarkData; + # The actual mark for the checkpoint, to which we replay-seek-to + checkpointMark @6 :MarkData; + } + explicit @7 :MarkData; + } + # we need this data, to determine Progress, to be able to use them as reverse-exec + statistics :group { + bytesWritten @8 :UInt64; + ticksProcessed @9 :Ticks; + syscallsPerformed @10 :UInt32; + } +} \ No newline at end of file diff --git a/src/test/checkpoint_persistent_shmem.c b/src/test/checkpoint_persistent_shmem.c new file mode 100644 index 00000000000..f9eae5ca6b7 --- /dev/null +++ b/src/test/checkpoint_persistent_shmem.c @@ -0,0 +1,57 @@ +/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#include "util.h" + +#include +#include +#include +#include +#include + +#define SHM_NAME "/my_shared_memory" +#define SHM_SIZE 4096 + +static void breakpoint(void) {} + +int main(void) { + // Create shared memory + int shm_fd = shm_open(SHM_NAME, O_CREAT | O_RDWR, 0666); + if (shm_fd == -1) { + perror("shm_open"); + return 1; + } + ftruncate(shm_fd, SHM_SIZE); + + // Map shared memory + const char* ptr = + (char*)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + return 1; + } + + pid_t pid = fork(); + if (pid < 0) { + perror("fork"); + return 1; + } + + const char* parent_msg = "hello parent"; + const char* child_msg = "hello child\0"; + if (pid == 0) { + sleep(1); + memcpy((void*)ptr, parent_msg, 
strlen(parent_msg)); + return 1; + } else { + wait(NULL); + memcpy((void*)(ptr + strlen(parent_msg)), child_msg, strlen(child_msg)); + breakpoint(); + } + + // Cleanup + munmap((void*)ptr, SHM_SIZE); + shm_unlink(SHM_NAME); + + atomic_puts("EXIT-SUCCESS"); + return 0; +} \ No newline at end of file diff --git a/src/test/checkpoint_persistent_shmem.py b/src/test/checkpoint_persistent_shmem.py new file mode 100644 index 00000000000..bb8eaca34ee --- /dev/null +++ b/src/test/checkpoint_persistent_shmem.py @@ -0,0 +1,22 @@ +from util import * + +send_gdb('break 48') +expect_gdb('Breakpoint 1') +send_gdb('c') +expect_gdb('Breakpoint 1') + +send_gdb('checkpoint') +send_gdb('write-checkpoints') +send_gdb('delete checkpoint 1') +send_gdb('c') + +expect_rr('EXIT-SUCCESS') + +send_gdb('load-checkpoints') +send_gdb('restart 2') +expect_gdb('Program stopped') + +send_gdb('print ptr') +expect_gdb('"hello parenthello child"') + +ok() \ No newline at end of file diff --git a/src/test/checkpoint_persistent_shmem.run b/src/test/checkpoint_persistent_shmem.run new file mode 100644 index 00000000000..235ffcd1372 --- /dev/null +++ b/src/test/checkpoint_persistent_shmem.run @@ -0,0 +1,2 @@ +source `dirname $0`/util.sh +debug_test_gdb_only \ No newline at end of file diff --git a/src/util.cc b/src/util.cc index 1925d52b49b..1e47463d114 100644 --- a/src/util.cc +++ b/src/util.cc @@ -2631,6 +2631,185 @@ void base_name(string& s) { } } +char* extract_name(char* name_buffer, size_t buffer_size) { + // Recover the name that was originally chosen by finding the part of the + // name between rr_mapping_prefix and the -%d-%d at the end. 
+ char* path_start = strstr(name_buffer, Session::rr_mapping_prefix()); + DEBUG_ASSERT(path_start && + "Passed something to create_shared_mmap that" + " wasn't a mapping shared between rr and the tracee?"); + size_t prefix_len = path_start - name_buffer; + buffer_size -= prefix_len; + name_buffer += prefix_len; + + char* name_end = name_buffer + strnlen(name_buffer, buffer_size); + char* name_start = name_buffer + strlen(Session::rr_mapping_prefix()); + int hyphens_seen = 0; + while (name_end > name_start) { + --name_end; + if (*name_end == '-') { + ++hyphens_seen; + } else if (*name_end == '/') { + DEBUG_ASSERT(false && + "Passed something to create_shared_mmap that" + " wasn't a mapping shared between rr and the tracee?"); + } + if (hyphens_seen == 2) { + break; + } + } + DEBUG_ASSERT(hyphens_seen == 2); + *name_end = '\0'; + return name_start; +} + +static bool dir_exists(const std::string& dir) { + struct stat dummy; + return !dir.empty() && stat(dir.c_str(), &dummy) == 0; +} + +std::string latest_trace_symlink() { + return trace_save_dir() + "/latest-trace"; +} + +std::string trace_save_dir() { + const char* output_dir = getenv("_RR_TRACE_DIR"); + return output_dir ? output_dir : default_rr_trace_dir(); +} + +std::string resolve_trace_name(const std::string& trace_name) { + if (trace_name.empty()) { + return latest_trace_symlink(); + } + + // Single-component paths are looked up first in the current directory, next + // in the default trace dir. 
+ + if (trace_name.find('/') == std::string::npos) { + if (dir_exists(trace_name)) { + return trace_name; + } + + std::string resolved_trace_name = trace_save_dir() + "/" + trace_name; + if (dir_exists(resolved_trace_name)) { + return resolved_trace_name; + } + } + + return trace_name; +} + +std::string default_rr_trace_dir() { + static std::string cached_dir; + + if (!cached_dir.empty()) { + return cached_dir; + } + + std::string dot_dir; + const char* home = getenv("HOME"); + if (home) { + dot_dir = std::string(home) + "/.rr"; + } + std::string xdg_dir; + const char* xdg_data_home = getenv("XDG_DATA_HOME"); + if (xdg_data_home) { + xdg_dir = std::string(xdg_data_home) + "/rr"; + } else if (home) { + xdg_dir = std::string(home) + "/.local/share/rr"; + } + + // If XDG dir does not exist but ~/.rr does, prefer ~/.rr for backwards + // compatibility. + if (dir_exists(xdg_dir)) { + cached_dir = xdg_dir; + } else if (dir_exists(dot_dir)) { + cached_dir = dot_dir; + } else if (!xdg_dir.empty()) { + cached_dir = xdg_dir; + } else { + cached_dir = "/tmp/rr"; + } + + return cached_dir; +} + +SupportedArch from_trace_arch(trace::Arch arch) { + switch (arch) { + case trace::Arch::X86: + return x86; + case trace::Arch::X8664: + return x86_64; + case trace::Arch::AARCH64: + return aarch64; + default: + FATAL() << "Unknown arch"; + return x86; + } +} + +trace::Arch to_trace_arch(SupportedArch arch) { + switch (arch) { + case x86: + return trace::Arch::X86; + case x86_64: + return trace::Arch::X8664; + case aarch64: + return trace::Arch::AARCH64; + default: + FATAL() << "Unknown arch"; + return trace::Arch::X86; + } +} + +capnp::Data::Reader regs_to_raw(const Registers& regs) { + return { regs.get_ptrace_for_self_arch().data, + regs.get_ptrace_for_self_arch().size }; +} + +kj::ArrayPtr str_to_data(const std::string& str) { + return kj::ArrayPtr( + reinterpret_cast(str.data()), str.size()); +} + +// XXX move to trace_utils +capnp::Data::Reader extra_regs_to_raw(const 
ExtraRegisters& regs) { + return { regs.data_bytes(), static_cast(regs.data_size()) }; +}; + +std::string data_to_str(const kj::ArrayPtr& data) { + if (memchr(data.begin(), 0, data.size())) { + FATAL() << "Invalid string: contains null character"; + } + return std::string(reinterpret_cast(data.begin()), data.size()); +} + +void set_extra_regs_from_raw(SupportedArch arch, + const std::vector& records, + capnp::Data::Reader& raw, ExtraRegisters& out) { + if (raw.size()) { + ExtraRegisters::Format fmt; + switch (arch) { + default: + FATAL() << "Unknown architecture"; + RR_FALLTHROUGH; + case x86: + case x86_64: + fmt = ExtraRegisters::XSAVE; + break; + case aarch64: + fmt = ExtraRegisters::NT_FPR; + break; + } + auto success = out.set_to_raw_data(arch, fmt, raw.begin(), raw.size(), + xsave_layout_from_trace(records)); + if (!success) { + FATAL() << "Invalid extended register data in trace"; + } + } else { + out = ExtraRegisters(arch); + } +} static optional init_read_perf_event_paranoid() { ScopedFd fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY); if (fd.is_open()) { diff --git a/src/util.h b/src/util.h index a1dfa931e87..1171bdbe87b 100644 --- a/src/util.h +++ b/src/util.h @@ -23,6 +23,8 @@ #include "TraceFrame.h" #include "remote_ptr.h" #include "kernel_supplement.h" +#include +#include "rr_trace.capnp.h" /* This is pretty arbitrary. On Linux SIGPWR is sent to PID 1 (init) on * power failure, and it's unlikely rr will be recording that. 
@@ -698,6 +700,35 @@ void replace_in_buffer(MemoryRange src, const uint8_t* src_data, void base_name(std::string& s); std::optional read_perf_event_paranoid(); +char* extract_name(char* name_buffer, size_t buffer_size); + +std::string default_rr_trace_dir(); + +std::string resolve_trace_name(const std::string& trace_name); + +std::string trace_save_dir(); + +std::string latest_trace_symlink(); + +/** Convert `Registers` to data blob used in capnp */ +capnp::Data::Reader regs_to_raw(const Registers&); + +/** Write `ExtraRegisters` using the data from data blob reader `raw` */ +void set_extra_regs_from_raw(SupportedArch arch, + const std::vector& records, + capnp::Data::Reader& raw, ExtraRegisters& out); + +/** Convert `ExtraRegisters` to data blob used in capnp. */ +capnp::Data::Reader extra_regs_to_raw(const ExtraRegisters&); + +trace::Arch to_trace_arch(SupportedArch arch); +SupportedArch from_trace_arch(trace::Arch arch); + +/** Convert rr's capnp string representation into std::string. */ +std::string data_to_str(const kj::ArrayPtr& data); + +/** Convert std::string into rr's capnp string representation. */ +kj::ArrayPtr str_to_data(const std::string& str); bool virtual_address_size_supported(uint8_t bit_size);