diff --git a/etc/syscalls_linux_aarch64.md b/etc/syscalls_linux_aarch64.md index 53d69899..c132db3e 100644 --- a/etc/syscalls_linux_aarch64.md +++ b/etc/syscalls_linux_aarch64.md @@ -117,7 +117,7 @@ | 0x72 (114) | clock_getres | (const clockid_t which_clock, struct __kernel_timespec *tp) | __arm64_sys_clock_getres | false | | 0x73 (115) | clock_nanosleep | (const clockid_t which_clock, int flags, const struct __kernel_timespec *rqtp, struct __kernel_timespec *rmtp) | __arm64_sys_clock_nanosleep | false | | 0x74 (116) | syslog | (int type, char *buf, int len) | __arm64_sys_syslog | false | -| 0x75 (117) | ptrace | (long request, long pid, unsigned long addr, unsigned long data) | __arm64_sys_ptrace | false | +| 0x75 (117) | ptrace | (long request, long pid, unsigned long addr, unsigned long data) | __arm64_sys_ptrace | partially | | 0x76 (118) | sched_setparam | (pid_t pid, struct sched_param *param) | __arm64_sys_sched_setparam | false | | 0x77 (119) | sched_setscheduler | (pid_t pid, int policy, struct sched_param *param) | __arm64_sys_sched_setscheduler | false | | 0x78 (120) | sched_getscheduler | (pid_t pid) | __arm64_sys_sched_getscheduler | false | @@ -254,7 +254,7 @@ | 0x10b (267) | syncfs | (int fd) | __arm64_sys_syncfs | true | | 0x10c (268) | setns | (int fd, int flags) | __arm64_sys_setns | false | | 0x10d (269) | sendmmsg | (int fd, struct mmsghdr *mmsg, unsigned int vlen, unsigned int flags) | __arm64_sys_sendmmsg | false | -| 0x10e (270) | process_vm_readv | (pid_t pid, const struct iovec *lvec, unsigned long liovcnt, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags) | __arm64_sys_process_vm_readv | false | +| 0x10e (270) | process_vm_readv | (pid_t pid, const struct iovec *lvec, unsigned long liovcnt, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags) | __arm64_sys_process_vm_readv | true | | 0x10f (271) | process_vm_writev | (pid_t pid, const struct iovec *lvec, unsigned long liovcnt, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags) | __arm64_sys_process_vm_writev | false | | 0x110 (272) | kcmp | (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) | __arm64_sys_kcmp | false | | 0x111 (273) | finit_module | (int fd, const char *uargs, int flags) | __arm64_sys_finit_module | false | diff --git a/libkernel/src/error.rs b/libkernel/src/error.rs index c51c82e5..7f2776e3 100644 --- a/libkernel/src/error.rs +++ b/libkernel/src/error.rs @@ -177,6 +177,9 @@ pub enum KernelError { #[error("No such process")] NoProcess, + #[error("No child process")] + NoChildProcess, + #[error("Operation timed out")] TimedOut, diff --git a/libkernel/src/error/syscall_error.rs b/libkernel/src/error/syscall_error.rs index 78b01521..d17f473e 100644 --- a/libkernel/src/error/syscall_error.rs +++ b/libkernel/src/error/syscall_error.rs @@ -53,6 +53,7 @@ pub fn kern_err_to_syscall(err: KernelError) -> isize { KernelError::NotSupported => ENOSYS, KernelError::NoMemory => ENOMEM, KernelError::TimedOut => ETIMEDOUT, + KernelError::NoChildProcess => ECHILD, e => todo!("{e}"), } } diff --git a/src/arch/arm64/exceptions/syscall.rs b/src/arch/arm64/exceptions/syscall.rs index b7918323..1d596992 100644 --- a/src/arch/arm64/exceptions/syscall.rs +++ b/src/arch/arm64/exceptions/syscall.rs @@ -38,8 +38,10 @@ use crate::{ memory::{ brk::sys_brk, mmap::{sys_mmap, sys_mprotect, sys_munmap}, + process_vm::sys_process_vm_readv, }, process::{ + TaskState, caps::{sys_capget, sys_capset}, clone::sys_clone, creds::{ @@ -54,6 +56,7 @@ use crate::{ select::{sys_ppoll, sys_pselect6}, }, prctl::sys_prctl, + ptrace::{TracePoint, ptrace_stop, sys_ptrace}, sleep::sys_nanosleep, thread_group::{ Pgid, @@ -79,6 +82,8 @@ use libkernel::{ }; pub async fn handle_syscall() { + ptrace_stop(TracePoint::SyscallEntry).await; + let (nr, arg1, arg2, arg3, arg4, arg5, arg6) = { let mut task = current_task(); @@ -274,8 +279,28 @@ pub async fn handle_syscall() { } 0x5a => sys_capget(TUA::from_value(arg1 as _), TUA::from_value(arg2 as _)).await, 0x5b => sys_capset(TUA::from_value(arg1 as _), TUA::from_value(arg2 as _)).await, - 0x5d => sys_exit(arg1 as _).await, - 0x5e => sys_exit_group(arg1 as _), + 0x5d => { + let _ = sys_exit(arg1 as _).await; + + debug_assert!(matches!( + *current_task().state.lock_save_irq(), + TaskState::Finished + )); + + // Don't process result on exit. + return; + } + 0x5e => { + let _ = sys_exit_group(arg1 as _).await; + + debug_assert!(matches!( + *current_task().state.lock_save_irq(), + TaskState::Finished + )); + + // Don't process result on exit. + return; + } 0x60 => sys_set_tid_address(TUA::from_value(arg1 as _)), 0x62 => { sys_futex( @@ -291,6 +316,15 @@ pub async fn handle_syscall() { 0x63 => sys_set_robust_list(TUA::from_value(arg1 as _), arg2 as _).await, 0x65 => sys_nanosleep(TUA::from_value(arg1 as _), TUA::from_value(arg2 as _)).await, 0x71 => sys_clock_gettime(arg1 as _, TUA::from_value(arg2 as _)).await, + 0x75 => { + sys_ptrace( + arg1 as _, + arg2 as _, + TUA::from_value(arg3 as _), + TUA::from_value(arg4 as _), + ) + .await + } 0x7b => Err(KernelError::NotSupported), 0x7c => sys_sched_yield(), 0x81 => sys_kill(arg1 as _, arg2.into()), @@ -402,6 +436,17 @@ pub async fn handle_syscall() { .await } 0x10b => sys_syncfs(arg1.into()).await, + 0x10e => { + sys_process_vm_readv( + arg1 as _, + TUA::from_value(arg2 as _), + arg3 as _, + TUA::from_value(arg4 as _), + arg5 as _, + arg6 as _, + ) + .await + } 0x114 => { sys_renameat2( arg1.into(), @@ -466,4 +511,5 @@ pub async fn handle_syscall() { }; current_task().ctx.user_mut().x[0] = ret_val.cast_unsigned() as u64; + ptrace_stop(TracePoint::SyscallExit).await; } diff --git a/src/arch/arm64/memory/fault.rs b/src/arch/arm64/memory/fault.rs index 0f2e5851..46da57e0 100644 --- a/src/arch/arm64/memory/fault.rs +++ b/src/arch/arm64/memory/fault.rs @@ -43,11 +43,14 @@ fn run_mem_fault_handler(exception: Exception, info: AbortIss) -> Result handle_demand_fault(&mut vm, fault_addr, access_kind), + IfscCategory::TranslationFault => { + handle_demand_fault(task.vm.clone(), fault_addr, access_kind) + } IfscCategory::PermissionFault => { + let mut vm = task.vm.lock_save_irq(); + let pg_info = vm .mm_mut() .address_space_mut() diff --git a/src/arch/arm64/mod.rs b/src/arch/arm64/mod.rs index 7a33b0dc..6461b408 100644 --- a/src/arch/arm64/mod.rs +++ b/src/arch/arm64/mod.rs @@ -17,6 +17,7 @@ use memory::{ mmu::{Arm64KernelAddressSpace, KERN_ADDR_SPC}, uaccess::{Arm64CopyFromUser, Arm64CopyStrnFromUser, Arm64CopyToUser, try_copy_from_user}, }; +use ptrace::Arm64PtraceGPRegs; use crate::{ process::{ @@ -36,6 +37,7 @@ mod fdt; mod memory; mod proc; pub mod psci; +pub mod ptrace; pub struct Aarch64 {} @@ -77,6 +79,7 @@ impl VirtualMemory for Aarch64 { impl Arch for Aarch64 { type UserContext = ExceptionState; + type PTraceGpRegs = Arm64PtraceGPRegs; fn new_user_context(entry_point: VA, stack_top: VA) -> Self::UserContext { ExceptionState { diff --git a/src/arch/arm64/ptrace.rs b/src/arch/arm64/ptrace.rs new file mode 100644 index 00000000..d0db8ead --- /dev/null +++ b/src/arch/arm64/ptrace.rs @@ -0,0 +1,25 @@ +use crate::memory::uaccess::UserCopyable; + +use super::exceptions::ExceptionState; + +#[repr(C)] +#[derive(Clone, Copy)] +pub struct Arm64PtraceGPRegs { + pub x: [u64; 31], // x0-x30 + pub sp: u64, + pub pc: u64, + pub pstate: u64, +} + +unsafe impl UserCopyable for Arm64PtraceGPRegs {} + +impl From<&ExceptionState> for Arm64PtraceGPRegs { + fn from(value: &ExceptionState) -> Self { + Self { + x: value.x, + sp: value.sp_el0, + pc: value.elr_el1, + pstate: value.spsr_el1, + } + } +} diff --git a/src/arch/mod.rs b/src/arch/mod.rs index 4234bbc0..1287e14e 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -8,10 +8,13 @@ //! The rest of the kernel should use the `ArchImpl` type alias to access //! architecture-specific functions and types. -use crate::process::{ - Task, - owned::OwnedTask, - thread_group::signal::{SigId, ksigaction::UserspaceSigAction}, +use crate::{ + memory::uaccess::UserCopyable, + process::{ + Task, + owned::OwnedTask, + thread_group::signal::{SigId, ksigaction::UserspaceSigAction}, + }, }; use alloc::sync::Arc; use libkernel::{ @@ -26,6 +29,9 @@ pub trait Arch: CpuOps + VirtualMemory { /// with this type. type UserContext: Sized + Send + Sync + Clone; + /// The type for GP regs copied via `PTRACE_GETREGSET`. + type PTraceGpRegs: UserCopyable + for<'a> From<&'a Self::UserContext>; + fn name() -> &'static str; fn cpu_count() -> usize; diff --git a/src/memory/fault.rs b/src/memory/fault.rs index d501ff38..ce350fc6 100644 --- a/src/memory/fault.rs +++ b/src/memory/fault.rs @@ -1,5 +1,6 @@ -use crate::{process::ProcVM, sched::current::current_task}; +use crate::{process::ProcVM, sync::SpinLock}; use alloc::boxed::Box; +use alloc::sync::Arc; use libkernel::{ PageInfo, UserAddressSpace, error::{KernelError, MapError, Result}, @@ -38,10 +39,12 @@ pub enum FaultResolution { /// Handle a page fault when a PTE is not present. pub fn handle_demand_fault( - vm: &mut ProcVM, + proc_vm: Arc>, faulting_addr: VA, access_kind: AccessKind, ) -> Result { + let mut vm = proc_vm.lock_save_irq(); + let vma = match vm.find_vma_for_fault(faulting_addr, access_kind) { Some(vma) => vma, None => return Ok(FaultResolution::Denied), @@ -52,6 +55,8 @@ pub fn handle_demand_fault( let page_va = faulting_addr.page_aligned(); if let Some(vma_read) = vma.resolve_fault(faulting_addr) { + drop(vm); + Ok(FaultResolution::Deferred(Box::new(async move { let pg_buf = &mut new_page.as_slice_mut() [vma_read.page_offset..vma_read.page_offset + vma_read.read_len]; @@ -60,8 +65,7 @@ pub fn handle_demand_fault( // Since the above may have put the task to sleep, revalidate the // VMA access. - let task = current_task(); - let mut vm = task.vm.lock_save_irq(); + let mut vm = proc_vm.lock_save_irq(); // If the handler in the deferred case is no longer valid. Allow // the program to back to user-space without touching the page diff --git a/src/memory/mod.rs b/src/memory/mod.rs index 338454c2..0222da1a 100644 --- a/src/memory/mod.rs +++ b/src/memory/mod.rs @@ -12,6 +12,7 @@ pub mod brk; pub mod fault; pub mod mmap; pub mod page; +pub mod process_vm; pub mod uaccess; pub type PageOffsetTranslator = libkernel::memory::pg_offset::PageOffsetTranslator; diff --git a/src/memory/process_vm.rs b/src/memory/process_vm.rs new file mode 100644 index 00000000..e3780185 --- /dev/null +++ b/src/memory/process_vm.rs @@ -0,0 +1,117 @@ +use core::{cmp::min, slice}; + +use super::{ + PageOffsetTranslator, + uaccess::{copy_obj_array_from_user, copy_to_user_slice}, +}; +use crate::{ + fs::syscalls::iov::IoVec, + process::{ + TaskDescriptor, Tid, find_task_by_descriptor, + thread_group::{Tgid, pid::PidT}, + }, +}; +use libkernel::{ + error::{KernelError, Result}, + memory::{PAGE_SIZE, address::TUA, proc_vm::vmarea::AccessKind}, +}; + +pub async fn sys_process_vm_readv( + pid: PidT, + local_iov: TUA, + liov_count: usize, + remote_iov: TUA, + riov_count: usize, + _flags: usize, +) -> Result { + let tgid = Tgid::from_pid_t(pid); + let remote_proc = + find_task_by_descriptor(&TaskDescriptor::from_tgid_tid(tgid, Tid::from_tgid(tgid))) + .ok_or(KernelError::NoProcess)?; + let local_iovs = copy_obj_array_from_user(local_iov, liov_count).await?; + let remote_iovs = copy_obj_array_from_user(remote_iov, riov_count).await?; + + let mut total_bytes_copied = 0; + + let mut local_iov_idx = 0; + let mut local_iov_curr_offset = 0; + + let mut remote_iov_idx = 0; + let mut remote_iov_curr_offset = 0; + + while let Some(remote_iov) = remote_iovs.get(remote_iov_idx) + && let Some(local_iov) = local_iovs.get(local_iov_idx) + { + let remote_remaining = remote_iov.iov_len - remote_iov_curr_offset; + let local_remaining = local_iov.iov_len - local_iov_curr_offset; + + let remote_va = remote_iov.iov_base.add_bytes(remote_iov_curr_offset); + + let chunk_sz = min( + PAGE_SIZE - remote_va.page_offset(), + min(remote_remaining, local_remaining), + ); + + // If we have nothing left to copy in current vectors, advance them. + if chunk_sz == 0 { + if remote_remaining == 0 { + remote_iov_idx += 1; + remote_iov_curr_offset = 0; + } + + if local_remaining == 0 { + local_iov_idx += 1; + local_iov_curr_offset = 0; + } + + continue; + } + + let copy_result = async { + // Get the page (pins it) + // SAFETY: We only read. + let remote_page = unsafe { remote_proc.get_page(remote_va, AccessKind::Read).await? }; + + // Map physical page to kernel virtual address (Direct Map) + let remote_pg_slice = unsafe { + slice::from_raw_parts( + remote_page + .region() + .start_address() + .to_va::() + .cast::() + .add_bytes(remote_va.page_offset()) + .as_ptr(), + chunk_sz, + ) + }; + + // Copy to local user memory + copy_to_user_slice( + remote_pg_slice, + local_iov.iov_base.add_bytes(local_iov_curr_offset), + ) + .await + } + .await; + + match copy_result { + Ok(_) => { + total_bytes_copied += chunk_sz; + remote_iov_curr_offset += chunk_sz; + local_iov_curr_offset += chunk_sz; + } + Err(e) => { + if total_bytes_copied > 0 { + // Partial success: return what we got so far. + return Ok(total_bytes_copied); + } else { + // No data copied at all: return the error. + return Err(e); + } + } + } + } + + Ok(total_bytes_copied) +} diff --git a/src/process/clone.rs b/src/process/clone.rs index 77b46ae1..30ae5c8d 100644 --- a/src/process/clone.rs +++ b/src/process/clone.rs @@ -1,4 +1,5 @@ use super::owned::OwnedTask; +use super::ptrace::PTrace; use super::{ctx::Context, thread_group::signal::SigSet}; use crate::kernel::cpu_id::CpuId; use crate::memory::uaccess::copy_to_user; @@ -127,6 +128,12 @@ pub async fn sys_clone( Arc::new(SpinLock::new(current_task.root.lock_save_irq().clone())) }; + let ptrace = if flags.contains(CloneFlags::CLONE_PTRACE) { + current_task.ptrace.lock_save_irq().clone() + } else { + PTrace::new() + }; + let creds = current_task.creds.lock_save_irq().clone(); let new_sigmask = current_task.sig_mask; @@ -153,6 +160,7 @@ pub async fn sys_clone( creds: SpinLock::new(creds), state: Arc::new(SpinLock::new(TaskState::Runnable)), last_cpu: SpinLock::new(CpuId::this()), + ptrace: SpinLock::new(ptrace), }), } }; diff --git a/src/process/exec.rs b/src/process/exec.rs index f120cb74..879edec0 100644 --- a/src/process/exec.rs +++ b/src/process/exec.rs @@ -1,5 +1,6 @@ use crate::ArchImpl; use crate::process::Comm; +use crate::process::ptrace::{TracePoint, ptrace_stop}; use crate::sched::current::current_task_shared; use crate::{ arch::Arch, @@ -172,6 +173,9 @@ pub async fn kernel_exec( let mut mem_map = MemoryMap::from_vmas(vmas)?; let stack_ptr = setup_user_stack(&mut mem_map, &argv, &envp, auxv)?; + // We are now committed to the exec. Inform ptrace. + ptrace_stop(TracePoint::Exec).await; + let user_ctx = ArchImpl::new_user_context(entry_addr, stack_ptr); let mut vm = ProcessVM::from_map(mem_map); @@ -183,15 +187,17 @@ pub async fn kernel_exec( let new_comm = argv.first().map(|s| Comm::new(s.as_str())); - let mut current_task = current_task(); + { + let mut current_task = current_task(); - if let Some(new_comm) = new_comm { - *current_task.comm.lock_save_irq() = new_comm; - } + if let Some(new_comm) = new_comm { + *current_task.comm.lock_save_irq() = new_comm; + } - current_task.ctx = Context::from_user_ctx(user_ctx); - *current_task.vm.lock_save_irq() = vm; - *current_task.process.signals.lock_save_irq() = SignalActionState::new_default(); + current_task.ctx = Context::from_user_ctx(user_ctx); + *current_task.vm.lock_save_irq() = vm; + *current_task.process.signals.lock_save_irq() = SignalActionState::new_default(); + } Ok(()) } diff --git a/src/process/exit.rs b/src/process/exit.rs index d8ca7688..0831fd78 100644 --- a/src/process/exit.rs +++ b/src/process/exit.rs @@ -1,5 +1,6 @@ use super::{ TASK_LIST, TaskState, + ptrace::{TracePoint, ptrace_stop}, thread_group::{ProcessState, Tgid, ThreadGroup, signal::SigId, wait::ChildState}, threading::futex::{self, key::FutexKey}, }; @@ -96,7 +97,9 @@ pub fn kernel_exit_with_signal(signal: SigId, core: bool) { do_exit_group(ChildState::SignalExit { signal, core }); } -pub fn sys_exit_group(exit_code: usize) -> Result { +pub async fn sys_exit_group(exit_code: usize) -> Result { + ptrace_stop(TracePoint::Exit).await; + do_exit_group(ChildState::NormalExit { code: exit_code as _, }); @@ -108,6 +111,8 @@ pub async fn sys_exit(exit_code: usize) -> Result { // Honour CLONE_CHILD_CLEARTID: clear the user TID word and futex-wake any waiters. let ptr = current_task().child_tid_ptr.take(); + ptrace_stop(TracePoint::Exit).await; + if let Some(ptr) = ptr { copy_to_user(ptr, 0u32).await?; diff --git a/src/process/mod.rs b/src/process/mod.rs index dbcf6b76..83bcefbb 100644 --- a/src/process/mod.rs +++ b/src/process/mod.rs @@ -1,13 +1,32 @@ -use crate::{arch::ArchImpl, kernel::cpu_id::CpuId, sync::SpinLock}; +use crate::{ + arch::ArchImpl, + kernel::cpu_id::CpuId, + memory::{ + PAGE_ALLOC, + fault::{FaultResolution, handle_demand_fault}, + }, + sync::SpinLock, +}; use alloc::{ + boxed::Box, collections::btree_map::BTreeMap, sync::{Arc, Weak}, }; use core::fmt::Display; use creds::Credentials; use fd_table::FileDescriptorTable; -use libkernel::{VirtualMemory, fs::Inode}; +use libkernel::{ + UserAddressSpace, VirtualMemory, + error::{KernelError, Result}, + fs::Inode, + memory::{ + address::{UA, VA}, + page_alloc::PageAllocation, + proc_vm::vmarea::AccessKind, + }, +}; use libkernel::{fs::pathbuf::PathBuf, memory::proc_vm::ProcessVM}; +use ptrace::PTrace; use thread_group::{Tgid, ThreadGroup}; pub mod caps; @@ -19,6 +38,7 @@ pub mod exit; pub mod fd_table; pub mod owned; pub mod prctl; +pub mod ptrace; pub mod sleep; pub mod thread_group; pub mod threading; @@ -164,6 +184,7 @@ pub struct Task { pub fd_table: Arc>, pub state: Arc>, pub last_cpu: SpinLock, + pub ptrace: SpinLock, } impl Task { @@ -184,6 +205,69 @@ impl Task { pub fn descriptor(&self) -> TaskDescriptor { TaskDescriptor::from_tgid_tid(self.process.tgid, self.tid) } + + /// Get a page from the task's address space, in an atomic fasion - i.e. + /// with the process address space locked. + /// + /// Handle any faults such that the page will be resident in memory and return + /// an incremented refcount for the page such that it will not be free'd until + /// the returned allocation handle is dropped. + /// + /// SAFETY: The caller *must* guarantee that the returned page will only be + /// used as described in `access_kind`. i.e. if `AccessKind::Read` is passed + /// but data is written to this page, *bad* things will happen. + pub async unsafe fn get_page( + &self, + va: UA, + access_kind: AccessKind, + ) -> Result> { + let va = VA::from_value(va.value()); + + let mut fut = None; + + loop { + if let Some(fut) = fut.take() { + // Handle async fault. + Box::into_pin(fut).await?; + } + + { + let mut vm = self.vm.lock_save_irq(); + + if let Some(pa) = vm.mm_mut().address_space_mut().translate(va) { + let region = pa.pfn.as_phys_range(); + + if match access_kind { + AccessKind::Read => pa.perms.is_read(), + AccessKind::Write => pa.perms.is_write(), + AccessKind::Execute => pa.perms.is_execute(), + } { + let alloc = unsafe { PAGE_ALLOC.get().unwrap().alloc_from_region(region) }; + // Increase refcount on this page, ensuring it isn't reused + // while we copy the data. + let ret = alloc.clone(); + + // The original allocation is still owned by the address + // space. + alloc.leak(); + + return Ok(ret); + } + } + } + + // Try to handle the fault. + match handle_demand_fault(self.vm.clone(), va, access_kind)? { + // Resolved the fault. Try again + FaultResolution::Resolved => continue, + FaultResolution::Denied => return Err(KernelError::Fault), + FaultResolution::Deferred(future) => { + fut = Some(future); + continue; + } + } + } + } } pub fn find_task_by_descriptor(descriptor: &TaskDescriptor) -> Option> { diff --git a/src/process/owned.rs b/src/process/owned.rs index db05b4fa..82a4b6f3 100644 --- a/src/process/owned.rs +++ b/src/process/owned.rs @@ -5,6 +5,7 @@ use super::{ creds::Credentials, ctx::{Context, UserCtx}, fd_table::FileDescriptorTable, + ptrace::PTrace, thread_group::{ Tgid, builder::ThreadGroupBuilder, @@ -75,6 +76,7 @@ impl OwnedTask { vm: Arc::new(SpinLock::new(vm)), fd_table: Arc::new(SpinLock::new(FileDescriptorTable::new())), last_cpu: SpinLock::new(CpuId::this()), + ptrace: SpinLock::new(PTrace::new()), }; Self { @@ -102,6 +104,7 @@ impl OwnedTask { )), fd_table: Arc::new(SpinLock::new(FileDescriptorTable::new())), last_cpu: SpinLock::new(CpuId::this()), + ptrace: SpinLock::new(PTrace::new()), }; Self { diff --git a/src/process/ptrace.rs b/src/process/ptrace.rs new file mode 100644 index 00000000..59642331 --- /dev/null +++ b/src/process/ptrace.rs @@ -0,0 +1,404 @@ +use core::future::poll_fn; +use core::task::{Poll, Waker}; + +use crate::arch::{Arch, ArchImpl}; +use crate::fs::syscalls::iov::IoVec; +use crate::memory::uaccess::{copy_from_user, copy_to_user}; +use crate::process::TASK_LIST; +use crate::process::thread_group::signal::SigId; +use crate::sched::current::{current_task, current_task_shared}; +use alloc::sync::Arc; +use bitflags::Flags; +use libkernel::error::{KernelError, Result}; +use libkernel::memory::address::UA; +use log::warn; + +type GpRegs = ::PTraceGpRegs; + +use super::TaskState; +use super::thread_group::ThreadGroup; +use super::thread_group::wait::ChildState; + +const PTRACE_EVENT_FORK: usize = 1; +const PTRACE_EVENT_VFORK: usize = 2; +const PTRACE_EVENT_CLONE: usize = 3; +const PTRACE_EVENT_EXEC: usize = 4; +const PTRACE_EVENT_VFORK_DONE: usize = 5; +const PTRACE_EVENT_EXIT: usize = 6; +const PTRACE_EVENT_SECCOMP: usize = 7; +const PTRACE_EVENT_STOP: usize = 128; + +bitflags::bitflags! { + #[derive(Clone, Copy, PartialEq)] + pub struct PTraceOptions: usize { + const PTRACE_O_TRACESYSGOOD = 1; + const PTRACE_O_TRACEFORK = 1 << PTRACE_EVENT_FORK; + const PTRACE_O_TRACEVFORK = 1 << PTRACE_EVENT_VFORK; + const PTRACE_O_TRACECLONE = 1 << PTRACE_EVENT_CLONE; + const PTRACE_O_TRACEEXEC = 1 << PTRACE_EVENT_EXEC; + const PTRACE_O_TRACEVFORK_DONE = 1 << PTRACE_EVENT_VFORK_DONE; + const PTRACE_O_TRACEEXIT = 1 << PTRACE_EVENT_EXIT; + const PTRACE_O_TRACESECCOMP = 1 << PTRACE_EVENT_SECCOMP; + const PTRACE_O_EXITKILL = 1 << 20; + const PTRACE_O_SUSPEND_SECCOMP = 1 << 21; + } + + #[derive(Clone, Copy, PartialEq)] + pub struct TracePoint: u32 { + const SyscallEntry = 0x01; + const SyscallExit = 0x02; + /// A new process has begin tracing after being `exec()`d. + const Exec = 0x08; + const Clone = 0x10; + const Exit = 0x20; + const Fork = 0x40; + } +} + +#[derive(Clone)] +enum PTraceState { + /// The traced program should run until `break_points`. + Running, + /// The program hit a trace point `TracePoint`, + TracePointHit { + reg_set: GpRegs, + hit_point: TracePoint, + }, + /// A signal was sent to the traced task. + SignalTrap { reg_set: GpRegs, signal: SigId }, +} + +#[derive(Clone)] +pub struct PTrace { + break_points: TracePoint, + state: Option, + waker: Option, + sysgood: bool, +} + +impl PTrace { + pub fn new() -> Self { + Self { + state: None, + break_points: TracePoint::empty(), + waker: None, + sysgood: false, + } + } + + pub fn is_being_traced(&self) -> bool { + self.state.is_some() + } + + /// Tells ptrace that the task has hit one of the trace points in the + /// kernel. If tracing is in progress *and* the trace point is active within + /// `break_points`, `true` is returned and the kernel should yield to allow + /// the tracer to be informed. Otherwise, `false` is returned. + pub fn hit_trace_point( + &mut self, + point: TracePoint, + regs: &::UserContext, + ) -> bool { + let should_stop = match self.state { + Some(PTraceState::Running) => self.break_points.contains(point), + _ => false, + }; + + if should_stop { + self.state = Some(PTraceState::TracePointHit { + reg_set: regs.into(), + hit_point: point, + }) + } + + should_stop + } + + /// Calculate what extra bits to set (mask) in the status flag of the tracer + /// upon return of `wait()`. + fn calc_trace_point_mask(&self) -> i32 { + match self.state { + None => 0, + Some(PTraceState::Running) => 0, + // No masking for real signal delivery. + Some(PTraceState::SignalTrap { signal, .. }) => { + if signal.is_stopping() { + (PTRACE_EVENT_STOP as i32) << 8 + } else { + 0 + } + } + Some(PTraceState::TracePointHit { hit_point, .. }) => match hit_point { + TracePoint::SyscallEntry | TracePoint::SyscallExit => { + if self.sysgood { + 0x80 + } else { + 0 + } + } + TracePoint::Exec => (PTRACE_EVENT_EXEC as i32) << 8, + TracePoint::Clone => (PTRACE_EVENT_CLONE as i32) << 8, + TracePoint::Exit => (PTRACE_EVENT_EXIT as i32) << 8, + TracePoint::Fork => (PTRACE_EVENT_FORK as i32) << 8, + _ => unreachable!(), + }, + } + } + + /// Notify parents of a trap event. + pub fn notify_parent_of_trap(&self, process: Arc) { + let Some(trap_signal) = (match self.state { + // For non-signal trace events, we use SIGTRAP. + Some(PTraceState::TracePointHit { hit_point, .. }) => match hit_point { + TracePoint::Exec => Some(SigId::SIGSTOP), + _ => Some(SigId::SIGTRAP), + }, + Some(PTraceState::SignalTrap { signal, .. }) => Some(signal), + _ => None, + }) else { + warn!("notification of parent failed when in non-traced state"); + return; + }; + + // Notify the parent that we have stopped (SIGCHLD). + if let Some(parent) = process + .parent + .lock_save_irq() + .as_ref() + .and_then(|p| p.upgrade()) + { + parent.child_notifiers.child_update( + process.tgid, + ChildState::TraceTrap { + signal: trap_signal, + mask: self.calc_trace_point_mask(), + }, + ); + + parent + .pending_signals + .lock_save_irq() + .set_signal(SigId::SIGCHLD); + } + } + + pub fn set_waker(&mut self, waker: Waker) { + // Ensure we never override an already existing waker. + debug_assert!(self.waker.is_none()); + + self.waker = Some(waker); + } + + /// Notify ptrace that a signal has been delivered for the task. + /// + /// This function returns `true` if the task should be put to sleep and wait + /// for the tracer, `false` if the signal should be delivered as per-ususal. + pub fn trace_signal(&mut self, signal: SigId, regs: &::UserContext) -> bool { + // Never handle a SIGKILL. + if signal == SigId::SIGKILL { + return false; + } + + let should_stop = matches!(self.state, Some(PTraceState::Running)); + + if should_stop { + self.state = Some(PTraceState::SignalTrap { + reg_set: regs.into(), + signal, + }) + } + + should_stop + } + + /// Returns the current GP regset when the program has been halted. + pub fn regset(&self) -> Option { + match self.state.as_ref()? { + PTraceState::Running => None, + PTraceState::TracePointHit { reg_set, .. } => Some(*reg_set), + PTraceState::SignalTrap { reg_set, .. } => Some(*reg_set), + } + } +} + +#[repr(i32)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum PtraceOperation { + TraceMe = 0, + PeekText = 1, + PeekData = 2, + // PeekUser = 3, + // PokeText = 4, + // PokeData = 5, + // PokeUser = 6, + Cont = 7, + // Kill = 8, + // SingleStep = 9, + // GetRegs = 12, + // SetRegs = 13, + // GetFpRegs = 14, + // SetFpRegs = 15, + // Attach = 16, + // Detach = 17, + Syscall = 24, + SetOptions = 0x4200, + GetRegSet = 0x4204, +} + +impl TryFrom for PtraceOperation { + type Error = KernelError; + + fn try_from(value: i32) -> Result { + match value { + 0 => Ok(PtraceOperation::TraceMe), + 1 => Ok(PtraceOperation::PeekText), + 2 => Ok(PtraceOperation::PeekData), + 7 => Ok(PtraceOperation::Cont), + 24 => Ok(PtraceOperation::Syscall), + 0x4200 => Ok(PtraceOperation::SetOptions), + 0x4204 => Ok(PtraceOperation::GetRegSet), + // TODO: Should be EIO + _ => Err(KernelError::InvalidValue), + } + } +} + +pub async fn ptrace_stop(point: TracePoint) { + let task_sh = current_task_shared(); + { + let mut ptrace = task_sh.ptrace.lock_save_irq(); + + if ptrace.hit_trace_point(point, current_task().ctx.user()) { + ptrace.notify_parent_of_trap(task_sh.process.clone()); + } else { + return; + } + } + + poll_fn(|cx| { + let mut ptrace = task_sh.ptrace.lock_save_irq(); + + if matches!(ptrace.state, Some(PTraceState::Running)) { + Poll::Ready(()) + } else { + ptrace.set_waker(cx.waker().clone()); + Poll::Pending + } + }) + .await; +} + +pub async fn sys_ptrace(op: i32, pid: u64, addr: UA, data: UA) -> Result { + let op = PtraceOperation::try_from(op)?; + + if op == PtraceOperation::TraceMe { + let current_task = current_task_shared(); + let mut ptrace = current_task.ptrace.lock_save_irq(); + + ptrace.state = Some(PTraceState::Running); + + // Set default breakpoint for TraceMe. + ptrace.break_points = TracePoint::Exec; + + return Ok(0); + } + + let target_task = { + TASK_LIST + .lock_save_irq() + .iter() + .find(|(desc, _)| desc.tid.value() == pid as u32) + .and_then(|(_, task)| task.upgrade()) + .ok_or(KernelError::NoProcess)? + }; + + // TODO: Check CAP_SYS_PTRACE & security + match op { + PtraceOperation::TraceMe => { + unreachable!(); + } + PtraceOperation::GetRegSet => { + let regs = target_task.ptrace.lock_save_irq().regset(); + + if addr.value() != 1 { + // TODO: Suppoer other reg sets, vector, VFP, etc... + return Err(KernelError::InvalidValue); + } + + let user_iov = data.cast::(); + + let mut iov = copy_from_user(user_iov).await?; + + if iov.iov_len < size_of::() { + return Err(KernelError::InvalidValue); + } + + if let Some(regs) = regs { + copy_to_user(iov.iov_base.cast::(), regs).await?; + iov.iov_len = size_of::(); + copy_to_user(user_iov, iov).await?; + + Ok(0) + } else { + Err(KernelError::NoProcess) + } + } + PtraceOperation::SetOptions => { + let opts = PTraceOptions::from_bits_truncate(data.value()); + let mut ptrace = target_task.ptrace.lock_save_irq(); + + // Reset to defaults. + ptrace.break_points.clear(); + ptrace.sysgood = false; + + for opt in opts.iter() { + match opt { + PTraceOptions::PTRACE_O_TRACESYSGOOD => ptrace.sysgood = true, + PTraceOptions::PTRACE_O_EXITKILL => todo!(), + PTraceOptions::PTRACE_O_TRACECLONE => { + ptrace.break_points.insert(TracePoint::Clone) + } + PTraceOptions::PTRACE_O_TRACEEXIT => { + ptrace.break_points.insert(TracePoint::Exit) + } + PTraceOptions::PTRACE_O_TRACEFORK => { + ptrace.break_points.insert(TracePoint::Fork) + } + PTraceOptions::PTRACE_O_TRACEEXEC => { + ptrace.break_points.insert(TracePoint::Exec); + } + _ => todo!(), + } + } + + Ok(0) + } + PtraceOperation::Cont => { + let mut ptrace = target_task.ptrace.lock_save_irq(); + ptrace.state = Some(PTraceState::Running); + + ptrace + .break_points + .remove(TracePoint::SyscallEntry | TracePoint::SyscallExit); + + *target_task.state.lock_save_irq() = TaskState::Runnable; + + Ok(0) + } + PtraceOperation::Syscall => { + let mut ptrace = target_task.ptrace.lock_save_irq(); + ptrace.state = Some(PTraceState::Running); + ptrace + .break_points + .insert(TracePoint::SyscallEntry | TracePoint::SyscallExit); + + if let Some(waker) = ptrace.waker.take() { + waker.wake(); + } + + Ok(0) + } + // TODO: Wrong error + _ => Err(KernelError::InvalidValue), + } +} diff --git a/src/process/thread_group.rs b/src/process/thread_group.rs index 77f8444f..b5937e9b 100644 --- a/src/process/thread_group.rs +++ b/src/process/thread_group.rs @@ -1,5 +1,5 @@ -use super::{Task, Tid}; -use crate::{memory::uaccess::UserCopyable, sync::SpinLock}; +use super::{Task, TaskState, Tid}; +use crate::{memory::uaccess::UserCopyable, sched::waker::create_waker, sync::SpinLock}; use alloc::{ collections::btree_map::BTreeMap, sync::{Arc, Weak}, @@ -12,7 +12,7 @@ use core::{ }; use pid::PidT; use rsrc_lim::ResourceLimits; -use signal::{SigSet, SignalActionState}; +use signal::{SigId, SigSet, SignalActionState}; use wait::ChildNotifiers; pub mod builder; @@ -155,6 +155,30 @@ impl ThreadGroup { pub fn get(id: Tgid) -> Option> { TG_LIST.lock_save_irq().get(&id).and_then(|x| x.upgrade()) } + + pub fn deliver_signal(&self, signal: SigId) { + match signal { + SigId::SIGKILL => { + // Set the sigkill marker in the pending signals and wake up all + // tasks in this group. + *self.pending_signals.lock_save_irq() = SigSet::SIGKILL; + + for task in self.tasks.lock_save_irq().values() { + if let Some(task) = task.upgrade() + && matches!( + *task.state.lock_save_irq(), + TaskState::Stopped | TaskState::Sleeping + ) + { + create_waker(task.descriptor()).wake(); + } + } + } + _ => { + self.pending_signals.lock_save_irq().set_signal(signal); + } + } + } } impl Drop for ThreadGroup { diff --git a/src/process/thread_group/signal.rs b/src/process/thread_group/signal.rs index 5aef4e2a..af594033 100644 --- a/src/process/thread_group/signal.rs +++ b/src/process/thread_group/signal.rs @@ -138,6 +138,13 @@ impl SigId { pub fn user_id(self) -> u64 { self as u64 + 1 } + + pub fn is_stopping(self) -> bool { + matches!( + self, + Self::SIGSTOP | Self::SIGTSTP | Self::SIGTTIN | Self::SIGTTOU + ) + } } impl Display for SigId { diff --git a/src/process/thread_group/signal/kill.rs b/src/process/thread_group/signal/kill.rs index d2de055b..e0c60a07 100644 --- a/src/process/thread_group/signal/kill.rs +++ b/src/process/thread_group/signal/kill.rs @@ -16,18 +16,15 @@ pub fn sys_kill(pid: PidT, signal: UserSigId) -> Result { let current_task = current_task(); // Kill ourselves if pid == current_task.process.tgid.value() as PidT { - current_task - .process - .pending_signals - .lock_save_irq() - .set_signal(signal); + current_task.process.deliver_signal(signal); + return Ok(0); } match pid { p if p > 0 => { let target_tg = ThreadGroup::get(Tgid(p as _)).ok_or(KernelError::NoProcess)?; - target_tg.pending_signals.lock_save_irq().set_signal(signal); + target_tg.deliver_signal(signal); } 0 => { @@ -41,7 +38,7 @@ pub fn sys_kill(pid: PidT, signal: UserSigId) -> Result { if let Some(tg) = tg_weak.upgrade() && *tg.pgid.lock_save_irq() == our_pgid { - tg.pending_signals.lock_save_irq().set_signal(signal); + tg.deliver_signal(signal); } } } @@ -55,7 +52,7 @@ pub fn sys_kill(pid: PidT, signal: UserSigId) -> Result { if let Some(tg) = tg_weak.upgrade() && *tg.pgid.lock_save_irq() == target_pgid { - tg.pending_signals.lock_save_irq().set_signal(signal); + tg.deliver_signal(signal); } } } diff --git a/src/process/thread_group/signal/sigprocmask.rs b/src/process/thread_group/signal/sigprocmask.rs index a516c5d0..eb643fb4 100644 --- a/src/process/thread_group/signal/sigprocmask.rs +++ b/src/process/thread_group/signal/sigprocmask.rs @@ -38,7 +38,7 @@ pub async fn sys_rt_sigprocmask( }; // SIGSTOP and SIGKILL can never be masked. - new_sigmask = new_sigmask.union(UNMASKABLE_SIGNALS); + new_sigmask.remove(UNMASKABLE_SIGNALS); task.sig_mask = new_sigmask; } diff --git a/src/process/thread_group/wait.rs b/src/process/thread_group/wait.rs index 556fdf9c..b0e511a2 100644 --- a/src/process/thread_group/wait.rs +++ b/src/process/thread_group/wait.rs @@ -54,6 +54,7 @@ pub enum ChildState { NormalExit { code: u32 }, SignalExit { signal: SigId, core: bool }, Stop { signal: SigId }, + TraceTrap { signal: SigId, mask: i32 }, Continue, } @@ -64,6 +65,8 @@ impl ChildState { flags.contains(WaitFlags::WEXITED) } ChildState::Stop { .. } => flags.contains(WaitFlags::WSTOPPED), + // Always wake up on a trace trap. + ChildState::TraceTrap { .. } => true, ChildState::Continue => flags.contains(WaitFlags::WCONTINUED), } } @@ -167,7 +170,12 @@ pub async fn sys_wait4( let task = current_task_shared(); - let (tgid, child_state) = if flags.contains(WaitFlags::WNOHANG) { + let child_proc_count = task.process.children.lock_save_irq().iter().count(); + + let (tgid, child_state) = if child_proc_count == 0 || flags.contains(WaitFlags::WNOHANG) { + // Special case for no children. See if there are any pending child + // notification events without sleeping. If there are no children and no + // pending events, return ECHILD. let mut ret = None; task.process.child_notifiers.inner.update(|s| { ret = do_wait(s, pid, flags); @@ -175,8 +183,9 @@ pub async fn sys_wait4( }); match ret { - None => return Ok(0), Some(ret) => ret, + None if child_proc_count == 0 => return Err(KernelError::NoChildProcess), + None => return Ok(0), } } else { task.process @@ -199,7 +208,14 @@ pub async fn sys_wait4( .await?; } ChildState::Stop { signal } => { - copy_to_user(stat_addr, ((signal as i32) << 8) | 0x7f).await?; + copy_to_user(stat_addr, ((signal.user_id() as i32) << 8) | 0x7f).await?; + } + ChildState::TraceTrap { signal, mask } => { + copy_to_user( + stat_addr, + ((signal.user_id() as i32) << 8) | 0x7f | mask << 8, + ) + .await?; } ChildState::Continue => { copy_to_user(stat_addr, 0xffff).await?; diff --git a/src/sched/uspc_ret.rs b/src/sched/uspc_ret.rs index d9980043..acc6b5af 100644 --- a/src/sched/uspc_ret.rs +++ b/src/sched/uspc_ret.rs @@ -178,9 +178,10 @@ pub fn dispatch_userspace_task(ctx: *mut UserCtx) { } // Kernel work finished. Ensure we have no other new - // work to process (i.e. a signal was rasied). We - // don't need to clear the kernel context here as we - // used the *take* function above. + // work to process (i.e. a signal was rasied, trace + // point was hit). We don't need to clear the kernel + // context here as we used the *take* function + // above. state = State::ProcessKernelWork; continue; } @@ -232,6 +233,19 @@ pub fn dispatch_userspace_task(ctx: *mut UserCtx) { let mut task = current_task(); while let Some(signal) = task.take_signal() { + let mut ptrace = task.ptrace.lock_save_irq(); + if ptrace.trace_signal(signal, task.ctx.user()) { + ptrace.notify_parent_of_trap(task.process.clone()); + ptrace.set_waker(create_waker(task.descriptor())); + + *task.state.lock_save_irq() = TaskState::Stopped; + force_resched(); + + state = State::PickNewTask; + continue 'dispatch; + } + drop(ptrace); + let sigaction = task.process.signals.lock_save_irq().action_signal(signal); match sigaction { @@ -239,6 +253,7 @@ pub fn dispatch_userspace_task(ctx: *mut UserCtx) { None => continue, Some(KSignalAction::Term | KSignalAction::Core) => { // Terminate the process, and find a new task. + drop(task); kernel_exit_with_signal(signal, false); state = State::PickNewTask; @@ -259,10 +274,7 @@ pub fn dispatch_userspace_task(ctx: *mut UserCtx) { .child_notifiers .child_update(process.tgid, ChildState::Stop { signal }); - parent - .pending_signals - .lock_save_irq() - .set_signal(SigId::SIGCHLD); + parent.deliver_signal(SigId::SIGCHLD); } for thr_weak in process.tasks.lock_save_irq().values() { @@ -271,6 +283,7 @@ pub fn dispatch_userspace_task(ctx: *mut UserCtx) { } } + force_resched(); state = State::PickNewTask; continue 'dispatch; } @@ -297,10 +310,8 @@ pub fn dispatch_userspace_task(ctx: *mut UserCtx) { parent .child_notifiers .child_update(process.tgid, ChildState::Continue); - parent - .pending_signals - .lock_save_irq() - .set_signal(SigId::SIGCHLD); + + parent.deliver_signal(SigId::SIGCHLD); } // Re-process kernel work for this task (there may be more to do). diff --git a/src/sched/waker.rs b/src/sched/waker.rs index 2ae8d073..4bac0ab2 100644 --- a/src/sched/waker.rs +++ b/src/sched/waker.rs @@ -26,7 +26,7 @@ unsafe fn wake_waker(data: *const ()) { match *state { // If the task has been put to sleep, then wake it up. - TaskState::Sleeping => { + TaskState::Sleeping | TaskState::Stopped => { if locus == CpuId::this() { *state = TaskState::Runnable; SCHED_STATE.borrow_mut().wakeup(desc);