diff --git a/README.md b/README.md
index 56e1fd0..ba1967a 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,36 @@ All subclasses carry the same attributes, so existing handlers can migrate by ca
 
 Pass `--codetracer-json-errors` (or set the policy via `configure_policy(json_errors=True)`) to stream a one-line JSON trailer on stderr. The payload includes `run_id`, `trace_id`, `error_code`, `error_kind`, `message`, and the `context` map so downstream tooling can log failures without scraping text.
 
+### IO capture configuration
+
+Line-aware capture (see [ADR 0008](design-docs/adr/0008-line-aware-io-capture.md)) installs `LineAwareStdout`, `LineAwareStderr`, and `LineAwareStdin` proxies so every chunk carries `{path_id, line, frame_id}` metadata. The proxies forward writes immediately to keep TTY behaviour unchanged, and the batching sink emits chunks delimited by newlines, explicit flushes, and step boundaries. When the FD mirror fallback observes bytes that bypassed the proxies, the resulting `IoChunk` carries the `mirror` flag so downstream tooling can highlight native writers separately. Recorder logs and telemetry use `ScopedMuteIoCapture` to avoid recursive capture.
+
+Control the feature through the policy layer:
+
+- CLI: `python -m codetracer_python_recorder --io-capture=off script.py` disables capture, while `--io-capture=proxies+fd` also mirrors raw file-descriptor writes.
+- Python: `configure_policy(io_capture_line_proxies=False)` toggles the proxies, and `configure_policy(io_capture_fd_fallback=True)` enables the FD fallback.
+- Environment: set `CODETRACER_CAPTURE_IO=off`, `proxies`, or `proxies+fd` (a comma separator such as `proxies,fd` is also accepted) to match the CLI and Python helpers.
+
+Manual smoke check: `python -m codetracer_python_recorder examples/stdout_script.py` should report the proxied output while leaving the console live.
+
+#### Troubleshooting replaced stdout/stderr
+
+Third-party tooling occasionally replaces `sys.stdout` / `sys.stderr` after the proxies are installed. When that happens, IO metadata stops updating and the recorder falls back to passthrough behaviour. You can verify the binding at runtime:
+
+```python
+import sys
+from codetracer_python_recorder.runtime import LineAwareStdout, LineAwareStderr
+
+print(type(sys.stdout).__name__, isinstance(sys.stdout, LineAwareStdout))
+print(type(sys.stderr).__name__, isinstance(sys.stderr, LineAwareStderr))
+```
+
+Both `isinstance` checks should return `True`. If they do not:
+
+1. Re-run `configure_policy(io_capture_line_proxies=True)` (or restart tracing) to reinstall the proxies before the other tool mutates the streams.
+2. Fall back to FD mirroring by setting `CODETRACER_CAPTURE_IO=proxies+fd` so native writes still reach the ledger-backed mirror.
+3. As a last resort, disable IO capture (`--io-capture=off`) and rely on console output while investigating the conflicting integration.
+
 ### Migration checklist for downstream tools
 
 - Catch `RecorderError` (or a subclass) instead of `RuntimeError`.
diff --git a/codetracer-python-recorder/CHANGELOG.md b/codetracer-python-recorder/CHANGELOG.md
index 748f276..421398e 100644
--- a/codetracer-python-recorder/CHANGELOG.md
+++ b/codetracer-python-recorder/CHANGELOG.md
@@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)
 ## [Unreleased]
 ### Added
+- Introduced a line-aware IO capture pipeline that records stdout/stderr chunks with `{path_id, line, frame_id}` attribution via the shared `LineSnapshotStore` and multi-threaded `IoEventSink`.
+- Added `LineAwareStdout`, `LineAwareStderr`, and `LineAwareStdin` proxies that forward to the original streams while batching writes on newline, explicit `flush()`, 5 ms idle gaps, and step boundaries. +- Added policy, CLI, and environment toggles for IO capture (`--io-capture`, `configure_policy(io_capture_line_proxies=..., io_capture_fd_fallback=...)`, `CODETRACER_CAPTURE_IO`) alongside the `ScopedMuteIoCapture` guard that suppresses recursive recorder logging. +- Added an optional FD mirror fallback that duplicates `stdout`/`stderr`, diffs native writes against the proxy ledger, emits `mirror`-flagged `IoChunk`s, and restores descriptors on teardown. +- Documented IO capture behaviour in the README with ADR 0008 context, manual smoke instructions, and troubleshooting steps for replaced `sys.stdout` / `sys.stderr`. - Documented the error-handling policy in the README, including the `RecorderError` hierarchy, policy hooks, JSON error trailers, exit codes, and sample handlers for structured failures. - Added an onboarding guide at `docs/onboarding/error-handling.md` with migration steps for downstream tools. - Added contributor guidance for assertions: prefer `bug!` / `ensure_internal!` over `panic!` / `.unwrap()`, and pair `debug_assert!` with classified errors. diff --git a/codetracer-python-recorder/Cargo.lock b/codetracer-python-recorder/Cargo.lock index 2399ba2..c210728 100644 --- a/codetracer-python-recorder/Cargo.lock +++ b/codetracer-python-recorder/Cargo.lock @@ -76,6 +76,7 @@ version = "0.1.0" dependencies = [ "bitflags", "dashmap", + "libc", "log", "once_cell", "pyo3", diff --git a/codetracer-python-recorder/Cargo.toml b/codetracer-python-recorder/Cargo.toml index d368a27..63eb836 100644 --- a/codetracer-python-recorder/Cargo.toml +++ b/codetracer-python-recorder/Cargo.toml @@ -31,6 +31,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" uuid = { version = "1.10", features = ["v4"] } recorder-errors = { version = "0.1.0", path = "crates/recorder-errors" } +libc = "0.2" [dev-dependencies] pyo3 = { version = "0.25.1", features = ["auto-initialize"] } diff --git a/codetracer-python-recorder/codetracer_python_recorder/cli.py b/codetracer-python-recorder/codetracer_python_recorder/cli.py index 13c27dc..0131532 100644 --- a/codetracer-python-recorder/codetracer_python_recorder/cli.py +++ b/codetracer-python-recorder/codetracer_python_recorder/cli.py @@ -97,6 +97,15 @@ def _parse_args(argv: Sequence[str]) -> RecorderCLIConfig: action="store_true", help="Emit JSON error trailers on stderr.", ) + parser.add_argument( + "--io-capture", + choices=["off", "proxies", "proxies+fd"], + help=( + "Control stdout/stderr capture. Without this flag, line-aware proxies stay enabled. " + "'off' disables capture, 'proxies' forces proxies without FD mirroring, " + "'proxies+fd' also mirrors raw file-descriptor writes." 
+        ),
+    )
 
     known, remainder = parser.parse_known_args(argv)
     pending: list[str] = list(remainder)
@@ -145,6 +154,19 @@ def _parse_args(argv: Sequence[str]) -> RecorderCLIConfig:
         policy["log_file"] = Path(known.log_file).expanduser().resolve()
     if known.json_errors:
         policy["json_errors"] = True
+    if known.io_capture:
+        match known.io_capture:
+            case "off":
+                policy["io_capture_line_proxies"] = False
+                policy["io_capture_fd_fallback"] = False
+            case "proxies":
+                policy["io_capture_line_proxies"] = True
+                policy["io_capture_fd_fallback"] = False
+            case "proxies+fd":
+                policy["io_capture_line_proxies"] = True
+                policy["io_capture_fd_fallback"] = True
+            case other:  # pragma: no cover - argparse choices block this
+                parser.error(f"unsupported io-capture mode '{other}'")
 
     return RecorderCLIConfig(
         trace_dir=trace_dir,
diff --git a/codetracer-python-recorder/src/ffi.rs b/codetracer-python-recorder/src/ffi.rs
index 04f51c4..9ca4783 100644
--- a/codetracer-python-recorder/src/ffi.rs
+++ b/codetracer-python-recorder/src/ffi.rs
@@ -145,12 +145,9 @@ mod tests {
     #[test]
     fn map_recorder_error_sets_python_attributes() {
         Python::with_gil(|py| {
-            let err = usage!(
-                ErrorCode::UnsupportedFormat,
-                "invalid trace format"
-            )
-            .with_context("format", "yaml")
-            .with_source(std::io::Error::new(std::io::ErrorKind::Other, "boom"));
+            let err = usage!(ErrorCode::UnsupportedFormat, "invalid trace format")
+                .with_context("format", "yaml")
+                .with_source(std::io::Error::new(std::io::ErrorKind::Other, "boom"));
             let pyerr = map_recorder_error(err);
             let ty = pyerr.get_type(py);
             assert!(ty.is(py.get_type::()));
@@ -191,9 +188,8 @@ mod tests {
     #[test]
     fn dispatch_converts_recorder_error_to_pyerr() {
         Python::with_gil(|py| {
-            let result: PyResult<()> = dispatch("dispatch_env", || {
-                Err(enverr!(ErrorCode::Io, "disk full"))
-            });
+            let result: PyResult<()> =
+                dispatch("dispatch_env", || Err(enverr!(ErrorCode::Io, "disk full")));
             let err = result.expect_err("expected PyErr");
             let ty = err.get_type(py);
             assert!(ty.is(py.get_type::()));
diff --git a/codetracer-python-recorder/src/policy.rs b/codetracer-python-recorder/src/policy.rs
index 091a639..4826bf5 100644
--- a/codetracer-python-recorder/src/policy.rs
+++ b/codetracer-python-recorder/src/policy.rs
@@ -20,6 +20,8 @@ pub const ENV_LOG_LEVEL: &str = "CODETRACER_LOG_LEVEL";
 pub const ENV_LOG_FILE: &str = "CODETRACER_LOG_FILE";
 /// Environment variable enabling JSON error trailers on stderr.
 pub const ENV_JSON_ERRORS: &str = "CODETRACER_JSON_ERRORS";
+/// Environment variable toggling IO capture strategies.
+pub const ENV_CAPTURE_IO: &str = "CODETRACER_CAPTURE_IO";
 
 static POLICY: OnceCell<RwLock<RecorderPolicy>> = OnceCell::new();
 
@@ -61,6 +63,21 @@ impl FromStr for OnRecorderError {
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct IoCapturePolicy {
+    pub line_proxies: bool,
+    pub fd_fallback: bool,
+}
+
+impl Default for IoCapturePolicy {
+    fn default() -> Self {
+        Self {
+            line_proxies: true,
+            fd_fallback: false,
+        }
+    }
+}
+
 /// Recorder-wide runtime configuration.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RecorderPolicy {
@@ -70,6 +87,7 @@ pub struct RecorderPolicy {
     pub log_level: Option<String>,
     pub log_file: Option<PathBuf>,
     pub json_errors: bool,
+    pub io_capture: IoCapturePolicy,
 }
 
 impl Default for RecorderPolicy {
@@ -81,6 +99,7 @@ impl Default for RecorderPolicy {
             log_level: None,
             log_file: None,
             json_errors: false,
+            io_capture: IoCapturePolicy::default(),
         }
     }
 }
@@ -111,6 +130,16 @@ impl RecorderPolicy {
         if let Some(json_errors) = update.json_errors {
             self.json_errors = json_errors;
         }
+        if let Some(line_proxies) = update.io_capture_line_proxies {
+            self.io_capture.line_proxies = line_proxies;
+            if !self.io_capture.line_proxies {
+                self.io_capture.fd_fallback = false;
+            }
+        }
+        if let Some(fd_fallback) = update.io_capture_fd_fallback {
+            // fd fallback requires proxies to be on.
+            self.io_capture.fd_fallback = fd_fallback && self.io_capture.line_proxies;
+        }
     }
 }
 
@@ -130,6 +159,8 @@ struct PolicyUpdate {
     log_level: Option<String>,
     log_file: Option<PolicyPath>,
     json_errors: Option<bool>,
+    io_capture_line_proxies: Option<bool>,
+    io_capture_fd_fallback: Option<bool>,
 }
 
 /// Snapshot the current policy.
@@ -178,6 +209,12 @@ pub fn configure_policy_from_env() -> RecorderResult<()> {
         update.json_errors = Some(parse_bool(&value)?);
     }
 
+    if let Ok(value) = env::var(ENV_CAPTURE_IO) {
+        let (line_proxies, fd_fallback) = parse_capture_io(&value)?;
+        update.io_capture_line_proxies = Some(line_proxies);
+        update.io_capture_fd_fallback = Some(fd_fallback);
+    }
+
     apply_policy_update(update);
     Ok(())
 }
@@ -194,6 +231,54 @@ fn parse_bool(value: &str) -> RecorderResult<bool> {
     }
 }
 
+fn parse_capture_io(value: &str) -> RecorderResult<(bool, bool)> {
+    let trimmed = value.trim();
+    if trimmed.is_empty() {
+        let default = IoCapturePolicy::default();
+        return Ok((default.line_proxies, default.fd_fallback));
+    }
+
+    let lower = trimmed.to_ascii_lowercase();
+    if matches!(
+        lower.as_str(),
+        "0" | "off" | "false" | "disable" | "disabled" | "none"
+    ) {
+        return Ok((false, false));
+    }
+    if matches!(lower.as_str(), "1" | "on" | "true" | "enable" | "enabled") {
+        return Ok((true, false));
+    }
+
+    let mut line_proxies = false;
+    let mut fd_fallback = false;
+    for token in lower.split(|c| matches!(c, ',' | '+')) {
+        match token.trim() {
+            "" => {}
+            "proxies" | "proxy" => line_proxies = true,
+            "fd" | "mirror" | "fallback" => {
+                line_proxies = true;
+                fd_fallback = true;
+            }
+            other => {
+                return Err(usage!(
+                    ErrorCode::InvalidPolicyValue,
+                    "invalid CODETRACER_CAPTURE_IO value '{}'",
+                    other
+                ));
+            }
+        }
+    }
+
+    if !line_proxies && !fd_fallback {
+        return Err(usage!(
+            ErrorCode::InvalidPolicyValue,
+            "CODETRACER_CAPTURE_IO must enable at least 'proxies' or 'fd'"
+        ));
+    }
+
+    Ok((line_proxies, fd_fallback))
+}
+
 // === PyO3 helpers ===
 
 use pyo3::prelude::*;
@@ -202,7 +287,7 @@ use pyo3::types::PyDict;
 use crate::ffi;
 
 #[pyfunction(name = "configure_policy")]
-#[pyo3(signature = (on_recorder_error=None, require_trace=None, keep_partial_trace=None, log_level=None, log_file=None, json_errors=None))]
+#[pyo3(signature = (on_recorder_error=None, require_trace=None, keep_partial_trace=None, log_level=None, log_file=None, json_errors=None, io_capture_line_proxies=None, io_capture_fd_fallback=None))]
 pub fn configure_policy_py(
     on_recorder_error: Option<&str>,
     require_trace: Option<bool>,
@@ -210,6 +295,8 @@ pub fn configure_policy_py(
     log_level: Option<&str>,
     log_file: Option<&str>,
     json_errors: Option<bool>,
+    io_capture_line_proxies: Option<bool>,
+    io_capture_fd_fallback: Option<bool>,
 ) -> PyResult<()> {
     let mut update = PolicyUpdate::default();
@@ -245,6 +332,14 @@ pub fn configure_policy_py(
         update.json_errors = Some(value);
     }
 
+    if let Some(value) = io_capture_line_proxies {
+        update.io_capture_line_proxies = Some(value);
+    }
+
+    if let Some(value) = io_capture_fd_fallback {
+        update.io_capture_fd_fallback = Some(value);
+    }
+
     apply_policy_update(update);
     Ok(())
 }
@@ -278,6 +373,11 @@ pub fn py_policy_snapshot(py: Python<'_>) -> PyResult<Py<PyDict>> {
         dict.set_item("log_file", py.None())?;
     }
     dict.set_item("json_errors", snapshot.json_errors)?;
+
+    let io_dict = PyDict::new(py);
+    io_dict.set_item("line_proxies", snapshot.io_capture.line_proxies)?;
+    io_dict.set_item("fd_fallback", snapshot.io_capture.fd_fallback)?;
+    dict.set_item("io_capture", io_dict)?;
     Ok(dict.into())
 }
 
@@ -301,6 +401,8 @@ mod tests {
         assert!(!snap.json_errors);
         assert!(snap.log_level.is_none());
         assert!(snap.log_file.is_none());
+        assert!(snap.io_capture.line_proxies);
+        assert!(!snap.io_capture.fd_fallback);
     }
 
     #[test]
@@ -313,6 +415,8 @@ mod tests {
         update.log_level = Some("debug".to_string());
         update.log_file = Some(PolicyPath::Value(PathBuf::from("/tmp/log.txt")));
         update.json_errors = Some(true);
+        update.io_capture_line_proxies = Some(true);
+        update.io_capture_fd_fallback = Some(true);
 
         apply_policy_update(update);
 
@@ -323,6 +427,8 @@ mod tests {
         assert_eq!(snap.log_level.as_deref(), Some("debug"));
         assert_eq!(snap.log_file.as_deref(), Some(Path::new("/tmp/log.txt")));
         assert!(snap.json_errors);
+        assert!(snap.io_capture.line_proxies);
+        assert!(snap.io_capture.fd_fallback);
 
         reset_policy();
     }
@@ -336,6 +442,7 @@ mod tests {
         env::set_var(ENV_LOG_LEVEL, "info");
         env::set_var(ENV_LOG_FILE, "/tmp/out.log");
         env::set_var(ENV_JSON_ERRORS, "yes");
+        env::set_var(ENV_CAPTURE_IO, "proxies,fd");
 
         configure_policy_from_env().expect("configure from env");
 
@@ -348,6 +455,24 @@ mod tests {
         assert_eq!(snap.log_level.as_deref(), Some("info"));
         assert_eq!(snap.log_file.as_deref(), Some(Path::new("/tmp/out.log")));
         assert!(snap.json_errors);
+        assert!(snap.io_capture.line_proxies);
+        assert!(snap.io_capture.fd_fallback);
+        reset_policy();
+    }
+
+    #[test]
+    fn configure_policy_from_env_accepts_plus_separator() {
+        reset_policy();
+        let env_guard = env_lock();
+        env::set_var(ENV_CAPTURE_IO, "proxies+fd");
+
+        configure_policy_from_env().expect("configure from env with plus separator");
+
+        drop(env_guard);
+
+        let snap = policy_snapshot();
+        assert!(snap.io_capture.line_proxies);
+        assert!(snap.io_capture.fd_fallback);
         reset_policy();
     }
 
@@ -364,6 +489,19 @@ mod tests {
         reset_policy();
     }
 
+    #[test]
+    fn configure_policy_from_env_rejects_invalid_capture_io() {
+        reset_policy();
+        let env_guard = env_lock();
+        env::set_var(ENV_CAPTURE_IO, "invalid-token");
+
+        let err = configure_policy_from_env().expect_err("invalid capture io should error");
+        assert_eq!(err.code, ErrorCode::InvalidPolicyValue);
+
+        drop(env_guard);
+        reset_policy();
+    }
+
     fn env_lock() -> EnvGuard {
         EnvGuard
     }
@@ -379,6 +517,7 @@ mod tests {
             ENV_LOG_LEVEL,
             ENV_LOG_FILE,
             ENV_JSON_ERRORS,
+            ENV_CAPTURE_IO,
         ] {
             env::remove_var(key);
         }
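The CLI flag, the `configure_policy` keywords, and `CODETRACER_CAPTURE_IO` all funnel into the same `PolicyUpdate` above. A minimal sketch of driving the toggles from Python, assuming `configure_policy` is importable from the package root as the README snippets imply:

```python
import os

from codetracer_python_recorder import configure_policy

# Equivalent of `--io-capture=proxies+fd`: keep the proxies and mirror raw FD writes.
configure_policy(io_capture_line_proxies=True, io_capture_fd_fallback=True)

# Equivalent of `--io-capture=off`; per apply_update, disabling the proxies
# also clears the FD fallback.
configure_policy(io_capture_line_proxies=False)

# The environment route accepts `+` or `,` as a separator and is read by
# configure_policy_from_env at startup.
os.environ["CODETRACER_CAPTURE_IO"] = "proxies,fd"
```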
diff --git a/codetracer-python-recorder/src/runtime/frame_inspector.rs b/codetracer-python-recorder/src/runtime/frame_inspector.rs
index 3bb06d2..a8c5571 100644
--- a/codetracer-python-recorder/src/runtime/frame_inspector.rs
+++ b/codetracer-python-recorder/src/runtime/frame_inspector.rs
@@ -40,6 +40,11 @@ impl<'py> FrameSnapshot<'py> {
     pub fn locals_is_globals(&self) -> bool {
         self.locals_is_globals
     }
+
+    /// Expose the raw frame pointer for correlation purposes.
+    pub fn frame_ptr(&self) -> *mut ffi::PyFrameObject {
+        self.frame_ptr
+    }
 }
 
 impl<'py> Drop for FrameSnapshot<'py> {
diff --git a/codetracer-python-recorder/src/runtime/io_capture/events.rs b/codetracer-python-recorder/src/runtime/io_capture/events.rs
new file mode 100644
index 0000000..dcd9061
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/events.rs
@@ -0,0 +1,62 @@
+use crate::runtime::line_snapshots::FrameId;
+use pyo3::Python;
+use runtime_tracing::{Line, PathId};
+use std::fmt;
+use std::thread::ThreadId;
+use std::time::Instant;
+
+/// Distinguishes the proxied streams.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum IoStream {
+    Stdout,
+    Stderr,
+    Stdin,
+}
+
+impl fmt::Display for IoStream {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            IoStream::Stdout => write!(f, "stdout"),
+            IoStream::Stderr => write!(f, "stderr"),
+            IoStream::Stdin => write!(f, "stdin"),
+        }
+    }
+}
+
+/// Operations surfaced by the proxies.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum IoOperation {
+    Write,
+    Writelines,
+    Flush,
+    Read,
+    ReadLine,
+    ReadLines,
+    ReadInto,
+}
+
+/// Raw proxy payload collected during Stage 1.
+#[derive(Clone, Debug)]
+pub struct ProxyEvent {
+    pub stream: IoStream,
+    pub operation: IoOperation,
+    pub payload: Vec<u8>,
+    pub thread_id: ThreadId,
+    pub timestamp: Instant,
+    pub frame_id: Option<FrameId>,
+    pub path_id: Option<PathId>,
+    pub line: Option<Line>,
+    pub path: Option<String>,
+}
+
+/// Sink for proxy events. Later stages swap in a real writer-backed implementation.
+pub trait ProxySink: Send + Sync + 'static {
+    fn record(&self, py: Python<'_>, event: ProxyEvent);
+}
+
+/// No-op sink for scenarios where IO capture is disabled but proxies must install.
+pub struct NullSink;
+
+impl ProxySink for NullSink {
+    fn record(&self, _py: Python<'_>, _event: ProxyEvent) {}
+}
diff --git a/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/controller.rs b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/controller.rs
new file mode 100644
index 0000000..f2bf2b9
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/controller.rs
@@ -0,0 +1,46 @@
+use super::ledger::MirrorLedgers;
+use crate::runtime::io_capture::sink::IoChunkConsumer;
+use std::sync::Arc;
+
+#[cfg(not(unix))]
+use super::stub as unix;
+#[cfg(unix)]
+use super::unix;
+
+#[cfg(not(unix))]
+pub use super::stub::FdMirrorError;
+#[cfg(unix)]
+pub use super::unix::FdMirrorError;
+
+pub struct FdMirrorController {
+    inner: Option<unix::FdMirrorController>,
+}
+
+impl FdMirrorController {
+    pub fn new(
+        ledgers: MirrorLedgers,
+        consumer: Arc<dyn IoChunkConsumer>,
+    ) -> Result<Self, FdMirrorError> {
+        let inner = if let Some(set) = ledgers.inner() {
+            Some(unix::FdMirrorController::new(set, consumer)?)
+        } else {
+            None
+        };
+        Ok(Self { inner })
+    }
+
+    pub fn shutdown(&mut self) {
+        if let Some(inner) = self.inner.as_mut() {
+            inner.shutdown();
+        }
+        self.inner = None;
+    }
+}
+
+impl Drop for FdMirrorController {
+    fn drop(&mut self) {
+        if let Some(inner) = self.inner.as_mut() {
+            inner.shutdown();
+        }
+    }
+}
diff --git a/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/ledger.rs b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/ledger.rs
new file mode 100644
index 0000000..182e88f
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/ledger.rs
@@ -0,0 +1,42 @@
+use crate::runtime::io_capture::events::IoStream;
+use std::sync::Arc;
+
+#[cfg(not(unix))]
+use super::stub::MirrorLedgerSet;
+#[cfg(unix)]
+use super::unix::MirrorLedgerSet;
+
+#[derive(Clone, Default)]
+pub struct MirrorLedgers(Option<Arc<MirrorLedgerSet>>);
+
+impl MirrorLedgers {
+    pub fn new_enabled() -> Self {
+        #[cfg(unix)]
+        {
+            Self(Some(Arc::new(MirrorLedgerSet::new())))
+        }
+        #[cfg(not(unix))]
+        {
+            Self(None)
+        }
+    }
+
+    pub fn is_enabled(&self) -> bool {
+        self.0.is_some()
+    }
+
+    pub fn begin_proxy_write(&self, stream: IoStream, payload: &[u8]) -> Option<LedgerTicket> {
+        self.0
+            .as_ref()
+            .and_then(|inner| inner.begin_proxy_write(stream, payload))
+    }
+
+    pub(crate) fn inner(&self) -> Option<Arc<MirrorLedgerSet>> {
+        self.0.clone()
+    }
+}
+
+#[cfg(not(unix))]
+pub use super::stub::LedgerTicket;
+#[cfg(unix)]
+pub use super::unix::LedgerTicket;
diff --git a/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/mod.rs b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/mod.rs
new file mode 100644
index 0000000..94e9394
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/mod.rs
@@ -0,0 +1,9 @@
+mod controller;
+mod ledger;
+#[cfg(not(unix))]
+mod stub;
+#[cfg(unix)]
+mod unix;
+
+pub use controller::FdMirrorController;
+pub use ledger::{LedgerTicket, MirrorLedgers};
diff --git a/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/stub.rs b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/stub.rs
new file mode 100644
index 0000000..9b09a3c
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/stub.rs
@@ -0,0 +1,56 @@
+use crate::runtime::io_capture::events::IoStream;
+use crate::runtime::io_capture::sink::IoChunkConsumer;
+use std::sync::Arc;
+
+#[derive(Debug, Clone)]
+pub struct FdMirrorError {
+    message: String,
+}
+
+impl FdMirrorError {
+    pub fn new(message: impl Into<String>) -> Self {
+        Self {
+            message: message.into(),
+        }
+    }
+}
+
+impl std::fmt::Display for FdMirrorError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.message)
+    }
+}
+
+impl std::error::Error for FdMirrorError {}
+
+#[derive(Default)]
+pub struct MirrorLedgerSet;
+
+pub struct LedgerTicket;
+
+pub struct FdMirrorController;
+
+impl MirrorLedgerSet {
+    pub fn new() -> Self {
+        Self
+    }
+
+    pub fn begin_proxy_write(&self, _: IoStream, _: &[u8]) -> Option<LedgerTicket> {
+        None
+    }
+}
+
+impl LedgerTicket {
+    pub fn commit(self) {}
+}
+
+impl FdMirrorController {
+    pub fn new(
+        _: Arc<MirrorLedgerSet>,
+        _: Arc<dyn IoChunkConsumer>,
+    ) -> Result<Self, FdMirrorError> {
+        Ok(Self)
+    }
+
+    pub fn shutdown(&mut self) {}
+}
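The Unix implementation that follows redirects the real file descriptor into a pipe and drains it on a background thread, subtracting proxy-written bytes via the ledger so only native writes surface as `mirror` chunks. A rough Python sketch of the same dup/dup2 mechanism (illustrative only, not the recorder's API; the real code adds the ledger diffing and shutdown handling):

```python
import os
import threading

def mirror_stdout(on_bytes):
    """Mirror everything written to FD 1, forwarding it to the original target."""
    preserved = os.dup(1)               # keep the real stdout alive
    read_end, write_end = os.pipe()
    os.dup2(write_end, 1)               # writers to FD 1 now feed the pipe
    os.close(write_end)

    def pump():
        while chunk := os.read(read_end, 8192):
            os.write(preserved, chunk)  # keep the console live
            on_bytes(chunk)             # ...and observe the bytes

    threading.Thread(target=pump, daemon=True).start()
    return preserved                    # dup2 this back onto FD 1 to restore
```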
diff --git a/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/unix.rs b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/unix.rs
new file mode 100644
index 0000000..7f6a71e
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/fd_mirror/unix.rs
@@ -0,0 +1,517 @@
+use crate::runtime::io_capture::events::IoStream;
+use crate::runtime::io_capture::sink::{IoChunk, IoChunkConsumer, IoChunkFlags};
+use log::warn;
+use std::collections::VecDeque;
+use std::io;
+use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::thread;
+use std::time::{Duration, Instant};
+
+#[derive(Debug)]
+pub struct FdMirrorError {
+    message: String,
+}
+
+impl FdMirrorError {
+    pub fn new(msg: impl Into<String>) -> Self {
+        Self {
+            message: msg.into(),
+        }
+    }
+}
+
+impl std::fmt::Display for FdMirrorError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.message)
+    }
+}
+
+impl std::error::Error for FdMirrorError {}
+
+#[derive(Debug)]
+struct LedgerEntry {
+    seq: u64,
+    data: Vec<u8>,
+    offset: usize,
+}
+
+impl LedgerEntry {
+    fn remaining(&self) -> &[u8] {
+        &self.data[self.offset..]
+    }
+
+    fn consume(&mut self, amount: usize) {
+        self.offset = std::cmp::min(self.offset + amount, self.data.len());
+    }
+
+    fn is_spent(&self) -> bool {
+        self.offset >= self.data.len()
+    }
+}
+
+#[derive(Debug)]
+struct Ledger {
+    next_seq: AtomicU64,
+    entries: Mutex<VecDeque<LedgerEntry>>,
+    matched_bytes: AtomicU64,
+    mirrored_bytes: AtomicU64,
+}
+
+impl Ledger {
+    fn new() -> Self {
+        Self {
+            next_seq: AtomicU64::new(0),
+            entries: Mutex::new(VecDeque::new()),
+            matched_bytes: AtomicU64::new(0),
+            mirrored_bytes: AtomicU64::new(0),
+        }
+    }
+
+    fn begin_entry(self: &Arc<Self>, payload: &[u8]) -> LedgerTicket {
+        let seq = self.next_seq.fetch_add(1, Ordering::Relaxed);
+        let entry = LedgerEntry {
+            seq,
+            data: payload.to_vec(),
+            offset: 0,
+        };
+        let mut guard = self.entries.lock().expect("ledger lock poisoned");
+        guard.push_back(entry);
+        LedgerTicket::new(Arc::clone(self), seq)
+    }
+
+    fn cancel_entry(&self, seq: u64) {
+        let mut guard = self.entries.lock().expect("ledger lock poisoned");
+        guard.retain(|entry| entry.seq != seq);
+    }
+
+    fn subtract_from_chunk(&self, chunk: &[u8]) -> Vec<u8> {
+        if chunk.is_empty() {
+            return Vec::new();
+        }
+
+        let mut leftover = Vec::new();
+        let mut guard = self.entries.lock().expect("ledger lock poisoned");
+        let mut idx = 0usize;
+
+        while idx < chunk.len() {
+            if let Some(front) = guard.front_mut() {
+                let remaining = front.remaining();
+                if remaining.is_empty() {
+                    guard.pop_front();
+                    continue;
+                }
+
+                if chunk[idx] != remaining[0] {
+                    leftover.push(chunk[idx]);
+                    idx += 1;
+                    continue;
+                }
+
+                let full_len = remaining.len();
+                let end_idx = idx + full_len;
+                if end_idx <= chunk.len() && &chunk[idx..end_idx] == remaining {
+                    front.consume(full_len);
+                    self.matched_bytes
+                        .fetch_add(full_len as u64, Ordering::Relaxed);
+                    idx = end_idx;
+                    if front.is_spent() {
+                        guard.pop_front();
+                    }
+                    continue;
+                }
+
+                // Partial match; advance by one byte.
+                leftover.push(chunk[idx]);
+                idx += 1;
+            } else {
+                leftover.extend_from_slice(&chunk[idx..]);
+                break;
+            }
+        }
+
+        if !leftover.is_empty() {
+            self.mirrored_bytes
+                .fetch_add(leftover.len() as u64, Ordering::Relaxed);
+        }
+        leftover
+    }
+
+    fn clear(&self) {
+        let mut guard = self.entries.lock().expect("ledger lock poisoned");
+        guard.clear();
+    }
+}
+
+#[derive(Clone)]
+pub struct MirrorLedgerSet {
+    stdout: Arc<Ledger>,
+    stderr: Arc<Ledger>,
+}
+
+impl MirrorLedgerSet {
+    pub fn new() -> Self {
+        Self {
+            stdout: Arc::new(Ledger::new()),
+            stderr: Arc::new(Ledger::new()),
+        }
+    }
+
+    pub fn begin_proxy_write(&self, stream: IoStream, payload: &[u8]) -> Option<LedgerTicket> {
+        match stream {
+            IoStream::Stdout => Some(self.stdout.begin_entry(payload)),
+            IoStream::Stderr => Some(self.stderr.begin_entry(payload)),
+            IoStream::Stdin => None,
+        }
+    }
+
+    fn ledger_for(&self, stream: IoStream) -> Option<Arc<Ledger>> {
+        match stream {
+            IoStream::Stdout => Some(Arc::clone(&self.stdout)),
+            IoStream::Stderr => Some(Arc::clone(&self.stderr)),
+            IoStream::Stdin => None,
+        }
+    }
+}
+
+pub struct LedgerTicket {
+    ledger: Arc<Ledger>,
+    seq: u64,
+    committed: AtomicBool,
+}
+
+impl LedgerTicket {
+    fn new(ledger: Arc<Ledger>, seq: u64) -> Self {
+        Self {
+            ledger,
+            seq,
+            committed: AtomicBool::new(false),
+        }
+    }
+
+    pub fn commit(self) {
+        self.committed.store(true, Ordering::Relaxed);
+    }
+}
+
+impl Drop for LedgerTicket {
+    fn drop(&mut self) {
+        if !self.committed.load(Ordering::Relaxed) {
+            self.ledger.cancel_entry(self.seq);
+        }
+    }
+}
+
+struct StreamMirror {
+    target_fd: RawFd,
+    preserved_fd: OwnedFd,
+    ledger: Arc<Ledger>,
+    join: Option<thread::JoinHandle<()>>,
+    shutdown_trigger: Arc<ShutdownSignal>,
+}
+
+impl StreamMirror {
+    const SHUTDOWN_TIMEOUT: Duration = Duration::from_millis(250);
+    const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(10);
+
+    fn start(
+        stream: IoStream,
+        ledger: Arc<Ledger>,
+        consumer: Arc<dyn IoChunkConsumer>,
+    ) -> Result<Self, FdMirrorError> {
+        let target_fd = match stream {
+            IoStream::Stdout => libc::STDOUT_FILENO,
+            IoStream::Stderr => libc::STDERR_FILENO,
+            IoStream::Stdin => {
+                return Err(FdMirrorError::new("stdin mirroring not supported"));
+            }
+        };
+
+        let preserved = unsafe { libc::dup(target_fd) };
+        if preserved < 0 {
+            return Err(FdMirrorError::new("dup failed for target fd"));
+        }
+        let preserved_fd = unsafe { OwnedFd::from_raw_fd(preserved) };
+
+        let mut pipe_fds = [0; 2];
+        if unsafe { libc::pipe(pipe_fds.as_mut_ptr()) } != 0 {
+            return Err(FdMirrorError::new("pipe setup failed"));
+        }
+        let read_fd = unsafe { OwnedFd::from_raw_fd(pipe_fds[0]) };
+        let write_fd = pipe_fds[1];
+
+        if unsafe { libc::dup2(write_fd, target_fd) } < 0 {
+            unsafe {
+                libc::close(write_fd);
+            }
+            return Err(FdMirrorError::new("dup2 failed while installing mirror"));
+        }
+
+        unsafe {
+            libc::close(write_fd);
+        }
+
+        let forward_fd = unsafe { libc::dup(preserved_fd.as_raw_fd()) };
+        if forward_fd < 0 {
+            return Err(FdMirrorError::new("dup failed for forward fd"));
+        }
+        let forward_owned = unsafe { OwnedFd::from_raw_fd(forward_fd) };
+
+        let shutdown = Arc::new(ShutdownSignal::default());
+        let thread_shutdown = shutdown.clone();
+        let ledger_clone = ledger.clone();
+        let consumer_clone = consumer.clone();
+
+        let join = thread::Builder::new()
+            .name(format!("codetracer-fd-mirror-{}", stream))
+            .spawn(move || {
+                mirror_loop(
+                    stream,
+                    ledger_clone,
+                    consumer_clone,
+                    read_fd,
+                    forward_owned,
+                    thread_shutdown,
+                );
+            })
+            .map_err(|err| FdMirrorError::new(format!("spawn failed: {err}")))?;
+
+        Ok(Self {
+            target_fd,
+            preserved_fd,
+            ledger,
+            join: Some(join),
+            shutdown_trigger: shutdown,
+        })
+    }
+
+    fn shutdown(&mut self) {
+        self.shutdown_trigger.request_shutdown();
+        if unsafe { libc::dup2(self.preserved_fd.as_raw_fd(), self.target_fd) } < 0 {
+            warn!("failed to restore fd {} after mirroring", self.target_fd);
+        }
+        if let Some(join) = self.join.take() {
+            if wait_for_join(&join, Self::SHUTDOWN_TIMEOUT, Self::SHUTDOWN_POLL_INTERVAL) {
+                if let Err(err) = join.join() {
+                    warn!("mirror thread join failed: {err:?}");
+                }
+            } else {
+                warn!(
+                    "mirror thread on fd {} did not stop within {:?}; detaching background reader",
+                    self.target_fd,
+                    Self::SHUTDOWN_TIMEOUT
+                );
+                drop(join);
+            }
+        }
+        self.ledger.clear();
+    }
+}
+
+fn wait_for_join(
+    handle: &thread::JoinHandle<()>,
+    timeout: Duration,
+    poll_interval: Duration,
+) -> bool {
+    if handle.is_finished() {
+        return true;
+    }
+    if timeout.is_zero() {
+        return false;
+    }
+    let mut remaining = timeout;
+    loop {
+        if handle.is_finished() {
+            return true;
+        }
+        let sleep_for = std::cmp::min(poll_interval, remaining);
+        thread::sleep(sleep_for);
+        if handle.is_finished() {
+            return true;
+        }
+        if remaining <= sleep_for {
+            return false;
+        }
+        remaining -= sleep_for;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn wait_for_join_succeeds_for_completed_thread() {
+        let handle = thread::spawn(|| {});
+        assert!(wait_for_join(
+            &handle,
+            Duration::from_millis(100),
+            Duration::from_millis(5)
+        ));
+        handle.join().expect("thread join should succeed");
+    }
+
+    #[test]
+    fn wait_for_join_times_out_for_blocked_thread() {
+        let shutdown = Arc::new(AtomicBool::new(false));
+        let worker_shutdown = shutdown.clone();
+        let handle = thread::spawn(move || {
+            while !worker_shutdown.load(Ordering::Relaxed) {
+                thread::sleep(Duration::from_millis(5));
+            }
+        });
+
+        assert!(
+            !wait_for_join(&handle, Duration::from_millis(15), Duration::from_millis(5)),
+            "wait_for_join should time out on a stuck thread"
+        );
+
+        shutdown.store(true, Ordering::Relaxed);
+        handle
+            .join()
+            .expect("thread join should succeed after shutdown signal");
+    }
+}
+
+impl Drop for StreamMirror {
+    fn drop(&mut self) {
+        self.shutdown();
+    }
+}
+
+pub struct FdMirrorController {
+    stdout: Option<StreamMirror>,
+    stderr: Option<StreamMirror>,
+}
+
+impl FdMirrorController {
+    pub fn new(
+        set: Arc<MirrorLedgerSet>,
+        consumer: Arc<dyn IoChunkConsumer>,
+    ) -> Result<Self, FdMirrorError> {
+        let stdout = set
+            .ledger_for(IoStream::Stdout)
+            .map(|ledger| StreamMirror::start(IoStream::Stdout, ledger, consumer.clone()))
+            .transpose()?;
+        let stderr = set
+            .ledger_for(IoStream::Stderr)
+            .map(|ledger| StreamMirror::start(IoStream::Stderr, ledger, consumer))
+            .transpose()?;
+        Ok(Self { stdout, stderr })
+    }
+
+    pub fn shutdown(&mut self) {
+        if let Some(stream) = self.stdout.as_mut() {
+            stream.shutdown();
+        }
+        if let Some(stream) = self.stderr.as_mut() {
+            stream.shutdown();
+        }
+        self.stdout = None;
+        self.stderr = None;
+    }
+}
+
+impl Drop for FdMirrorController {
+    fn drop(&mut self) {
+        self.shutdown();
+    }
+}
+
+#[derive(Default)]
+struct ShutdownSignal {
+    triggered: AtomicBool,
+}
+
+impl ShutdownSignal {
+    fn request_shutdown(&self) {
+        self.triggered.store(true, Ordering::Relaxed);
+    }
+
+    fn should_exit(&self) -> bool {
+        self.triggered.load(Ordering::Relaxed)
+    }
+}
+
+fn mirror_loop(
+    stream: IoStream,
+    ledger: Arc<Ledger>,
+    consumer: Arc<dyn IoChunkConsumer>,
+    read_fd: OwnedFd,
+    forward_fd: OwnedFd,
+    shutdown: Arc<ShutdownSignal>,
+) {
+    let mut buffer = vec![0u8; 8192];
+    while !shutdown.should_exit() {
+        let read = unsafe {
+            libc::read(
+                read_fd.as_raw_fd(),
+                buffer.as_mut_ptr() as *mut libc::c_void,
+                buffer.len(),
+            )
+        };
+        if read < 0 {
+            let err = io::Error::last_os_error();
+            if err.kind() == io::ErrorKind::Interrupted {
+                continue;
+            }
+            warn!("fd mirror read error on {stream}: {err}");
+            break;
+        }
+        if read == 0 {
+            break;
+        }
+
+        let payload = &buffer[..read as usize];
+        let leftover = ledger.subtract_from_chunk(payload);
+        if leftover.is_empty() {
+            continue;
+        }
+
+        if let Err(err) = write_all(forward_fd.as_raw_fd(), &leftover) {
+            warn!("fd mirror write back error on {stream}: {err}");
+            break;
+        }
+
+        let chunk = IoChunk {
+            stream,
+            payload: leftover,
+            thread_id: thread::current().id(),
+            timestamp: Instant::now(),
+            frame_id: None,
+            path_id: None,
+            line: None,
+            path: None,
+            flags: IoChunkFlags::FD_MIRROR,
+        };
+        consumer.consume(chunk);
+    }
+}
+
+fn write_all(fd: RawFd, mut data: &[u8]) -> io::Result<()> {
+    while !data.is_empty() {
+        let written = unsafe {
+            libc::write(
+                fd,
+                data.as_ptr() as *const libc::c_void,
+                data.len().min(isize::MAX as usize),
+            )
+        };
+        if written < 0 {
+            let err = io::Error::last_os_error();
+            if err.kind() == io::ErrorKind::Interrupted {
+                continue;
+            }
+            return Err(err);
+        }
+        if written == 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::WriteZero,
+                "failed to write to preserved fd",
+            ));
+        }
+        data = &data[written as usize..];
+    }
+    Ok(())
+}
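The `install.rs` module that follows swaps `sys.stdout`, `sys.stderr`, and `sys.stdin` for proxy objects that forward every call to the originals while reporting payloads to a sink. The shape of the idea in plain Python (a hypothetical `ForwardingStdout`, not the recorder's pyclass, which additionally attaches per-thread line metadata):

```python
import sys

class ForwardingStdout:
    """Pass writes through unchanged while reporting them to a sink."""

    def __init__(self, original, sink):
        self._original = original
        self._sink = sink

    def write(self, text):
        result = self._original.write(text)  # forward first: TTY behaviour stays intact
        self._sink(text)                     # then record the payload
        return result

    def __getattr__(self, name):
        # Everything else (flush, fileno, encoding, ...) falls through.
        return getattr(self._original, name)

captured = []
sys.stdout = ForwardingStdout(sys.stdout, captured.append)
print("hello")                      # still reaches the console; chunks land in `captured`
sys.stdout = sys.stdout._original   # restore, as IoStreamProxies::uninstall does
```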
diff --git a/codetracer-python-recorder/src/runtime/io_capture/install.rs b/codetracer-python-recorder/src/runtime/io_capture/install.rs
new file mode 100644
index 0000000..901b247
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/install.rs
@@ -0,0 +1,222 @@
+use crate::runtime::io_capture::events::ProxySink;
+use crate::runtime::io_capture::fd_mirror::MirrorLedgers;
+use crate::runtime::io_capture::proxies::{LineAwareStderr, LineAwareStdin, LineAwareStdout};
+use pyo3::prelude::*;
+use std::sync::Arc;
+
+#[cfg_attr(not(test), allow(dead_code))]
+/// Controller that installs the proxies and restores the original streams.
+pub struct IoStreamProxies {
+    _stdout_proxy: Py<LineAwareStdout>,
+    _stderr_proxy: Py<LineAwareStderr>,
+    _stdin_proxy: Py<LineAwareStdin>,
+    original_stdout: PyObject,
+    original_stderr: PyObject,
+    original_stdin: PyObject,
+    installed: bool,
+}
+
+#[cfg_attr(not(test), allow(dead_code))]
+impl IoStreamProxies {
+    pub fn install(
+        py: Python<'_>,
+        sink: Arc<dyn ProxySink>,
+        ledgers: Option<MirrorLedgers>,
+    ) -> PyResult<Self> {
+        let sys = py.import("sys")?;
+        let stdout_original = sys.getattr("stdout")?.unbind();
+        let stderr_original = sys.getattr("stderr")?.unbind();
+        let stdin_original = sys.getattr("stdin")?.unbind();
+
+        let stdout_proxy = Py::new(
+            py,
+            LineAwareStdout::new(stdout_original.clone_ref(py), sink.clone(), ledgers.clone()),
+        )?;
+        let stderr_proxy = Py::new(
+            py,
+            LineAwareStderr::new(stderr_original.clone_ref(py), sink.clone(), ledgers.clone()),
+        )?;
+        let stdin_proxy = Py::new(
+            py,
+            LineAwareStdin::new(stdin_original.clone_ref(py), sink.clone()),
+        )?;
+
+        sys.setattr("stdout", stdout_proxy.clone_ref(py))?;
+        sys.setattr("stderr", stderr_proxy.clone_ref(py))?;
+        sys.setattr("stdin", stdin_proxy.clone_ref(py))?;
+
+        Ok(Self {
+            _stdout_proxy: stdout_proxy,
+            _stderr_proxy: stderr_proxy,
+            _stdin_proxy: stdin_proxy,
+            original_stdout: stdout_original,
+            original_stderr: stderr_original,
+            original_stdin: stdin_original,
+            installed: true,
+        })
+    }
+
+    pub fn uninstall(&mut self, py: Python<'_>) -> PyResult<()> {
+        if !self.installed {
+            return Ok(());
+        }
+        let sys = py.import("sys")?;
+        sys.setattr("stdout", &self.original_stdout)?;
+        sys.setattr("stderr", &self.original_stderr)?;
+        sys.setattr("stdin", &self.original_stdin)?;
+        self.installed = false;
+        Ok(())
+    }
+}
+
+impl Drop for IoStreamProxies {
+    fn drop(&mut self) {
+        Python::with_gil(|py| {
+            if let Err(err) = self.uninstall(py) {
+                err.print(py);
+            }
+        });
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::runtime::io_capture::events::{IoOperation, IoStream, ProxyEvent};
+    use pyo3::Python;
+    use std::ffi::CString;
+    use std::sync::Mutex;
+
+    #[derive(Default)]
+    struct RecordingSink {
+        events: Mutex<Vec<ProxyEvent>>,
+    }
+
+    impl RecordingSink {
+        fn new() -> Self {
+            Self {
+                events: Mutex::new(Vec::new()),
+            }
+        }
+
+        fn events(&self) -> Vec<ProxyEvent> {
+            self.events.lock().expect("lock poisoned").clone()
+        }
+    }
+
+    impl ProxySink for RecordingSink {
+        fn record(&self, _py: Python<'_>, event: ProxyEvent) {
+            self.events.lock().expect("lock poisoned").push(event);
+        }
+    }
+
+    fn with_string_io<F, R>(py: Python<'_>, sink: Arc<dyn ProxySink>, func: F) -> PyResult<R>
+    where
+        F: FnOnce(&mut IoStreamProxies) -> PyResult<R>,
+    {
+        let sys = py.import("sys")?;
+        let io = py.import("io")?;
+        let stdout_buf = io.call_method0("StringIO")?;
+        let stderr_buf = io.call_method0("StringIO")?;
+        let stdin_buf = io.call_method1("StringIO", ("line1\nline2\n",))?;
+        sys.setattr("stdout", stdout_buf)?;
+        sys.setattr("stderr", stderr_buf)?;
+        sys.setattr("stdin", stdin_buf)?;
+
+        let mut proxies = IoStreamProxies::install(py, sink, None)?;
+        let result = func(&mut proxies)?;
+        proxies.uninstall(py)?;
+        Ok(result)
+    }
+
+    #[test]
+    fn stdout_write_is_captured() {
+        Python::with_gil(|py| {
+            let sink = Arc::new(RecordingSink::new());
+            with_string_io(py, sink.clone(), |_| {
+                let code = CString::new("print('hello', end='')").unwrap();
+                py.run(code.as_c_str(), None, None)?;
+                Ok(())
+            })
+            .unwrap();
+            let events = sink.events();
+            assert!(!events.is_empty());
+            assert_eq!(events[0].stream, IoStream::Stdout);
+            assert_eq!(events[0].operation, IoOperation::Write);
+            assert_eq!(std::str::from_utf8(&events[0].payload).unwrap(), "hello");
"hello"); + }); + } + + #[test] + fn stderr_write_is_captured() { + Python::with_gil(|py| { + let sink = Arc::new(RecordingSink::new()); + with_string_io(py, sink.clone(), |_| { + let code = CString::new("import sys\nsys.stderr.write('oops')").unwrap(); + py.run(code.as_c_str(), None, None)?; + Ok(()) + }) + .unwrap(); + let events = sink.events(); + assert!(!events.is_empty()); + assert_eq!(events[0].stream, IoStream::Stderr); + assert_eq!(events[0].operation, IoOperation::Write); + assert_eq!(std::str::from_utf8(&events[0].payload).unwrap(), "oops"); + }); + } + + #[test] + fn stdin_read_is_captured() { + Python::with_gil(|py| { + let sink = Arc::new(RecordingSink::new()); + with_string_io(py, sink.clone(), |_| { + let code = CString::new("import sys\n_ = sys.stdin.readline()").unwrap(); + py.run(code.as_c_str(), None, None)?; + Ok(()) + }) + .unwrap(); + let events = sink.events(); + assert!(!events.is_empty()); + let latest = events.last().unwrap(); + assert_eq!(latest.stream, IoStream::Stdin); + assert_eq!(latest.operation, IoOperation::ReadLine); + assert_eq!(std::str::from_utf8(&latest.payload).unwrap(), "line1\n"); + }); + } + + #[test] + fn reentrant_sink_does_not_loop() { + #[derive(Default)] + struct Reentrant { + inner: RecordingSink, + } + + impl ProxySink for Reentrant { + fn record(&self, py: Python<'_>, event: ProxyEvent) { + self.inner.record(py, event.clone()); + let _ = py + .import("sys") + .and_then(|sys| sys.getattr("stdout")) + .and_then(|stdout| stdout.call_method1("write", ("[sink]",))); + } + } + + Python::with_gil(|py| { + let sink = Arc::new(Reentrant::default()); + with_string_io(py, sink.clone(), |_| { + let code = CString::new("print('loop')").unwrap(); + py.run(code.as_c_str(), None, None)?; + Ok(()) + }) + .unwrap(); + let events = sink.inner.events(); + let payloads: Vec<&[u8]> = events + .iter() + .map(|event| event.payload.as_slice()) + .filter(|payload| !payload.is_empty() && *payload != b"\n") + .collect(); + assert_eq!(payloads.len(), 1); + assert_eq!(payloads[0], b"loop"); + }); + } +} diff --git a/codetracer-python-recorder/src/runtime/io_capture/mod.rs b/codetracer-python-recorder/src/runtime/io_capture/mod.rs new file mode 100644 index 0000000..bb785f3 --- /dev/null +++ b/codetracer-python-recorder/src/runtime/io_capture/mod.rs @@ -0,0 +1,22 @@ +pub mod events; +pub mod fd_mirror; +pub mod install; +pub mod mute; +pub mod pipeline; +pub mod proxies; +pub mod sink; + +#[allow(unused_imports)] +pub use events::{IoOperation, IoStream, NullSink, ProxyEvent, ProxySink}; +#[allow(unused_imports)] +pub use fd_mirror::{FdMirrorController, MirrorLedgers}; +#[allow(unused_imports)] +pub use install::IoStreamProxies; +#[allow(unused_imports)] +pub use mute::{is_io_capture_muted, ScopedMuteIoCapture}; +#[allow(unused_imports)] +pub use pipeline::{IoCapturePipeline, IoCaptureSettings}; +#[allow(unused_imports)] +pub use proxies::{LineAwareStderr, LineAwareStdin, LineAwareStdout}; +#[allow(unused_imports)] +pub use sink::{IoChunk, IoChunkConsumer, IoChunkFlags, IoEventSink}; diff --git a/codetracer-python-recorder/src/runtime/io_capture/mute.rs b/codetracer-python-recorder/src/runtime/io_capture/mute.rs new file mode 100644 index 0000000..0967dff --- /dev/null +++ b/codetracer-python-recorder/src/runtime/io_capture/mute.rs @@ -0,0 +1,35 @@ +use std::cell::Cell; + +thread_local! { + static IO_CAPTURE_MUTE_DEPTH: Cell = const { Cell::new(0) }; +} + +/// RAII guard that suppresses IO capture while it is alive. 
diff --git a/codetracer-python-recorder/src/runtime/io_capture/mute.rs b/codetracer-python-recorder/src/runtime/io_capture/mute.rs
new file mode 100644
index 0000000..0967dff
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/mute.rs
@@ -0,0 +1,35 @@
+use std::cell::Cell;
+
+thread_local! {
+    static IO_CAPTURE_MUTE_DEPTH: Cell<usize> = const { Cell::new(0) };
+}
+
+/// RAII guard that suppresses IO capture while it is alive.
+pub struct ScopedMuteIoCapture;
+
+impl ScopedMuteIoCapture {
+    pub fn new() -> Self {
+        IO_CAPTURE_MUTE_DEPTH.with(|depth| depth.set(depth.get().saturating_add(1)));
+        ScopedMuteIoCapture
+    }
+}
+
+impl Default for ScopedMuteIoCapture {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Drop for ScopedMuteIoCapture {
+    fn drop(&mut self) {
+        IO_CAPTURE_MUTE_DEPTH.with(|depth| {
+            let current = depth.get();
+            depth.set(current.saturating_sub(1));
+        });
+    }
+}
+
+/// Returns true when IO capture should be bypassed for the current thread.
+pub fn is_io_capture_muted() -> bool {
+    IO_CAPTURE_MUTE_DEPTH.with(|depth| depth.get() > 0)
+}
diff --git a/codetracer-python-recorder/src/runtime/io_capture/pipeline.rs b/codetracer-python-recorder/src/runtime/io_capture/pipeline.rs
new file mode 100644
index 0000000..2fcdf99
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/pipeline.rs
@@ -0,0 +1,134 @@
+//! Assembly and lifetime management for the IO capture pipeline.
+
+use super::fd_mirror::{FdMirrorController, MirrorLedgers};
+use super::install::IoStreamProxies;
+use super::sink::{IoChunk, IoChunkConsumer, IoEventSink};
+use crate::runtime::io_capture::events::ProxySink;
+use crate::runtime::line_snapshots::LineSnapshotStore;
+use pyo3::prelude::*;
+use std::sync::{Arc, Mutex};
+use std::thread::ThreadId;
+
+/// Configuration flags controlling which capture components install.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct IoCaptureSettings {
+    pub line_proxies: bool,
+    pub fd_mirror: bool,
+}
+
+pub struct IoCapturePipeline {
+    sink: Arc<IoEventSink>,
+    buffer: Arc<IoChunkBuffer>,
+    proxies: Option<IoStreamProxies>,
+    fd_mirror: Option<FdMirrorController>,
+}
+
+impl IoCapturePipeline {
+    /// Install the requested capture pipeline components.
+    pub fn install(
+        py: Python<'_>,
+        snapshots: Arc<LineSnapshotStore>,
+        settings: IoCaptureSettings,
+    ) -> PyResult<Option<Self>> {
+        if !settings.line_proxies {
+            return Ok(None);
+        }
+
+        let buffer = Arc::new(IoChunkBuffer::new());
+        let consumer: Arc<dyn IoChunkConsumer> = buffer.clone();
+
+        let mirror_ledgers = if settings.fd_mirror {
+            let ledgers = MirrorLedgers::new_enabled();
+            if !ledgers.is_enabled() {
+                log::warn!("fd_fallback requested but not supported on this platform");
+                None
+            } else {
+                Some(ledgers)
+            }
+        } else {
+            None
+        };
+
+        let sink = Arc::new(IoEventSink::new(consumer.clone(), Arc::clone(&snapshots)));
+        let sink_for_proxies: Arc<dyn ProxySink> = sink.clone();
+        let proxies = IoStreamProxies::install(py, sink_for_proxies, mirror_ledgers.clone())?;
+
+        let fd_mirror = if let Some(ledgers) = mirror_ledgers {
+            match FdMirrorController::new(ledgers.clone(), consumer.clone()) {
+                Ok(controller) => Some(controller),
+                Err(err) => {
+                    log::warn!("failed to enable FD mirror fallback: {err}");
+                    None
+                }
+            }
+        } else {
+            None
+        };
+
+        Ok(Some(Self {
+            sink,
+            buffer,
+            proxies: Some(proxies),
+            fd_mirror,
+        }))
+    }
+
+    /// Flush buffered output for a specific thread before emitting a Step event.
+    pub fn flush_before_step(&self, thread_id: ThreadId) {
+        self.sink.flush_before_step(thread_id);
+    }
+
+    /// Drain all in-flight output regardless of thread affinity.
+    pub fn flush_all(&self) {
+        self.sink.flush_all();
+    }
+
+    /// Take ownership of the buffered IO chunks accumulated so far.
+    pub fn drain_chunks(&self) -> Vec<IoChunk> {
+        self.buffer.drain()
+    }
+
+    /// Restore the original IO streams and tear down the FD mirror, if present.
+    pub fn uninstall(&mut self, py: Python<'_>) {
+        if let Some(mut mirror) = self.fd_mirror.take() {
+            mirror.shutdown();
+        }
+        if let Some(mut proxies) = self.proxies.take() {
+            if let Err(err) = proxies.uninstall(py) {
+                err.print(py);
+            }
+        }
+    }
+}
+
+impl Drop for IoCapturePipeline {
+    fn drop(&mut self) {
+        Python::with_gil(|py| {
+            self.uninstall(py);
+        });
+    }
+}
+
+struct IoChunkBuffer {
+    queue: Mutex<Vec<IoChunk>>,
+}
+
+impl IoChunkBuffer {
+    fn new() -> Self {
+        Self {
+            queue: Mutex::new(Vec::new()),
+        }
+    }
+
+    fn drain(&self) -> Vec<IoChunk> {
+        let mut guard = self.queue.lock().expect("io chunk buffer poisoned");
+        std::mem::take(&mut *guard)
+    }
+}
+
+impl IoChunkConsumer for IoChunkBuffer {
+    fn consume(&self, chunk: IoChunk) {
+        let mut guard = self.queue.lock().expect("io chunk buffer poisoned");
+        guard.push(chunk);
+    }
+}
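`common.rs` below guards against sinks that write back to the proxies while a proxied call is already on the stack: the write is still forwarded, but recording is skipped. The control flow, reduced to Python with hypothetical names:

```python
import threading

_in_callback = threading.local()

def proxied_write(original_write, record, text):
    first_entry = not getattr(_in_callback, "active", False)
    _in_callback.active = True
    try:
        result = original_write(text)  # always forward to the real stream
        if first_entry:
            record(text)               # record only on the outermost call
        return result
    finally:
        if first_entry:
            _in_callback.active = False
```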
diff --git a/codetracer-python-recorder/src/runtime/io_capture/proxies/common.rs b/codetracer-python-recorder/src/runtime/io_capture/proxies/common.rs
new file mode 100644
index 0000000..388bc3c
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/proxies/common.rs
@@ -0,0 +1,212 @@
+use crate::runtime::io_capture::events::{IoOperation, IoStream, ProxyEvent, ProxySink};
+use crate::runtime::io_capture::fd_mirror::{LedgerTicket, MirrorLedgers};
+use pyo3::prelude::*;
+use pyo3::types::{PyAny, PyAnyMethods, PyList, PyTuple};
+use pyo3::IntoPyObject;
+use std::sync::Arc;
+use std::thread::{self, ThreadId};
+use std::time::Instant;
+
+pub(crate) fn current_thread_id() -> ThreadId {
+    thread::current().id()
+}
+
+pub(crate) fn now() -> Instant {
+    Instant::now()
+}
+
+fn build_iterator_list(iterable: &Bound<'_, PyAny>) -> PyResult<(Vec<String>, Py<PyList>)> {
+    let mut iterator = iterable.try_iter()?;
+    let mut captured = Vec::new();
+    while let Some(item) = iterator.next() {
+        let obj = item?;
+        captured.push(obj.extract::<String>()?);
+    }
+    let py_list = PyList::new(iterable.py(), &captured)?.unbind();
+    Ok((captured, py_list))
+}
+
+pub(crate) fn buffer_snapshot(buffer: &Bound<'_, PyAny>) -> Option<Vec<u8>> {
+    buffer
+        .call_method0("__bytes__")
+        .ok()
+        .and_then(|obj| obj.extract::<Vec<u8>>().ok())
+}
+
+// Thread-local guard to prevent recursion when sinks write back to the proxies.
+//
+// Reentrancy hazard and rationale:
+//
+// - ProxySink::record implementations may perform Python I/O (e.g. sys.stdout.write or sys.stderr.write)
+//   while we are already inside a proxied I/O call (stdout/stderr writes or stdin reads).
+// - Without a guard, those sink-triggered writes would re-enter these proxies, which would call the sink
+//   again, and so on. That can cause infinite recursion, stack overflow, and duplicate event capture.
+//
+// How we avoid it:
+//
+// - On first entry into a proxied I/O method we set a thread-local flag.
+// - While that flag is set, we still forward I/O to the original Python object, but we skip recording.
+// - This allows sink-triggered I/O to pass through to Python without being captured, breaking the cycle.
+//
+// See test coverage in `install.rs`.
+thread_local! {
+    static IN_PROXY_CALLBACK: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
+}
+
+pub(crate) fn enter_reentrancy_guard() -> bool {
+    IN_PROXY_CALLBACK.with(|flag| {
+        if flag.get() {
+            false
+        } else {
+            flag.set(true);
+            true
+        }
+    })
+}
+
+pub(crate) fn exit_reentrancy_guard(entered: bool) {
+    if entered {
+        IN_PROXY_CALLBACK.with(|flag| flag.set(false));
+    }
+}
+
+#[allow(dead_code)]
+pub(crate) struct OutputProxy {
+    pub(crate) original: PyObject,
+    sink: Arc<dyn ProxySink>,
+    stream: IoStream,
+    ledgers: Option<MirrorLedgers>,
+}
+
+impl OutputProxy {
+    pub(crate) fn new(
+        original: PyObject,
+        sink: Arc<dyn ProxySink>,
+        stream: IoStream,
+        ledgers: Option<MirrorLedgers>,
+    ) -> Self {
+        Self {
+            original,
+            sink,
+            stream,
+            ledgers,
+        }
+    }
+
+    fn record(&self, py: Python<'_>, operation: IoOperation, payload: Vec<u8>) {
+        let event = ProxyEvent {
+            stream: self.stream,
+            operation,
+            payload,
+            thread_id: current_thread_id(),
+            timestamp: now(),
+            frame_id: None,
+            path_id: None,
+            line: None,
+            path: None,
+        };
+        self.sink.record(py, event);
+    }
+
+    fn begin_ledger_entry(&self, payload: &[u8]) -> Option<LedgerTicket> {
+        if payload.is_empty() {
+            return None;
+        }
+        self.ledgers
+            .as_ref()
+            .and_then(|ledgers| ledgers.begin_proxy_write(self.stream, payload))
+    }
+
+    fn call_method_with_payload<'py, A>(
+        &self,
+        py: Python<'py>,
+        method: &str,
+        args: A,
+        payload: Option<Vec<u8>>,
+        operation: IoOperation,
+    ) -> PyResult<Py<PyAny>>
+    where
+        A: IntoPyObject<'py, Target = PyTuple>,
+    {
+        let entered = enter_reentrancy_guard();
+        let mut ticket: Option<LedgerTicket> = None;
+        if entered {
+            if let Some(bytes) = payload.as_ref() {
+                ticket = self.begin_ledger_entry(bytes);
+            }
+        }
+
+        let result = self
+            .original
+            .call_method1(py, method, args)
+            .map(|value| value.into());
+
+        if entered {
+            if let (Ok(_), Some(data)) = (&result, payload) {
+                self.record(py, operation, data);
+                if let Some(ticket) = ticket.take() {
+                    ticket.commit();
+                }
+            }
+        }
+
+        exit_reentrancy_guard(entered);
+        result
+    }
+
+    pub(crate) fn passthrough<'py, A>(
+        &self,
+        py: Python<'py>,
+        method: &str,
+        args: A,
+    ) -> PyResult<Py<PyAny>>
+    where
+        A: IntoPyObject<'py, Target = PyTuple>,
+    {
+        self.original
+            .call_method1(py, method, args)
+            .map(|value| value.into())
+    }
+
+    pub(crate) fn write(&self, py: Python<'_>, text: &Bound<'_, PyAny>) -> PyResult<Py<PyAny>> {
+        let captured = text.extract::<String>()?.into_bytes();
+        let args = (text.clone().unbind(),);
+        self.call_method_with_payload(py, "write", args, Some(captured), IoOperation::Write)
+    }
+
+    pub(crate) fn writelines(
+        &self,
+        py: Python<'_>,
+        lines: &Bound<'_, PyAny>,
+    ) -> PyResult<Py<PyAny>> {
+        let (captured, replay) = build_iterator_list(lines)?;
+        let payload = captured.join("").into_bytes();
+        self.call_method_with_payload(
+            py,
+            "writelines",
+            (replay.clone_ref(py),),
+            Some(payload),
+            IoOperation::Writelines,
+        )
+    }
+
+    pub(crate) fn flush(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+        self.call_method_with_payload(py, "flush", (), Some(Vec::new()), IoOperation::Flush)
+    }
+
+    pub(crate) fn fileno(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+        self.passthrough(py, "fileno", ())
+    }
+
+    pub(crate) fn isatty(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+        self.passthrough(py, "isatty", ())
+    }
+
+    pub(crate) fn close(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+        self.passthrough(py, "close", ())
+    }
+
+    pub(crate) fn getattr(&self, py: Python<'_>, name: &str) -> PyResult<PyObject> {
+        self.original.bind(py).getattr(name).map(|obj| obj.unbind())
+    }
+}
diff --git a/codetracer-python-recorder/src/runtime/io_capture/proxies/input.rs b/codetracer-python-recorder/src/runtime/io_capture/proxies/input.rs
new file mode 100644
index 0000000..8ed7c0d
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/proxies/input.rs
@@ -0,0 +1,151 @@
+use super::common::{
+    buffer_snapshot, current_thread_id, enter_reentrancy_guard, exit_reentrancy_guard, now,
+};
+use crate::runtime::io_capture::events::{IoOperation, IoStream, ProxyEvent, ProxySink};
+use pyo3::exceptions::PyStopIteration;
+use pyo3::prelude::*;
+use pyo3::types::{PyAny, PyAnyMethods};
+use std::sync::Arc;
+
+#[pyclass(module = "codetracer_python_recorder.runtime")]
+pub struct LineAwareStdin {
+    original: PyObject,
+    sink: Arc<dyn ProxySink>,
+}
+
+impl LineAwareStdin {
+    pub fn new(original: PyObject, sink: Arc<dyn ProxySink>) -> Self {
+        Self { original, sink }
+    }
+
+    fn record(&self, py: Python<'_>, operation: IoOperation, payload: Vec<u8>) {
+        let event = ProxyEvent {
+            stream: IoStream::Stdin,
+            operation,
+            payload,
+            thread_id: current_thread_id(),
+            timestamp: now(),
+            frame_id: None,
+            path_id: None,
+            line: None,
+            path: None,
+        };
+        self.sink.record(py, event);
+    }
+}
+
+#[pymethods]
+impl LineAwareStdin {
+    #[pyo3(signature = (size=None))]
+    fn read(&self, py: Python<'_>, size: Option<isize>) -> PyResult<Py<PyAny>> {
+        let entered = enter_reentrancy_guard();
+        let result: PyResult<Py<PyAny>> = match size {
+            Some(n) => self
+                .original
+                .call_method1(py, "read", (n,))
+                .map(|value| value.into()),
+            None => self
+                .original
+                .call_method1(py, "read", ())
+                .map(|value| value.into()),
+        };
+        if entered {
+            if let Ok(ref obj) = result {
+                let bound = obj.bind(py);
+                if let Ok(text) = bound.extract::<String>() {
+                    if !text.is_empty() {
+                        self.record(py, IoOperation::Read, text.into_bytes());
+                    }
+                }
+            }
+        }
+        exit_reentrancy_guard(entered);
+        result
+    }
+
+    #[pyo3(signature = (limit=None))]
+    fn readline(&self, py: Python<'_>, limit: Option<isize>) -> PyResult<Py<PyAny>> {
+        let entered = enter_reentrancy_guard();
+        let result: PyResult<Py<PyAny>> = match limit {
+            Some(n) => self
+                .original
+                .call_method1(py, "readline", (n,))
+                .map(|value| value.into()),
+            None => self
+                .original
+                .call_method1(py, "readline", ())
+                .map(|value| value.into()),
+        };
+        if entered {
+            if let Ok(ref obj) = result {
+                let bound = obj.bind(py);
+                if let Ok(text) = bound.extract::<String>() {
+                    if !text.is_empty() {
+                        self.record(py, IoOperation::ReadLine, text.into_bytes());
+                    }
+                }
+            }
+        }
+        exit_reentrancy_guard(entered);
+        result
+    }
+
+    fn readinto(&self, py: Python<'_>, buffer: &Bound<'_, PyAny>) -> PyResult<Py<PyAny>> {
+        let entered = enter_reentrancy_guard();
+        let args = (buffer.clone().unbind(),);
+        let result: PyResult<Py<PyAny>> = self
+            .original
+            .call_method1(py, "readinto", args)
+            .map(|value| value.into());
+        if entered {
+            if let Ok(ref obj) = result {
+                if let Some(mut bytes) = buffer_snapshot(buffer) {
+                    if let Ok(count) = obj.bind(py).extract::<usize>() {
+                        let count = count.min(bytes.len());
+                        if count > 0 {
+                            bytes.truncate(count);
+                            self.record(py, IoOperation::ReadInto, bytes);
+                        }
+                    }
+                }
+            }
+        }
+        exit_reentrancy_guard(entered);
+        result
+    }
+
+    fn fileno(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+        self.original
+            .call_method1(py, "fileno", ())
+            .map(|value| value.into())
+    }
+
+    fn isatty(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+        self.original
+            .call_method1(py, "isatty", ())
+            .map(|value| value.into())
+    }
+
+    fn close(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+        self.original
+            .call_method1(py, "close", ())
+            .map(|value| value.into())
+    }
+
+    fn __iter__(slf: PyRef<Self>) -> Py<Self> {
+        slf.into()
+    }
+
+    fn __next__(&self, py: Python<'_>) -> PyResult<Option<Py<PyAny>>> {
+        let line = self.readline(py, None)?;
+        if line.bind(py).extract::<String>()?.is_empty() {
+            Err(PyStopIteration::new_err(()))
+        } else {
+            Ok(Some(line))
+        }
+    }
+
+    fn __getattr__(&self, py: Python<'_>, name: &str) -> PyResult<PyObject> {
+        self.original.bind(py).getattr(name).map(|obj| obj.unbind())
+    }
+}
diff --git a/codetracer-python-recorder/src/runtime/io_capture/proxies/mod.rs b/codetracer-python-recorder/src/runtime/io_capture/proxies/mod.rs
new file mode 100644
index 0000000..2584442
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/proxies/mod.rs
@@ -0,0 +1,6 @@
+mod common;
+mod input;
+mod output;
+
+pub use input::LineAwareStdin;
+pub use output::{LineAwareStderr, LineAwareStdout};
diff --git a/codetracer-python-recorder/src/runtime/io_capture/proxies/output.rs b/codetracer-python-recorder/src/runtime/io_capture/proxies/output.rs
new file mode 100644
index 0000000..e8bd800
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/proxies/output.rs
@@ -0,0 +1,76 @@
+use super::common::OutputProxy;
+use crate::runtime::io_capture::events::IoStream;
+use crate::runtime::io_capture::fd_mirror::MirrorLedgers;
+use pyo3::prelude::*;
+use pyo3::types::PyAny;
+use std::sync::Arc;
+
+macro_rules! define_output_proxy {
+    ($name:ident, $stream:expr) => {
+        #[pyclass(module = "codetracer_python_recorder.runtime")]
+        pub struct $name {
+            inner: OutputProxy,
+        }
+
+        impl $name {
+            pub fn new(
+                original: PyObject,
+                sink: Arc<dyn crate::runtime::io_capture::events::ProxySink>,
+                ledgers: Option<MirrorLedgers>,
+            ) -> Self {
+                Self {
+                    inner: OutputProxy::new(original, sink, $stream, ledgers),
+                }
+            }
+        }
+
+        #[pymethods]
+        impl $name {
+            fn write(&self, py: Python<'_>, text: &Bound<'_, PyAny>) -> PyResult<Py<PyAny>> {
+                self.inner.write(py, text)
+            }
+
+            fn writelines(&self, py: Python<'_>, lines: &Bound<'_, PyAny>) -> PyResult<Py<PyAny>> {
+                self.inner.writelines(py, lines)
+            }
+
+            fn flush(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+                self.inner.flush(py)
+            }
+
+            fn fileno(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+                self.inner.fileno(py)
+            }
+
+            fn isatty(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+                self.inner.isatty(py)
+            }
+
+            fn close(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+                self.inner.close(py)
+            }
+
+            #[getter]
+            fn encoding(&self, py: Python<'_>) -> PyResult<PyObject> {
+                self.inner.getattr(py, "encoding")
+            }
+
+            #[getter]
+            fn errors(&self, py: Python<'_>) -> PyResult<PyObject> {
+                self.inner.getattr(py, "errors")
+            }
+
+            #[getter]
+            fn buffer(&self, py: Python<'_>) -> PyResult<PyObject> {
+                self.inner.getattr(py, "buffer")
+            }
+
+            fn __getattr__(&self, py: Python<'_>, name: &str) -> PyResult<PyObject> {
+                self.inner.getattr(py, name)
+            }
+        }
+    };
+}
+
+define_output_proxy!(LineAwareStdout, IoStream::Stdout);
+define_output_proxy!(LineAwareStderr, IoStream::Stderr);
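`batcher.rs` below coalesces raw write events per thread and stream, emitting a chunk on a newline, an explicit `flush()`, a step boundary, or when a buffer sits idle past `MAX_BATCH_AGE` (5 ms). A compact Python sketch of the newline/idle split rule (timestamps are plain floats here; the Rust code uses `Instant` and carries frame metadata alongside the bytes):

```python
MAX_BATCH_AGE = 0.005  # seconds, mirroring the 5 ms constant

class StreamBatch:
    def __init__(self, emit):
        self.buf = bytearray()
        self.last = None
        self.emit = emit  # called with (payload, reason) for each completed chunk

    def write(self, payload: bytes, now: float) -> None:
        if self.buf and self.last is not None and now - self.last >= MAX_BATCH_AGE:
            self.emit(bytes(self.buf), "time-split")  # stale buffer: flush it first
            self.buf.clear()
        self.buf.extend(payload)
        self.last = now
        while (i := self.buf.find(b"\n")) != -1:       # emit each completed line
            self.emit(bytes(self.buf[: i + 1]), "newline")
            del self.buf[: i + 1]

    def flush(self, now: float, reason: str = "explicit-flush") -> None:
        if self.buf:
            self.emit(bytes(self.buf), reason)
            self.buf.clear()
        self.last = now
```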
Arc) -> Self { + Self::with_time_source(consumer, Arc::new(Instant::now)) + } + + pub fn with_time_source( + consumer: Arc, + time_source: Arc Instant + Send + Sync>, + ) -> Self { + Self { + consumer, + state: Mutex::new(IoSinkState::default()), + time_source, + } + } + + fn now(&self) -> Instant { + (self.time_source)() + } + + pub fn flush_before_step(&self, thread_id: ThreadId) { + let timestamp = self.now(); + let mut state = self.state.lock().expect("lock poisoned"); + if let Some(buffers) = state.threads.get_mut(&thread_id) { + buffers.flush_all( + thread_id, + timestamp, + IoChunkFlags::STEP_BOUNDARY, + &*self.consumer, + ); + } + } + + pub fn flush_all(&self) { + let timestamp = self.now(); + let mut state = self.state.lock().expect("lock poisoned"); + for (thread_id, buffers) in state.threads.iter_mut() { + buffers.flush_all( + *thread_id, + timestamp, + IoChunkFlags::STEP_BOUNDARY, + &*self.consumer, + ); + } + } + + pub fn handle_output(&self, mut event: ProxyEvent) { + let mut state = self.state.lock().expect("lock poisoned"); + let buffers = state + .threads + .entry(event.thread_id) + .or_insert_with(ThreadBuffers::new); + let buffer = buffers.buffer_mut(event.stream); + + if buffer.is_stale(event.timestamp) { + let flush_timestamp = buffer.last_timestamp.unwrap_or(event.timestamp); + buffer.emit( + event.thread_id, + event.stream, + flush_timestamp, + IoChunkFlags::TIME_SPLIT, + &*self.consumer, + ); + } + + match event.operation { + IoOperation::Write | IoOperation::Writelines => { + if event.payload.is_empty() { + return; + } + buffer.append( + &event.payload, + event.frame_id, + event.path_id, + event.line, + event.path.take(), + event.timestamp, + ); + buffer.flush_complete_lines( + event.thread_id, + event.stream, + event.timestamp, + &*self.consumer, + ); + } + IoOperation::Flush => { + buffer.emit( + event.thread_id, + event.stream, + event.timestamp, + IoChunkFlags::EXPLICIT_FLUSH, + &*self.consumer, + ); + } + _ => {} + } + } + + pub fn handle_input(&self, event: ProxyEvent) { + if event.payload.is_empty() { + return; + } + let chunk = IoChunk { + stream: IoStream::Stdin, + payload: event.payload, + thread_id: event.thread_id, + timestamp: event.timestamp, + frame_id: event.frame_id, + path_id: event.path_id, + line: event.line, + path: event.path, + flags: IoChunkFlags::INPUT_CHUNK, + }; + self.consumer.consume(chunk); + } +} + +#[derive(Default)] +struct IoSinkState { + threads: HashMap, +} + +struct ThreadBuffers { + stdout: StreamBuffer, + stderr: StreamBuffer, +} + +impl ThreadBuffers { + fn new() -> Self { + Self { + stdout: StreamBuffer::new(), + stderr: StreamBuffer::new(), + } + } + + fn buffer_mut(&mut self, stream: IoStream) -> &mut StreamBuffer { + match stream { + IoStream::Stdout => &mut self.stdout, + IoStream::Stderr => &mut self.stderr, + IoStream::Stdin => panic!("stdin does not use output buffers"), + } + } + + fn flush_all( + &mut self, + thread_id: ThreadId, + timestamp: Instant, + flags: IoChunkFlags, + consumer: &dyn IoChunkConsumer, + ) { + for stream in [IoStream::Stdout, IoStream::Stderr] { + let buffer = self.buffer_mut(stream); + buffer.emit(thread_id, stream, timestamp, flags, consumer); + } + } +} + +struct StreamBuffer { + payload: Vec, + last_timestamp: Option, + frame_id: Option, + path_id: Option, + line: Option, + path: Option, +} + +impl StreamBuffer { + fn new() -> Self { + Self { + payload: Vec::new(), + last_timestamp: None, + frame_id: None, + path_id: None, + line: None, + path: None, + } + } + + fn append( + &mut self, + 
payload: &[u8], + frame_id: Option, + path_id: Option, + line: Option, + path: Option, + timestamp: Instant, + ) { + if let Some(id) = frame_id { + self.frame_id = Some(id); + } + if let Some(id) = path_id { + self.path_id = Some(id); + } + if let Some(line) = line { + self.line = Some(line); + } + if let Some(path) = path { + self.path = Some(path); + } + self.payload.extend_from_slice(payload); + self.last_timestamp = Some(timestamp); + } + + fn take_all(&mut self) -> Option> { + if self.payload.is_empty() { + return None; + } + Some(std::mem::take(&mut self.payload)) + } + + fn emit( + &mut self, + thread_id: ThreadId, + stream: IoStream, + timestamp: Instant, + flags: IoChunkFlags, + consumer: &dyn IoChunkConsumer, + ) { + if let Some(payload) = self.take_all() { + let chunk = IoChunk { + stream, + payload, + thread_id, + timestamp, + frame_id: self.frame_id, + path_id: self.path_id, + line: self.line, + path: self.path.take(), + flags, + }; + self.frame_id = None; + self.path_id = None; + self.line = None; + self.path = None; + self.last_timestamp = Some(timestamp); + consumer.consume(chunk); + } + } + + fn flush_complete_lines( + &mut self, + thread_id: ThreadId, + stream: IoStream, + timestamp: Instant, + consumer: &dyn IoChunkConsumer, + ) { + while let Some(index) = self.payload.iter().position(|byte| *byte == b'\n') { + let prefix: Vec = self.payload.drain(..=index).collect(); + let chunk = IoChunk { + stream, + payload: prefix, + thread_id, + timestamp, + frame_id: self.frame_id, + path_id: self.path_id, + line: self.line, + path: self.path.clone(), + flags: IoChunkFlags::NEWLINE_TERMINATED, + }; + consumer.consume(chunk); + if self.payload.is_empty() { + self.frame_id = None; + self.path_id = None; + self.line = None; + self.path = None; + } + self.last_timestamp = Some(timestamp); + } + } + + fn is_stale(&self, now: Instant) -> bool { + if self.payload.is_empty() { + return false; + } + match self.last_timestamp { + Some(last) => now + .checked_duration_since(last) + .map(|elapsed| elapsed >= MAX_BATCH_AGE) + .unwrap_or(false), + None => false, + } + } +} diff --git a/codetracer-python-recorder/src/runtime/io_capture/sink/enricher.rs b/codetracer-python-recorder/src/runtime/io_capture/sink/enricher.rs new file mode 100644 index 0000000..e12b555 --- /dev/null +++ b/codetracer-python-recorder/src/runtime/io_capture/sink/enricher.rs @@ -0,0 +1,97 @@ +use crate::runtime::io_capture::events::ProxyEvent; +use crate::runtime::io_capture::mute::is_io_capture_muted; +use crate::runtime::line_snapshots::LineSnapshotStore; +use pyo3::types::PyAnyMethods; +use pyo3::Python; +use runtime_tracing::Line; +use std::sync::Arc; + +pub struct EventEnricher { + snapshots: Arc, +} + +impl EventEnricher { + pub fn new(snapshots: Arc) -> Self { + Self { snapshots } + } + + pub fn enrich(&self, py: Python<'_>, mut event: ProxyEvent) -> Option { + if is_io_capture_muted() { + return None; + } + + if event.frame_id.is_none() || event.path_id.is_none() || event.line.is_none() { + if let Some(snapshot) = self.snapshots.snapshot_for_thread(event.thread_id) { + if event.frame_id.is_none() { + event.frame_id = Some(snapshot.frame_id()); + } + if event.path_id.is_none() { + event.path_id = Some(snapshot.path_id()); + } + if event.line.is_none() { + event.line = Some(snapshot.line()); + } + } + } + + if event.line.is_none() || (event.path_id.is_none() && event.path.is_none()) { + populate_from_stack(py, &mut event); + } + + Some(event) + } +} + +fn populate_from_stack(py: Python<'_>, event: &mut ProxyEvent) { 
+    if event.line.is_some() && (event.path_id.is_some() || event.path.is_some()) {
+        return;
+    }
+
+    let frame_result = (|| {
+        let sys = py.import("sys")?;
+        sys.getattr("_getframe")
+    })();
+
+    let getframe = match frame_result {
+        Ok(obj) => obj,
+        Err(_) => return,
+    };
+
+    for depth in [2_i32, 1, 0] {
+        let frame_obj = match getframe.call1((depth,)) {
+            Ok(frame) => frame,
+            Err(_) => continue,
+        };
+
+        let frame = frame_obj;
+
+        if event.line.is_none() {
+            if let Ok(lineno) = frame
+                .getattr("f_lineno")
+                .and_then(|obj| obj.extract::<u32>())
+            {
+                event.line = Some(Line(lineno as i64));
+            }
+        }
+
+        if event.path.is_none() {
+            if let Ok(code) = frame.getattr("f_code") {
+                if let Ok(filename) = code
+                    .getattr("co_filename")
+                    .and_then(|obj| obj.extract::<String>())
+                {
+                    event.path = Some(filename);
+                }
+            }
+        }
+
+        if event.frame_id.is_none() {
+            let raw = frame.as_ptr() as usize as u64;
+            event.frame_id = Some(crate::runtime::line_snapshots::FrameId::from_raw(raw));
+        }
+
+        if event.line.is_some() && (event.path_id.is_some() || event.path.is_some()) {
+            break;
+        }
+    }
+}
diff --git a/codetracer-python-recorder/src/runtime/io_capture/sink/mod.rs b/codetracer-python-recorder/src/runtime/io_capture/sink/mod.rs
new file mode 100644
index 0000000..bf455b9
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/sink/mod.rs
@@ -0,0 +1,287 @@
+mod batcher;
+mod enricher;
+mod types;
+
+pub use batcher::IoChunkBatcher;
+pub use enricher::EventEnricher;
+pub use types::{IoChunk, IoChunkConsumer, IoChunkFlags};
+
+use crate::runtime::io_capture::events::{IoStream, ProxyEvent, ProxySink};
+use crate::runtime::line_snapshots::LineSnapshotStore;
+use pyo3::Python;
+use std::sync::Arc;
+use std::thread::ThreadId;
+
+/// Batching sink that groups proxy events into line-aware IO chunks.
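+///
+/// Events are enriched with the last `{path_id, line, frame_id}` snapshot
+/// recorded for the producing thread, then buffered until a newline, an
+/// explicit `flush()`, a stale-batch timeout, or a step boundary flushes them.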
+pub struct IoEventSink { + enricher: EventEnricher, + batcher: IoChunkBatcher, +} + +impl IoEventSink { + pub fn new(consumer: Arc, snapshots: Arc) -> Self { + let enricher = EventEnricher::new(Arc::clone(&snapshots)); + let batcher = IoChunkBatcher::new(consumer); + Self { enricher, batcher } + } + + #[cfg(test)] + pub(crate) fn with_time_source( + consumer: Arc, + snapshots: Arc, + time_source: Arc std::time::Instant + Send + Sync>, + ) -> Self { + let enricher = EventEnricher::new(Arc::clone(&snapshots)); + let batcher = IoChunkBatcher::with_time_source(consumer, time_source); + Self { enricher, batcher } + } + + fn handle_enriched_event(&self, event: ProxyEvent) { + match event.stream { + IoStream::Stdout | IoStream::Stderr => self.batcher.handle_output(event), + IoStream::Stdin => self.batcher.handle_input(event), + } + } + + pub fn flush_before_step(&self, thread_id: ThreadId) { + self.batcher.flush_before_step(thread_id); + } + + pub fn flush_all(&self) { + self.batcher.flush_all(); + } +} + +impl ProxySink for IoEventSink { + fn record(&self, py: Python<'_>, event: ProxyEvent) { + if let Some(enriched) = self.enricher.enrich(py, event) { + self.handle_enriched_event(enriched); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::runtime::line_snapshots::LineSnapshotStore; + use runtime_tracing::{Line, PathId}; + use std::sync::{Arc, Mutex}; + use std::thread; + use std::time::{Duration, Instant}; + + use crate::runtime::io_capture::events::IoOperation; + + #[derive(Default)] + struct ChunkRecorder { + chunks: Mutex>, + } + + impl ChunkRecorder { + fn chunks(&self) -> Vec { + self.chunks.lock().expect("lock poisoned").clone() + } + } + + impl IoChunkConsumer for ChunkRecorder { + fn consume(&self, chunk: IoChunk) { + self.chunks.lock().expect("lock poisoned").push(chunk); + } + } + + fn make_write_event( + thread_id: ThreadId, + stream: IoStream, + payload: &[u8], + timestamp: Instant, + path_id: PathId, + line: Line, + ) -> ProxyEvent { + ProxyEvent { + stream, + operation: IoOperation::Write, + payload: payload.to_vec(), + thread_id, + timestamp, + frame_id: Some(crate::runtime::line_snapshots::FrameId::from_raw(42)), + path_id: Some(path_id), + line: Some(line), + path: Some(format!("/tmp/test{}_{}.py", path_id.0, line.0)), + } + } + + #[test] + fn sink_batches_until_newline_flushes() { + Python::with_gil(|py| { + let collector: Arc = Arc::new(ChunkRecorder::default()); + let snapshots = Arc::new(LineSnapshotStore::new()); + let sink = IoEventSink::new(collector.clone(), snapshots); + let thread_id = thread::current().id(); + let base = Instant::now(); + + sink.record( + py, + make_write_event( + thread_id, + IoStream::Stdout, + b"hello", + base, + PathId(1), + Line(10), + ), + ); + assert!(collector.chunks().is_empty()); + + sink.record( + py, + make_write_event( + thread_id, + IoStream::Stdout, + b" world\ntrailing", + base + std::time::Duration::from_millis(1), + PathId(1), + Line(10), + ), + ); + + let chunks = collector.chunks(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].payload, b"hello world\n"); + assert!(chunks[0].flags.contains(IoChunkFlags::NEWLINE_TERMINATED)); + assert_eq!( + chunks[0].frame_id, + Some(crate::runtime::line_snapshots::FrameId::from_raw(42)) + ); + assert_eq!(chunks[0].path_id, Some(PathId(1))); + assert_eq!(chunks[0].line, Some(Line(10))); + assert_eq!(chunks[0].path.as_deref(), Some("/tmp/test1_10.py")); + + sink.flush_before_step(thread_id); + let chunks = collector.chunks(); + assert_eq!(chunks.len(), 2); + 
assert_eq!(chunks[1].payload, b"trailing"); + assert!(chunks[1].flags.contains(IoChunkFlags::STEP_BOUNDARY)); + assert_eq!(chunks[1].path_id, Some(PathId(1))); + assert_eq!(chunks[1].line, Some(Line(10))); + assert_eq!(chunks[1].path.as_deref(), Some("/tmp/test1_10.py")); + }); + } + + #[test] + fn sink_flushes_on_time_gap() { + Python::with_gil(|py| { + let collector: Arc = Arc::new(ChunkRecorder::default()); + let snapshots = Arc::new(LineSnapshotStore::new()); + let sink = IoEventSink::new(collector.clone(), snapshots); + let thread_id = thread::current().id(); + let base = Instant::now(); + + sink.record( + py, + make_write_event(thread_id, IoStream::Stdout, b"a", base, PathId(2), Line(20)), + ); + sink.record( + py, + make_write_event( + thread_id, + IoStream::Stdout, + b"b", + base + std::time::Duration::from_millis(10), + PathId(2), + Line(20), + ), + ); + + let chunks = collector.chunks(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].payload, b"a"); + assert!(chunks[0].flags.contains(IoChunkFlags::TIME_SPLIT)); + assert_eq!(chunks[0].path_id, Some(PathId(2))); + assert_eq!(chunks[0].line, Some(Line(20))); + assert_eq!(chunks[0].path.as_deref(), Some("/tmp/test2_20.py")); + + sink.flush_before_step(thread_id); + let chunks = collector.chunks(); + assert_eq!(chunks.len(), 2); + assert_eq!(chunks[1].payload, b"b"); + assert_eq!(chunks[1].path_id, Some(PathId(2))); + assert_eq!(chunks[1].line, Some(Line(20))); + assert_eq!(chunks[1].path.as_deref(), Some("/tmp/test2_20.py")); + }); + } + + #[test] + fn sink_flushes_on_explicit_flush() { + Python::with_gil(|py| { + let collector: Arc = Arc::new(ChunkRecorder::default()); + let snapshots = Arc::new(LineSnapshotStore::new()); + let sink = IoEventSink::new(collector.clone(), snapshots); + let thread_id = thread::current().id(); + let base = Instant::now(); + + sink.record( + py, + ProxyEvent { + stream: IoStream::Stderr, + operation: IoOperation::Write, + payload: b"log".to_vec(), + thread_id, + timestamp: base, + frame_id: Some(crate::runtime::line_snapshots::FrameId::from_raw(99)), + path_id: Some(PathId(3)), + line: Some(Line(30)), + path: Some("/tmp/test3_30.py".to_string()), + }, + ); + sink.record( + py, + ProxyEvent { + stream: IoStream::Stderr, + operation: IoOperation::Flush, + payload: Vec::new(), + thread_id, + timestamp: base + std::time::Duration::from_millis(1), + frame_id: Some(crate::runtime::line_snapshots::FrameId::from_raw(99)), + path_id: Some(PathId(3)), + line: Some(Line(30)), + path: Some("/tmp/test3_30.py".to_string()), + }, + ); + + let chunks = collector.chunks(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].payload, b"log"); + assert!(chunks[0].flags.contains(IoChunkFlags::EXPLICIT_FLUSH)); + }); + } + + #[test] + fn flush_before_step_uses_custom_time_source() { + Python::with_gil(|py| { + let collector: Arc = Arc::new(ChunkRecorder::default()); + let snapshots = Arc::new(LineSnapshotStore::new()); + let fixed = Instant::now() + Duration::from_secs(5); + let time_source = Arc::new(move || fixed); + let sink = IoEventSink::with_time_source(collector.clone(), snapshots, time_source); + let thread_id = thread::current().id(); + let base = Instant::now(); + + sink.record( + py, + make_write_event( + thread_id, + IoStream::Stdout, + b"pending", + base, + PathId(7), + Line(70), + ), + ); + + sink.flush_before_step(thread_id); + + let chunks = collector.chunks(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].timestamp, fixed); + }); + } +} diff --git 
a/codetracer-python-recorder/src/runtime/io_capture/sink/types.rs b/codetracer-python-recorder/src/runtime/io_capture/sink/types.rs
new file mode 100644
index 0000000..866ff74
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/io_capture/sink/types.rs
@@ -0,0 +1,45 @@
+use crate::runtime::io_capture::events::IoStream;
+use crate::runtime::line_snapshots::FrameId;
+use bitflags::bitflags;
+use runtime_tracing::{Line, PathId};
+use std::thread::ThreadId;
+use std::time::Instant;
+
+bitflags! {
+    /// Additional metadata describing why a chunk flushed.
+    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
+    pub struct IoChunkFlags: u8 {
+        /// The buffer ended because a newline character was observed.
+        const NEWLINE_TERMINATED = 0b0000_0001;
+        /// The user triggered `flush()` on the underlying TextIOBase.
+        const EXPLICIT_FLUSH = 0b0000_0010;
+        /// The recorder forced a flush immediately before emitting a Step event.
+        const STEP_BOUNDARY = 0b0000_0100;
+        /// The buffer aged past the batching deadline.
+        const TIME_SPLIT = 0b0000_1000;
+        /// The chunk represents stdin data flowing into the program.
+        const INPUT_CHUNK = 0b0001_0000;
+        /// The chunk originated from the FD mirror fallback.
+        const FD_MIRROR = 0b0010_0000;
+    }
+}
+
+/// Normalised chunk emitted by the batching sink.
+#[allow(dead_code)]
+#[derive(Clone, Debug)]
+pub struct IoChunk {
+    pub stream: IoStream,
+    pub payload: Vec<u8>,
+    pub thread_id: ThreadId,
+    pub timestamp: Instant,
+    pub frame_id: Option<FrameId>,
+    pub path_id: Option<PathId>,
+    pub line: Option<Line>,
+    pub path: Option<String>,
+    pub flags: IoChunkFlags,
+}
+
+/// Consumer invoked when the sink emits a chunk.
+pub trait IoChunkConsumer: Send + Sync + 'static {
+    fn consume(&self, chunk: IoChunk);
+}
diff --git a/codetracer-python-recorder/src/runtime/line_snapshots.rs b/codetracer-python-recorder/src/runtime/line_snapshots.rs
new file mode 100644
index 0000000..88d2c52
--- /dev/null
+++ b/codetracer-python-recorder/src/runtime/line_snapshots.rs
@@ -0,0 +1,173 @@
+//! Thread-local line snapshot store used to attribute IO to the most recent step.
+
+use runtime_tracing::{Line, PathId};
+use std::collections::HashMap;
+use std::sync::RwLock;
+use std::thread::ThreadId;
+use std::time::Instant;
+
+/// Identifier that helps correlate IO chunks with the owning Python frame.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub struct FrameId(u64);
+
+impl FrameId {
+    /// Build a frame id from a raw pointer cast or any unique integer.
+    pub fn from_raw(raw: u64) -> Self {
+        Self(raw)
+    }
+
+    /// Return the raw identifier value.
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub fn as_raw(self) -> u64 {
+        self.0
+    }
+}
+
+/// Snapshot for the most recent executed line per Python thread.
+#[cfg_attr(not(test), allow(dead_code))]
+#[derive(Clone, Debug)]
+pub struct LineSnapshot {
+    path_id: PathId,
+    line: Line,
+    frame_id: FrameId,
+    captured_at: Instant,
+}
+
+#[cfg_attr(not(test), allow(dead_code))]
+impl LineSnapshot {
+    pub fn new(path_id: PathId, line: Line, frame_id: FrameId) -> Self {
+        Self {
+            path_id,
+            line,
+            frame_id,
+            captured_at: Instant::now(),
+        }
+    }
+
+    pub fn path_id(&self) -> PathId {
+        self.path_id
+    }
+
+    pub fn line(&self) -> Line {
+        self.line
+    }
+
+    pub fn frame_id(&self) -> FrameId {
+        self.frame_id
+    }
+
+    pub fn captured_at(&self) -> Instant {
+        self.captured_at
+    }
+}
+
+/// Concurrent store recording the latest line snapshot per Python thread.
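+///
+/// Step callbacks take the write lock briefly to record a snapshot; IO
+/// capture threads answer lookups through the read lock, so concurrent
+/// readers never block one another.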
+#[derive(Default)] +pub struct LineSnapshotStore { + inner: RwLock>, +} + +impl LineSnapshotStore { + pub fn new() -> Self { + Self { + inner: RwLock::new(HashMap::new()), + } + } + + /// Record or update the snapshot for a thread. + pub fn record(&self, thread_id: ThreadId, path_id: PathId, line: Line, frame_id: FrameId) { + let snapshot = LineSnapshot::new(path_id, line, frame_id); + let mut guard = self.inner.write().expect("lock poisoned"); + guard.insert(thread_id, snapshot); + } + + /// Fetch the latest snapshot for a thread if present. + #[cfg_attr(not(test), allow(dead_code))] + pub fn snapshot_for_thread(&self, thread_id: ThreadId) -> Option { + let guard = self.inner.read().expect("lock poisoned"); + guard.get(&thread_id).cloned() + } + + /// Remove a snapshot when a thread terminates. + #[cfg_attr(not(test), allow(dead_code))] + pub fn remove(&self, thread_id: ThreadId) -> Option { + let mut guard = self.inner.write().expect("lock poisoned"); + guard.remove(&thread_id) + } + + /// Clear all stored snapshots. + pub fn clear(&self) { + let mut guard = self.inner.write().expect("lock poisoned"); + guard.clear(); + } + + /// Count tracked threads, used in tests. + #[cfg(test)] + pub fn tracked_threads(&self) -> usize { + let guard = self.inner.read().expect("lock poisoned"); + guard.len() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use runtime_tracing::Line; + use std::sync::Arc; + use std::thread; + + fn make_snapshot(store: &LineSnapshotStore, line: i64) -> LineSnapshot { + let tid = thread::current().id(); + store.record(tid, PathId(7), Line(line), FrameId::from_raw(line as u64)); + store + .snapshot_for_thread(tid) + .expect("snapshot should be recorded") + } + + #[test] + fn records_and_reads_snapshot() { + let store = LineSnapshotStore::new(); + let snapshot = make_snapshot(&store, 10); + assert_eq!(snapshot.path_id(), PathId(7)); + assert_eq!(snapshot.line(), Line(10)); + assert_eq!(snapshot.frame_id().as_raw(), 10); + } + + #[test] + fn removes_snapshot_for_thread() { + let store = LineSnapshotStore::new(); + let tid = thread::current().id(); + store.record(tid, PathId(3), Line(20), FrameId::from_raw(1)); + assert!(store.snapshot_for_thread(tid).is_some()); + assert!(store.remove(tid).is_some()); + assert!(store.snapshot_for_thread(tid).is_none()); + } + + #[test] + fn concurrent_updates_do_not_panic() { + let store = Arc::new(LineSnapshotStore::new()); + let threads: Vec<_> = (0..8) + .map(|idx| { + let store_ref = Arc::clone(&store); + thread::spawn(move || { + for step in 0..32usize { + let line = Line(step as i64); + let frame_raw = (idx as u64) * 100 + step as u64; + store_ref.record( + thread::current().id(), + PathId(idx), + line, + FrameId::from_raw(frame_raw), + ); + } + }) + }) + .collect(); + + for handle in threads { + handle.join().expect("thread should finish"); + } + + assert!(store.tracked_threads() <= 8); + } +} diff --git a/codetracer-python-recorder/src/runtime/logging.rs b/codetracer-python-recorder/src/runtime/logging.rs index 6fea9df..b7e5ddf 100644 --- a/codetracer-python-recorder/src/runtime/logging.rs +++ b/codetracer-python-recorder/src/runtime/logging.rs @@ -3,10 +3,12 @@ use pyo3::Python; use crate::code_object::CodeObjectWrapper; +use crate::runtime::io_capture::ScopedMuteIoCapture; /// Emit a debug log entry for tracer callbacks, enriching the message with /// filename, qualified name, and optional line information when available. 
pub fn log_event(py: Python<'_>, code: &CodeObjectWrapper, event: &str, lineno: Option) { + let _mute = ScopedMuteIoCapture::new(); if let Some(line) = lineno { match code.filename(py) { Ok(filename) => log::debug!("[RuntimeTracer] {event}: {filename}:{line}"), diff --git a/codetracer-python-recorder/src/runtime/mod.rs b/codetracer-python-recorder/src/runtime/mod.rs index 45c8e29..2de61da 100644 --- a/codetracer-python-recorder/src/runtime/mod.rs +++ b/codetracer-python-recorder/src/runtime/mod.rs @@ -2,11 +2,14 @@ mod activation; mod frame_inspector; +pub mod io_capture; +mod line_snapshots; mod logging; mod output_paths; mod value_capture; mod value_encoder; +pub use line_snapshots::{FrameId, LineSnapshotStore}; pub use output_paths::TraceOutputPaths; use activation::ActivationController; @@ -19,15 +22,19 @@ use std::fs; use std::path::{Path, PathBuf}; #[cfg(feature = "integration-test")] use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; #[cfg(feature = "integration-test")] use std::sync::OnceLock; +use std::thread::{self, ThreadId}; use pyo3::prelude::*; use pyo3::types::PyAny; use recorder_errors::{bug, enverr, target, usage, ErrorCode, RecorderResult}; use runtime_tracing::NonStreamingTraceWriter; -use runtime_tracing::{Line, TraceEventsFileFormat, TraceWriter}; +use runtime_tracing::{ + EventLogKind, Line, PathId, RecordEvent, TraceEventsFileFormat, TraceLowLevelEvent, TraceWriter, +}; use crate::code_object::CodeObjectWrapper; use crate::ffi; @@ -36,6 +43,11 @@ use crate::monitoring::{ events_union, CallbackOutcome, CallbackResult, EventSet, MonitoringEvents, Tracer, }; use crate::policy::{policy_snapshot, RecorderPolicy}; +use crate::runtime::io_capture::{ + IoCapturePipeline, IoCaptureSettings, IoChunk, IoChunkFlags, IoStream, ScopedMuteIoCapture, +}; +use serde::Serialize; +use serde_json; use uuid::Uuid; @@ -53,6 +65,29 @@ impl Drop for TraceIdResetGuard { } } +fn io_flag_labels(flags: IoChunkFlags) -> Vec<&'static str> { + let mut labels = Vec::new(); + if flags.contains(IoChunkFlags::NEWLINE_TERMINATED) { + labels.push("newline"); + } + if flags.contains(IoChunkFlags::EXPLICIT_FLUSH) { + labels.push("flush"); + } + if flags.contains(IoChunkFlags::STEP_BOUNDARY) { + labels.push("step_boundary"); + } + if flags.contains(IoChunkFlags::TIME_SPLIT) { + labels.push("time_split"); + } + if flags.contains(IoChunkFlags::INPUT_CHUNK) { + labels.push("input"); + } + if flags.contains(IoChunkFlags::FD_MIRROR) { + labels.push("mirror"); + } + labels +} + /// Minimal runtime tracer that maps Python sys.monitoring events to /// runtime_tracing writer operations. 
pub struct RuntimeTracer { @@ -66,6 +101,8 @@ pub struct RuntimeTracer { events_recorded: bool, encountered_failure: bool, trace_id: String, + line_snapshots: Arc, + io_capture: Option, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -111,6 +148,7 @@ fn configured_failure_mode() -> Option { *FAILURE_MODE.get_or_init(|| { let raw = std::env::var("CODETRACER_TEST_INJECT_FAILURE").ok(); if let Some(value) = raw.as_deref() { + let _mute = ScopedMuteIoCapture::new(); log::debug!("[RuntimeTracer] test failure injection mode: {}", value); } raw.and_then(|raw| match raw.trim().to_ascii_lowercase().as_str() { @@ -138,8 +176,7 @@ fn should_inject_failure(_stage: FailureStage) -> bool { #[cfg(feature = "integration-test")] fn should_inject_target_error() -> bool { - matches!(configured_failure_mode(), Some(FailureMode::TargetArgs)) - && mark_failure_triggered() + matches!(configured_failure_mode(), Some(FailureMode::TargetArgs)) && mark_failure_triggered() } #[cfg(not(feature = "integration-test"))] @@ -228,13 +265,147 @@ impl RuntimeTracer { events_recorded: false, encountered_failure: false, trace_id: Uuid::new_v4().to_string(), + line_snapshots: Arc::new(LineSnapshotStore::new()), + io_capture: None, + } + } + + /// Share the snapshot store with collaborators (IO capture, tests). + #[cfg_attr(not(test), allow(dead_code))] + pub fn line_snapshot_store(&self) -> Arc { + Arc::clone(&self.line_snapshots) + } + + pub fn install_io_capture(&mut self, py: Python<'_>, policy: &RecorderPolicy) -> PyResult<()> { + let settings = IoCaptureSettings { + line_proxies: policy.io_capture.line_proxies, + fd_mirror: policy.io_capture.fd_fallback, + }; + let pipeline = IoCapturePipeline::install(py, Arc::clone(&self.line_snapshots), settings)?; + self.io_capture = pipeline; + Ok(()) + } + + fn flush_io_before_step(&mut self, thread_id: ThreadId) { + if let Some(pipeline) = self.io_capture.as_ref() { + pipeline.flush_before_step(thread_id); + } + self.drain_io_chunks(); + } + + fn flush_pending_io(&mut self) { + if let Some(pipeline) = self.io_capture.as_ref() { + pipeline.flush_all(); + } + self.drain_io_chunks(); + } + + fn drain_io_chunks(&mut self) { + if let Some(pipeline) = self.io_capture.as_ref() { + let chunks = pipeline.drain_chunks(); + for chunk in chunks { + self.record_io_chunk(chunk); + } + } + } + + fn record_io_chunk(&mut self, mut chunk: IoChunk) { + if chunk.path_id.is_none() { + if let Some(path) = chunk.path.as_deref() { + let path_id = TraceWriter::ensure_path_id(&mut self.writer, Path::new(path)); + chunk.path_id = Some(path_id); + } + } + + let kind = match chunk.stream { + IoStream::Stdout => EventLogKind::Write, + IoStream::Stderr => EventLogKind::WriteOther, + IoStream::Stdin => EventLogKind::Read, + }; + + let metadata = self.build_io_metadata(&chunk); + let content = String::from_utf8_lossy(&chunk.payload).into_owned(); + + TraceWriter::add_event( + &mut self.writer, + TraceLowLevelEvent::Event(RecordEvent { + kind, + metadata, + content, + }), + ); + self.mark_event(); + } + + fn build_io_metadata(&self, chunk: &IoChunk) -> String { + #[derive(Serialize)] + struct IoEventMetadata<'a> { + stream: &'a str, + thread: String, + path_id: Option, + line: Option, + frame_id: Option, + flags: Vec<&'a str>, + } + + let snapshot = self.line_snapshots.snapshot_for_thread(chunk.thread_id); + let path_id = chunk + .path_id + .map(|id| id.0) + .or_else(|| snapshot.as_ref().map(|snap| snap.path_id().0)); + let line = chunk + .line + .map(|line| line.0) + .or_else(|| snapshot.as_ref().map(|snap| 
snap.line().0)); + let frame_id = chunk + .frame_id + .or_else(|| snapshot.as_ref().map(|snap| snap.frame_id())); + + let metadata = IoEventMetadata { + stream: match chunk.stream { + IoStream::Stdout => "stdout", + IoStream::Stderr => "stderr", + IoStream::Stdin => "stdin", + }, + thread: format!("{:?}", chunk.thread_id), + path_id, + line, + frame_id: frame_id.map(|id| id.as_raw()), + flags: io_flag_labels(chunk.flags), + }; + + match serde_json::to_string(&metadata) { + Ok(json) => json, + Err(err) => { + let _mute = ScopedMuteIoCapture::new(); + log::error!("failed to serialise IO metadata: {err}"); + "{}".to_string() + } + } + } + + fn teardown_io_capture(&mut self, py: Python<'_>) { + if let Some(mut pipeline) = self.io_capture.take() { + pipeline.flush_all(); + let chunks = pipeline.drain_chunks(); + for chunk in chunks { + self.record_io_chunk(chunk); + } + pipeline.uninstall(py); + let trailing = pipeline.drain_chunks(); + for chunk in trailing { + self.record_io_chunk(chunk); + } } } /// Configure output files and write initial metadata records. pub fn begin(&mut self, outputs: &TraceOutputPaths, start_line: u32) -> PyResult<()> { let start_path = self.activation.start_path(&self.program_path); - log::debug!("{}", start_path.display()); + { + let _mute = ScopedMuteIoCapture::new(); + log::debug!("{}", start_path.display()); + } outputs .configure_writer(&mut self.writer, start_path, start_line) .map_err(ffi::map_recorder_error)?; @@ -247,6 +418,7 @@ impl RuntimeTracer { fn mark_event(&mut self) { if suppress_events() { + let _mute = ScopedMuteIoCapture::new(); log::debug!("[RuntimeTracer] skipping event mark due to test injection"); return; } @@ -329,6 +501,7 @@ impl RuntimeTracer { Ok(name) => name, Err(err) => { with_error_code(ErrorCode::Io, || { + let _mute = ScopedMuteIoCapture::new(); log::error!("failed to resolve code filename: {err}"); }); record_dropped_event("filename_lookup_failed"); @@ -391,6 +564,7 @@ impl Tracer for RuntimeTracer { Err(err) => { let details = err.to_string(); with_error_code(ErrorCode::FrameIntrospectionFailed, || { + let _mute = ScopedMuteIoCapture::new(); log::error!("on_py_start: failed to capture args: {details}"); }); return Err(ffi::map_recorder_error( @@ -433,13 +607,27 @@ impl Tracer for RuntimeTracer { log_event(py, code, "on_line", Some(lineno)); + self.flush_io_before_step(thread::current().id()); + + let line_value = Line(lineno as i64); + let mut recorded_path: Option<(PathId, Line)> = None; + if let Ok(filename) = code.filename(py) { - TraceWriter::register_step(&mut self.writer, Path::new(filename), Line(lineno as i64)); + let path = Path::new(filename); + let path_id = TraceWriter::ensure_path_id(&mut self.writer, path); + TraceWriter::register_step(&mut self.writer, path, line_value); self.mark_event(); + recorded_path = Some((path_id, line_value)); } let snapshot = capture_frame(py, code)?; + if let Some((path_id, line)) = recorded_path { + let frame_id = FrameId::from_raw(snapshot.frame_ptr() as usize as u64); + self.line_snapshots + .record(thread::current().id(), path_id, line, frame_id); + } + let mut recorded: HashSet = HashSet::new(); record_visible_scope(py, &mut self.writer, &snapshot, &mut recorded); @@ -466,9 +654,12 @@ impl Tracer for RuntimeTracer { log_event(py, code, "on_py_return", None); + self.flush_pending_io(); + record_return_value(py, &mut self.writer, retval); self.mark_event(); if self.activation.handle_return_event(code.id()) { + let _mute = ScopedMuteIoCapture::new(); log::debug!("[RuntimeTracer] 
deactivated on activation return"); } @@ -482,7 +673,10 @@ impl Tracer for RuntimeTracer { fn flush(&mut self, _py: Python<'_>) -> PyResult<()> { // Trace event entry + let _mute = ScopedMuteIoCapture::new(); log::debug!("[RuntimeTracer] flush"); + drop(_mute); + self.flush_pending_io(); // For non-streaming formats we can update the events file. match self.format { TraceEventsFileFormat::Json | TraceEventsFileFormat::BinaryV0 => { @@ -501,8 +695,9 @@ impl Tracer for RuntimeTracer { Ok(()) } - fn finish(&mut self, _py: Python<'_>) -> PyResult<()> { + fn finish(&mut self, py: Python<'_>) -> PyResult<()> { // Trace event entry + let _mute_finish = ScopedMuteIoCapture::new(); log::debug!("[RuntimeTracer] finish"); if should_inject_failure(FailureStage::Finish) { @@ -513,6 +708,8 @@ impl Tracer for RuntimeTracer { let _reset = TraceIdResetGuard::new(); let policy = policy_snapshot(); + self.teardown_io_capture(py); + if self.encountered_failure { if policy.keep_partial_trace { if let Err(err) = self.finalise_writer() { @@ -537,6 +734,7 @@ impl Tracer for RuntimeTracer { } self.ignored_code_ids.clear(); self.function_ids.clear(); + self.line_snapshots.clear(); return Ok(()); } @@ -545,6 +743,7 @@ impl Tracer for RuntimeTracer { self.finalise_writer().map_err(ffi::map_recorder_error)?; self.ignored_code_ids.clear(); self.function_ids.clear(); + self.line_snapshots.clear(); Ok(()) } } @@ -556,10 +755,12 @@ mod tests { use crate::policy; use pyo3::types::{PyAny, PyCode, PyModule}; use pyo3::wrap_pyfunction; - use runtime_tracing::{FullValueRecord, TraceLowLevelEvent, ValueRecord}; + use runtime_tracing::{FullValueRecord, StepRecord, TraceLowLevelEvent, ValueRecord}; + use serde::Deserialize; use std::cell::Cell; use std::collections::BTreeMap; use std::ffi::CString; + use std::thread; thread_local! 
{ static ACTIVE_TRACER: Cell<*mut RuntimeTracer> = Cell::new(std::ptr::null_mut()); @@ -594,6 +795,8 @@ mod tests { None, None, Some(false), + None, + None, ) .expect("reset recorder policy"); } @@ -734,6 +937,343 @@ result = compute()\n" }); } + #[test] + fn line_snapshot_store_tracks_last_step() { + Python::with_gil(|py| { + ensure_test_module(py); + let tmp = tempfile::tempdir().expect("create temp dir"); + let script_path = tmp.path().join("snapshot_script.py"); + let script = format!("{PRELUDE}\n\nsnapshot()\n"); + std::fs::write(&script_path, &script).expect("write script"); + + let mut tracer = + RuntimeTracer::new("snapshot_script.py", &[], TraceEventsFileFormat::Json, None); + let store = tracer.line_snapshot_store(); + + { + let _guard = ScopedTracer::new(&mut tracer); + LAST_OUTCOME.with(|cell| cell.set(None)); + let run_code = format!( + "import runpy\nrunpy.run_path(r\"{}\")", + script_path.display() + ); + let run_code_c = CString::new(run_code).expect("script contains nul byte"); + py.run(run_code_c.as_c_str(), None, None) + .expect("execute snapshot script"); + } + + let last_step: StepRecord = tracer + .writer + .events + .iter() + .rev() + .find_map(|event| match event { + TraceLowLevelEvent::Step(step) => Some(step.clone()), + _ => None, + }) + .expect("expected one step event"); + + let thread_id = thread::current().id(); + let snapshot = store + .snapshot_for_thread(thread_id) + .expect("snapshot should be recorded"); + + assert_eq!(snapshot.line(), last_step.line); + assert_eq!(snapshot.path_id(), last_step.path_id); + assert!(snapshot.captured_at().elapsed().as_secs_f64() >= 0.0); + }); + } + + #[derive(Debug, Deserialize)] + struct IoMetadata { + stream: String, + path_id: Option, + line: Option, + flags: Vec, + } + + #[test] + fn io_capture_records_python_and_native_output() { + Python::with_gil(|py| { + reset_policy(py); + policy::configure_policy_py( + Some("abort"), + Some(false), + Some(false), + None, + None, + Some(false), + Some(true), + Some(false), + ) + .expect("enable io capture proxies"); + + ensure_test_module(py); + let tmp = tempfile::tempdir().expect("create temp dir"); + let script_path = tmp.path().join("io_script.py"); + let script = format!( + "{PRELUDE}\n\nprint('python out')\nfrom ctypes import pythonapi, c_char_p\npythonapi.PySys_WriteStdout(c_char_p(b'native out\\n'))\n" + ); + std::fs::write(&script_path, &script).expect("write script"); + + let mut tracer = RuntimeTracer::new( + script_path.to_string_lossy().as_ref(), + &[], + TraceEventsFileFormat::Json, + None, + ); + let outputs = TraceOutputPaths::new(tmp.path(), TraceEventsFileFormat::Json); + tracer.begin(&outputs, 1).expect("begin tracer"); + tracer + .install_io_capture(py, &policy::policy_snapshot()) + .expect("install io capture"); + + { + let _guard = ScopedTracer::new(&mut tracer); + LAST_OUTCOME.with(|cell| cell.set(None)); + let run_code = format!( + "import runpy\nrunpy.run_path(r\"{}\")", + script_path.display() + ); + let run_code_c = CString::new(run_code).expect("script contains nul byte"); + py.run(run_code_c.as_c_str(), None, None) + .expect("execute io script"); + } + + tracer.finish(py).expect("finish tracer"); + + let io_events: Vec<(IoMetadata, Vec)> = tracer + .writer + .events + .iter() + .filter_map(|event| match event { + TraceLowLevelEvent::Event(record) => { + let metadata: IoMetadata = serde_json::from_str(&record.metadata).ok()?; + Some((metadata, record.content.as_bytes().to_vec())) + } + _ => None, + }) + .collect(); + + assert!(io_events + .iter() + 
.any(|(meta, payload)| meta.stream == "stdout" + && String::from_utf8_lossy(payload).contains("python out"))); + assert!(io_events + .iter() + .any(|(meta, payload)| meta.stream == "stdout" + && String::from_utf8_lossy(payload).contains("native out"))); + assert!(io_events.iter().all(|(meta, _)| { + if meta.stream == "stdout" { + meta.path_id.is_some() && meta.line.is_some() + } else { + true + } + })); + assert!(io_events + .iter() + .filter(|(meta, _)| meta.stream == "stdout") + .any(|(meta, _)| meta.flags.iter().any(|flag| flag == "newline"))); + + reset_policy(py); + }); + } + + #[cfg(unix)] + #[test] + fn fd_mirror_captures_os_write_payloads() { + Python::with_gil(|py| { + reset_policy(py); + policy::configure_policy_py( + Some("abort"), + Some(false), + Some(false), + None, + None, + Some(false), + Some(true), + Some(true), + ) + .expect("enable io capture with fd fallback"); + + ensure_test_module(py); + let tmp = tempfile::tempdir().expect("tempdir"); + let script_path = tmp.path().join("fd_mirror.py"); + std::fs::write( + &script_path, + format!( + "{PRELUDE}\nimport os\nprint('proxy line')\nos.write(1, b'fd stdout\\n')\nos.write(2, b'fd stderr\\n')\n" + ), + ) + .expect("write script"); + + let mut tracer = RuntimeTracer::new( + script_path.to_string_lossy().as_ref(), + &[], + TraceEventsFileFormat::Json, + None, + ); + let outputs = TraceOutputPaths::new(tmp.path(), TraceEventsFileFormat::Json); + tracer.begin(&outputs, 1).expect("begin tracer"); + tracer + .install_io_capture(py, &policy::policy_snapshot()) + .expect("install io capture"); + + { + let _guard = ScopedTracer::new(&mut tracer); + LAST_OUTCOME.with(|cell| cell.set(None)); + let run_code = format!( + "import runpy\nrunpy.run_path(r\"{}\")", + script_path.display() + ); + let run_code_c = CString::new(run_code).expect("script contains nul byte"); + py.run(run_code_c.as_c_str(), None, None) + .expect("execute fd script"); + } + + tracer.finish(py).expect("finish tracer"); + + let io_events: Vec<(IoMetadata, Vec)> = tracer + .writer + .events + .iter() + .filter_map(|event| match event { + TraceLowLevelEvent::Event(record) => { + let metadata: IoMetadata = serde_json::from_str(&record.metadata).ok()?; + Some((metadata, record.content.as_bytes().to_vec())) + } + _ => None, + }) + .collect(); + + let stdout_mirror = io_events.iter().find(|(meta, _)| { + meta.stream == "stdout" && meta.flags.iter().any(|flag| flag == "mirror") + }); + assert!( + stdout_mirror.is_some(), + "expected mirror event for stdout: {:?}", + io_events + ); + let stdout_payload = &stdout_mirror.unwrap().1; + assert!( + String::from_utf8_lossy(stdout_payload).contains("fd stdout"), + "mirror stdout payload missing expected text" + ); + + let stderr_mirror = io_events.iter().find(|(meta, _)| { + meta.stream == "stderr" && meta.flags.iter().any(|flag| flag == "mirror") + }); + assert!( + stderr_mirror.is_some(), + "expected mirror event for stderr: {:?}", + io_events + ); + let stderr_payload = &stderr_mirror.unwrap().1; + assert!( + String::from_utf8_lossy(stderr_payload).contains("fd stderr"), + "mirror stderr payload missing expected text" + ); + + assert!(io_events.iter().any(|(meta, payload)| { + meta.stream == "stdout" + && !meta.flags.iter().any(|flag| flag == "mirror") + && String::from_utf8_lossy(payload).contains("proxy line") + })); + + reset_policy(py); + }); + } + + #[cfg(unix)] + #[test] + fn fd_mirror_disabled_does_not_capture_os_write() { + Python::with_gil(|py| { + reset_policy(py); + policy::configure_policy_py( + Some("abort"), + 
Some(false), + Some(false), + None, + None, + Some(false), + Some(true), + Some(false), + ) + .expect("enable proxies without fd fallback"); + + ensure_test_module(py); + let tmp = tempfile::tempdir().expect("tempdir"); + let script_path = tmp.path().join("fd_disabled.py"); + std::fs::write( + &script_path, + format!( + "{PRELUDE}\nimport os\nprint('proxy line')\nos.write(1, b'fd stdout\\n')\nos.write(2, b'fd stderr\\n')\n" + ), + ) + .expect("write script"); + + let mut tracer = RuntimeTracer::new( + script_path.to_string_lossy().as_ref(), + &[], + TraceEventsFileFormat::Json, + None, + ); + let outputs = TraceOutputPaths::new(tmp.path(), TraceEventsFileFormat::Json); + tracer.begin(&outputs, 1).expect("begin tracer"); + tracer + .install_io_capture(py, &policy::policy_snapshot()) + .expect("install io capture"); + + { + let _guard = ScopedTracer::new(&mut tracer); + LAST_OUTCOME.with(|cell| cell.set(None)); + let run_code = format!( + "import runpy\nrunpy.run_path(r\"{}\")", + script_path.display() + ); + let run_code_c = CString::new(run_code).expect("script contains nul byte"); + py.run(run_code_c.as_c_str(), None, None) + .expect("execute fd script"); + } + + tracer.finish(py).expect("finish tracer"); + + let io_events: Vec<(IoMetadata, Vec)> = tracer + .writer + .events + .iter() + .filter_map(|event| match event { + TraceLowLevelEvent::Event(record) => { + let metadata: IoMetadata = serde_json::from_str(&record.metadata).ok()?; + Some((metadata, record.content.as_bytes().to_vec())) + } + _ => None, + }) + .collect(); + + assert!( + !io_events + .iter() + .any(|(meta, _)| meta.flags.iter().any(|flag| flag == "mirror")), + "mirror events should not be present when fallback disabled" + ); + + assert!( + !io_events.iter().any(|(_, payload)| { + String::from_utf8_lossy(payload).contains("fd stdout") + || String::from_utf8_lossy(payload).contains("fd stderr") + }), + "native os.write payload unexpectedly captured without fallback" + ); + + assert!(io_events.iter().any(|(meta, payload)| { + meta.stream == "stdout" && String::from_utf8_lossy(payload).contains("proxy line") + })); + + reset_policy(py); + }); + } + #[pyfunction] fn capture_line(py: Python<'_>, code: Bound<'_, PyCode>, lineno: u32) -> PyResult<()> { ffi::wrap_pyfunction("test_capture_line", || { @@ -1368,6 +1908,8 @@ snapshot() None, None, Some(false), + None, + None, ) .expect("enable require_trace policy"); @@ -1441,6 +1983,8 @@ snapshot() None, None, Some(false), + None, + None, ) .expect("enable keep_partial policy"); diff --git a/codetracer-python-recorder/src/session.rs b/codetracer-python-recorder/src/session.rs index 4acfc51..21270c9 100644 --- a/codetracer-python-recorder/src/session.rs +++ b/codetracer-python-recorder/src/session.rs @@ -11,6 +11,7 @@ use recorder_errors::{usage, ErrorCode}; use crate::ffi; use crate::logging::init_rust_logging_with_default; use crate::monitoring::{flush_installed_tracer, install_tracer, uninstall_tracer}; +use crate::policy::policy_snapshot; use crate::runtime::{RuntimeTracer, TraceOutputPaths}; use bootstrap::TraceSessionBootstrap; @@ -43,6 +44,7 @@ pub fn start_tracing(path: &str, format: &str, activation_path: Option<&str>) -> .map_err(ffi::map_recorder_error)?; let outputs = TraceOutputPaths::new(bootstrap.trace_directory(), bootstrap.format()); + let policy = policy_snapshot(); let mut tracer = RuntimeTracer::new( bootstrap.program(), @@ -51,6 +53,7 @@ pub fn start_tracing(path: &str, format: &str, activation_path: Option<&str>) -> bootstrap.activation_path(), ); 
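+    // Configure the writer first, then attach IO capture using the policy
+    // snapshot taken above, and finally register the monitoring callbacks.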
 tracer.begin(&outputs, 1)?;
+    tracer.install_io_capture(py, &policy)?;
 
     // Install callbacks
     install_tracer(py, Box::new(tracer))?;
diff --git a/codetracer-python-recorder/tests/python/unit/test_cli.py b/codetracer-python-recorder/tests/python/unit/test_cli.py
index 0f2621e..d125d54 100644
--- a/codetracer-python-recorder/tests/python/unit/test_cli.py
+++ b/codetracer-python-recorder/tests/python/unit/test_cli.py
@@ -96,3 +96,39 @@ def test_parse_args_collects_policy_overrides(tmp_path: Path) -> None:
         "log_file": (tmp_path / "logs" / "recorder.log").resolve(),
         "json_errors": True,
     }
+
+
+def test_parse_args_controls_io_capture(tmp_path: Path) -> None:
+    script = tmp_path / "entry.py"
+    _write_script(script)
+
+    config = _parse_args(
+        [
+            "--io-capture",
+            "off",
+            str(script),
+        ]
+    )
+
+    assert config.policy_overrides == {
+        "io_capture_line_proxies": False,
+        "io_capture_fd_fallback": False,
+    }
+
+
+def test_parse_args_enables_io_capture_fd_mirroring(tmp_path: Path) -> None:
+    script = tmp_path / "entry.py"
+    _write_script(script)
+
+    config = _parse_args(
+        [
+            "--io-capture",
+            "proxies+fd",
+            str(script),
+        ]
+    )
+
+    assert config.policy_overrides == {
+        "io_capture_line_proxies": True,
+        "io_capture_fd_fallback": True,
+    }
diff --git a/design-docs/adr/0007-io-capture.md b/design-docs/adr/0007-io-capture.md
new file mode 100644
index 0000000..ecd0290
--- /dev/null
+++ b/design-docs/adr/0007-io-capture.md
@@ -0,0 +1,43 @@
+# ADR 0007: Input and Output Capture for Runtime Traces
+
+- **Status:** Proposed
+- **Date:** 2025-10-03
+- **Deciders:** Runtime recorder maintainers
+- **Consulted:** Python platform crew, Replay tooling crew
+- **Informed:** DX crew, Release crew
+
+## Context
+- The repo now splits session bootstrap, monitoring glue, and runtime logic into clear modules (`session`, `monitoring`, `runtime`).
+- `RuntimeTracer` owns the `NonStreamingTraceWriter` and activation rules, and already writes metadata, paths, and step events.
+- `recorder-errors` gives us uniform error codes and panic trapping. Every new subsystem must use it.
+- We still forward stdout, stderr, and stdin directly to the host console. No bytes reach the trace.
+- Replay and debugging teams need IO events beside call and line records so they can rebuild console sessions.
+
+## Problem
+- We need lossless IO capture without breaking the in-process `sys.monitoring` design or the new error policy.
+- The old pipe-based spec assumed the tracer lived inside `start()` and mutated global state freely. The refactor put lifecycle code behind `TraceSessionBootstrap`, `TraceOutputPaths`, and `RuntimeTracer::begin`.
+- We also added activation gating and stricter teardown rules. Any IO hooks must respect them and always restore the original file descriptors.
+
+## Decision
+1. Keep the Python CLI contract. `codetracer_python_recorder.start_tracing` keeps installing the tracer, but now also starts an IO capture controller right before `install_tracer` and shuts it down inside `stop_tracing`.
+2. Introduce `runtime::io_capture` with a single public type, `IoCapture`. It duplicates stdin/stdout/stderr, installs platform pipes, and spawns blocking reader threads. The module hides Unix vs Windows code paths behind a small trait (`IoEndpoint`).
+3. Expose an `IoEventSink` from `RuntimeTracer`. The sink wraps the writer in `Arc<Mutex<NonStreamingTraceWriter>>` and exposes two safe methods: `record_output(chunk: IoChunk)` and `record_input(chunk: IoChunk)`. Reader threads call the sink only. All conversions to `TraceLowLevelEvent` live next to the writer so we reuse value encoders and error helpers. A sketch of this surface follows the decision list.
+4. Extend `RuntimeTracer` with a light `ThreadSnapshotStore`. `on_line` updates the current `{ path_id, line, frame_id }` per Python thread. `IoEventSink` reads the latest snapshot when it serialises a chunk. When no snapshot exists we fall back to the last global step.
+5. Store stdout and stderr bytes as `EventLogKind::Write` and `WriteOther`. Store stdin bytes as `EventLogKind::Read`. Metadata includes the stream name, monotonic timestamps, thread tag, and the captured snapshot when present. Bytes stay base64 encoded by the runtime tracing crate.
+6. Keep console passthrough. The reader threads mirror each chunk back into the saved file descriptors so users still see live output.
+7. Wire capture teardown into existing error handling. `IoCapture::stop` drains the pipes, restores FDs, signals the threads, and logs failures through the `recorder-errors` macros. `RuntimeTracer::finish` waits for the IO channel before calling `TraceWriter::finish_*` to avoid races.
+8. Hide the feature behind `RecorderPolicy`. A new flag `policy.io_capture` defaults to off today. Tests and early adopters enable it. Once stable we flip the default.
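+
+A minimal sketch of the surface described in decisions 3 and 4. It is illustrative only: `TraceWriterHandle` stands in for the shared `NonStreamingTraceWriter`, and snapshot lookup plus error handling are elided.
+
+```rust
+use std::sync::{Arc, Mutex};
+use std::time::Instant;
+
+/// Stand-in for the Arc<Mutex<NonStreamingTraceWriter>> named above.
+struct TraceWriterHandle;
+
+enum Stream {
+    Stdout,
+    Stderr,
+    Stdin,
+}
+
+/// Raw bytes drained from one pipe read, plus provenance.
+struct IoChunk {
+    stream: Stream,
+    bytes: Vec<u8>,
+    timestamp: Instant,
+}
+
+struct IoEventSink {
+    writer: Arc<Mutex<TraceWriterHandle>>,
+}
+
+impl IoEventSink {
+    fn record_output(&self, chunk: IoChunk) {
+        // Look up the latest {path_id, line, frame_id} snapshot for the
+        // producing thread (falling back to the last global step), then
+        // append an EventLogKind::Write / WriteOther record under the lock.
+        let _writer = self.writer.lock().expect("writer lock poisoned");
+        let _ = chunk;
+    }
+
+    fn record_input(&self, chunk: IoChunk) {
+        // Same path; the record is written as EventLogKind::Read.
+        let _writer = self.writer.lock().expect("writer lock poisoned");
+        let _ = chunk;
+    }
+}
+```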
+
+## Consequences
+- **Upsides:** We capture IO without a subprocess, reuse the refactored writer lifecycle, and keep activation gating intact. Replay tooling reads one stream for events and IO.
+- **Costs:** Writer calls now cross a mutex, so we must measure contention. The new module adds platform code that needs tight tests. We must watch out for deadlocks on interpreter shutdown.
+
+## Rollout
+- Ship behind an environment toggle `CODETRACER_CAPTURE_IO=1` wired into the policy layer. Emit a warning when the policy disables capture.
+- Document the behaviour in the recorder README and the user CLI help once we land the feature.
+- Graduate the ADR to **Accepted** after the implementation plan closes and the policy default flips on both Unix and Windows.
+
+## Alternatives
+- A subprocess wrapper was considered again and rejected. It would undo the refactor that keeps tracing in-process and would break existing embedding use cases.
+- `sys.stdout` monkey patching remains off the table. It misses native writes and user-assigned streams.
+- Writing IO into a separate JSON file is still unnecessary. The runtime tracing schema already handles IO events.
diff --git a/design-docs/adr/0008-line-aware-io-capture.md b/design-docs/adr/0008-line-aware-io-capture.md
new file mode 100644
index 0000000..2b91f6c
--- /dev/null
+++ b/design-docs/adr/0008-line-aware-io-capture.md
@@ -0,0 +1,46 @@
+# ADR 0008: Line-Aware IO Capture Through Stream Proxies
+
+- **Status:** Proposed
+- **Date:** 2025-01-14
+- **Supersedes:** ADR 0007
+- **Deciders:** Runtime recorder maintainers
+- **Consulted:** Python platform crew, Replay tooling crew
+- **Informed:** DX crew, Release crew
+
+## Context
+- We must attribute every visible chunk of IO to the Python line that triggered it.
+- Pipe-based capture lags behind the interpreter and breaks the ordering with our line events.
+- The refactored recorder already tracks thread snapshots for line events and ships a policy system plus lifecycle hooks.
+- Patching `sys.stdout` and friends is the only way to synchronise output with the active frame without changing how users launch their code.
+
+## Problem
+- We need an in-process IO capture layer that keeps pass-through behaviour, works across the CPython versions we support, and does not swallow our own logs.
+- We must cover writes coming from Python code and from C extensions that call the CPython stream APIs.
+- The solution must restore the original streams even if tracing crashes or the user stops tracing inside a `finally` block.
+
+## Decision
+1. Introduce `runtime::io_capture` with one public type, `IoStreamProxies`. It owns the original `{stdout, stderr, stdin}` objects and exposes `install(py)` / `uninstall(py)` helpers.
+2. Provide three PyO3 classes: `LineAwareStdout`, `LineAwareStderr`, and `LineAwareStdin`. They proxy every method we rely on (`write`, `writelines`, `flush`, `read`, `readline`, `readinto`, iteration).
+3. Each proxy calls back into Rust. The callback grabs the per-thread `LineSnapshot` maintained by the monitoring layer. When the snapshot is missing we record `None` and mark the IO chunk as "detached".
+4. The callback forwards the payload to the original stream right away so the console stays live. We record the chunk in an `IoEventSink` right after the forward call while we still hold the GIL.
+5. `IoEventSink` batches small writes per thread to reduce chatter. A flush, newline, line switch, or time gap of 5 ms emits the batch. When the monitoring layer emits a Step event for the same thread it first flushes the pending IO batch, then writes the Step, then installs the new snapshot. This keeps the Step/IO ordering deterministic: `Step -> IO emitted by that Step -> next Step`.
+6. Extend `RuntimeTracer` with an `io_capture` field. `start_tracing` installs the proxies after the recorder policy authorises IO capture. `stop_tracing` and panic handlers call `teardown_io_capture`, which drains pending batches and restores the original streams.
+7. Guard our own logging with a `ScopedMuteIoCapture` RAII helper. It sets a thread-local flag so proxy callbacks short-circuit when the recorder writes to stderr.
+8. Add an optional best-effort FD mirror for `stdout`/`stderr`. When enabled it duplicates the file descriptors and spawns a reader that only handles writes not seen by the proxies. We track bytes seen by proxies in a per-stream ledger that keeps a FIFO byte buffer plus a monotonic sequence ID. The mirror removes ledger bytes from every read chunk with a streaming diff: scan left to right, skip native bytes, and peel ledger entries whenever their bytes appear, even when native writes arrive first in the chunk. Whatever bytes remain become mirror-only output. The GIL keeps Python `write` calls serial, so proxy order matches the order the OS sees. When native code writes directly to the FD it appears in the diff as leftover bytes we record with a `FdMirror` source tag, and we annotate those events with a `mirror` metadata flag so downstream tooling can highlight fallback captures. (A sketch of this diff follows the decision list.)
+9. Expose a policy flag `policy.io_capture.line_proxies` defaulting to `true`. The FD mirror stays off by default and hides behind `policy.io_capture.fd_fallback`.
+10. Encode captured payloads as raw UTF-8 strings when forwarding to `runtime_tracing`. We trim the old manual base64 layer so downstream tooling, including the Codetracer UI, can consume the bytes without a second decode pass. Non-UTF-8 input falls back to lossless bytes from the mirror or replacement characters when proxies surface decoded text.
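+
+Decision 8's streaming diff is the subtlest part, so here is a minimal sketch of the ledger peel. It is illustrative only: the real ledger also tracks sequence IDs, resets on mismatch, and handles proxy runs that straddle mirror read chunks.
+
+```rust
+use std::collections::VecDeque;
+
+/// FIFO of byte runs the proxies already forwarded to the real stream.
+struct Ledger {
+    runs: VecDeque<Vec<u8>>,
+}
+
+impl Ledger {
+    /// Remove proxy-accounted bytes from `chunk`, returning only the bytes
+    /// that reached the file descriptor without passing through a proxy.
+    fn peel(&mut self, chunk: &[u8]) -> Vec<u8> {
+        let mut native = Vec::new();
+        let mut i = 0;
+        while i < chunk.len() {
+            // Length of the oldest unconsumed proxy run if it starts at `i`.
+            let matched = match self.runs.front() {
+                Some(run) if chunk[i..].starts_with(run) => Some(run.len()),
+                _ => None,
+            };
+            match matched {
+                // Proxy-accounted bytes: consume the run silently.
+                Some(len) => {
+                    i += len;
+                    self.runs.pop_front();
+                }
+                // No run matches here: a native writer produced this byte.
+                None => {
+                    native.push(chunk[i]);
+                    i += 1;
+                }
+            }
+        }
+        native
+    }
+}
+```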
+
+## Consequences
+- **Pros:** We align IO chunks with the current Python frame, match C extensions that honour `sys.stdout`, and keep console behaviour untouched. The design lives inside the existing lifecycle code.
+- **Cons:** The proxies add overhead to every write call and must stay in sync with CPython `TextIOBase`. We have to maintain batching logic and reentrancy guards.
+- **Risks:** Misbehaving third-party code that replaces `sys.stdout` mid-run may bypass us. If the user hands a binary stream to `sys.stdout` we must fall back to passthrough mode. The FD mirror ledger can fall behind if proxies skip `record_proxy_bytes`. We detect the mismatch, reset the ledger, and keep the bytes via the mirror so capture stays lossless even when dedupe fails.
+
+## Rollout
+- Ship behind the new policy flags and an environment override `CODETRACER_CAPTURE_IO=proxies[,fd]`.
+- Keep telemetry for dropped chunks and proxy failures. Emit a single warning when the proxy install fails and we fall back to plain pass-through.
+- Promote this ADR to **Accepted** once the implementation plan ships on Linux and Windows and soak tests confirm line attribution accuracy.
+
+## Alternatives
+- Keep the old pipe-based capture: rejected because it can never align output timing with the interpreter.
+- Subclass Python `io.TextIOWrapper` in pure Python: rejected because the Rust-backed recorder needs control over batching and logging guards inside the GIL.
+- Patch `libc::write` through LD_PRELOAD: rejected as too invasive and brittle across platforms. It also cannot recover the active Python frame.
diff --git a/design-docs/capture-output-implementation-plan.md b/design-docs/capture-output-implementation-plan.md
new file mode 100644
index 0000000..42cfe78
--- /dev/null
+++ b/design-docs/capture-output-implementation-plan.md
@@ -0,0 +1,65 @@
+# Capture Output Implementation Plan
+
+## Goal
+- Ship lossless stdout, stderr, and stdin capture in the Rust recorder without breaking the current CLI flow or error policy.
+
+## Guiding Notes
+- Follow ADR 0007.
+- Keep sentences short for readers; prefer bullets.
+- Run `just test` on every stage.
+
+## Stage 0 – Refactor for IO capture (must land first)
+- Split writer ownership out of `RuntimeTracer` into a helper (`TraceWriterHost`) that exposes a thread-safe event API.
+- Add a small `ThreadSnapshotStore` that records the latest `{path_id, line, frame_id}` per Python thread inside the runtime module.
+- Confirm that `RuntimeTracer::finish` waits on background work hooks; add a stub `IoDrain` trait with a no-op implementation so later stages can slot in real drains.
+- Update `session::start_tracing` and `stop_tracing` to accept optional "extra lifecycle" handles so we can pair start/stop work without more globals.
+- Tests: extend existing runtime unit tests to cover the new snapshot store and confirm start/stop paths still finalise trace files.
+
+## Stage 1 – Build the IO capture core
+- Create `runtime::io_capture` with platform-specific back ends (`unix.rs`, `windows.rs`) hidden behind a common trait.
+- Implement descriptor/handle duplication, pipe install, and reader thread startup (a Unix sketch follows this stage). Use blocking reads and thread-safe queues (`crossbeam-channel`, added to the workspace if it is not already a dependency).
+- Ensure mirror writes go back to the saved descriptors so console output stays live.
+- Tests: add Rust unit tests that fake pipes (use `os_pipe` on Unix, `tempfile` handles on Windows via CI) to confirm duplication and restoration.
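+
+A minimal Unix-only sketch of the descriptor swap this stage describes, assuming the `libc` crate; error handling for `dup`, the stderr path, and the Windows handle variant are omitted:
+
+```rust
+/// Divert fd 1 into a pipe; returns (saved_console_fd, pipe_read_fd).
+fn divert_stdout() -> std::io::Result<(i32, i32)> {
+    unsafe {
+        // Keep a handle on the real console so we can mirror and restore.
+        let saved = libc::dup(1);
+        let mut fds = [0i32; 2];
+        if libc::pipe(fds.as_mut_ptr()) != 0 {
+            return Err(std::io::Error::last_os_error());
+        }
+        // From here on, writes to fd 1 land in the pipe. A reader thread
+        // drains fds[0], records each chunk, and mirrors it to `saved`.
+        libc::dup2(fds[1], 1);
+        libc::close(fds[1]);
+        Ok((saved, fds[0]))
+    }
+}
+```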
+
+## Stage 2 – Connect capture to the tracer
+- Add an `IoEventSink` struct that owns an `Arc<Mutex<TraceWriterHost>>` plus a snapshot reader.
+- Reader threads push `IoChunk` structs (`stream`, `timestamp`, `bytes`, `producer_thread`) into the sink. The sink converts them into runtime tracing events and records them.
+- Use `recorder-errors` for all failures (`usage!` for bad config, `enverr!` for IO problems). Log through the existing logging module; never `println!`.
+- Update `RuntimeTracer::begin` to start the sink when policy allows. Store the `IoCapture` handle and drain it in `finish`.
+- Tests: add integration tests in `tests/` that run a sample script writing to stdout/stderr and reading from stdin, then assert trace files contain the matching events. Verify passthrough stays intact.
+
+## Stage 3 – Policy flag, CLI wiring, and guards
+- Extend `RecorderPolicy` with `io_capture_enabled` plus env var `CODETRACER_CAPTURE_IO`.
+- Make the Python CLI surface a `--capture-io` flag (defaults to policy). Document the flag in help text.
+- Emit a single log line when capture is disabled by policy so users understand why their trace lacks IO events.
+- Tests: Python integration test toggling the policy and checking presence/absence of IO records.
+
+## Stage 4 – Hardening and docs
+- Stress test with large outputs (beyond pipe buffer) and interleaved writes from multiple threads.
+- Run Windows CI to verify handle restore logic and CRLF behaviour.
+- Document the feature in README + design docs. Update ADR status once accepted.
+- Add metrics for dropped IO chunks using the existing logging counters.
+- Tests: extend stress tests plus regression tests for start/stop loops to ensure descriptors always restore.
+
+## Milestones
+1. Stage 0 merged and green CI. Serves as base branch for feature work.
+2. Stages 1–2 merged together behind a feature flag. Feature hidden by default.
+3. Stage 3 flips the flag for opted-in users. Gather feedback.
+4. Stage 4 finishes docs, flips default to on, and promotes ADR 0007 to Accepted.
+
+## Verification Checklist
+- `just test` passes after every stage.
+- New unit tests cover writer host, snapshot store, and IO capture workers.
+- Integration tests assert trace events and passthrough behaviour on Linux and Windows.
+- Manual smoke: run `python -m codetracer_python_recorder examples/stdout_script.py` and confirm console output plus IO trace entries.
+
+## Risks & Mitigations
+- **Deadlocks:** Keep reader threads simple, use bounded channels, and add shutdown timeouts tested in CI.
+- **Performance hit:** Benchmark before and after Stage 2 with large stdout workloads; document results.
+- **Platform drift:** Share the Unix/Windows API contract in a `README` inside the module and guard behaviour with tests.
+
+## Exit Criteria
+- IO events present in trace files when the policy flag is on.
+- Console output unchanged for users.
+- No file descriptor leaks (checked via stress tests and `lsof` in CI scripts).
+- Documentation published and linked from ADR 0007.
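
Both this plan and the ADR above lean on the deterministic `Step -> IO emitted by that Step -> next Step` contract. A toy Python model of that ordering rule; `OrderingSketch` and its method names are illustrative, not recorder APIs.

```python
from collections import defaultdict


class OrderingSketch:
    """Toy event log that flushes a thread's pending IO before each Step."""

    def __init__(self):
        self.events = []                       # ordered trace events
        self._pending = defaultdict(list)      # thread id -> buffered IO payloads

    def record_io(self, thread_id, payload):
        self._pending[thread_id].append(payload)

    def on_step(self, thread_id, path_id, line):
        if self._pending[thread_id]:           # IO belongs to the step that produced it
            self.events.append(("io", thread_id, "".join(self._pending.pop(thread_id))))
        self.events.append(("step", thread_id, path_id, line))


log = OrderingSketch()
log.on_step(1, path_id=7, line=10)
log.record_io(1, "hello\n")                    # emitted while line 10 executes
log.on_step(1, path_id=7, line=11)
assert log.events == [
    ("step", 1, 7, 10),
    ("io", 1, "hello\n"),
    ("step", 1, 7, 11),
]
```

The real sink performs the same pending-flush inside the monitoring callback, before the Step event is written and the new snapshot is installed.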
diff --git a/design-docs/capture-output.md b/design-docs/capture-output.md
deleted file mode 100644
index efa4f03..0000000
--- a/design-docs/capture-output.md
+++ /dev/null
@@ -1,204 +0,0 @@
-# ADR: Non-invasive stdout/stderr/stdin capture and line-level mapping for the Python tracer (PyO3 + runtime_tracing)
-
-**Status**: Accepted
-**Date**: 2025-10-01
-**Owners**: Tracing/Recorder team
-**Scope**: Recorder runtime (Rust/PyO3) and Python instrumentation glue
-
----
-
-## Context
-
-- The user-facing CLI remains `python -m codetracer_python_recorder`.
-- On startup the CLI parses recorder flags, adjusts `sys.argv`, and calls `codetracer_python_recorder.start(...)`, which delegates to the Rust extension.
-- The Rust side already installs a `RuntimeTracer` implementation that subscribes to `sys.monitoring` callbacks and writes runtime_tracing artifacts (`trace.bin`/`trace.json`, `trace_metadata.json`, `trace_paths.json`).
-- We still lack non-invasive capture of stdout/stderr/stdin that aligns each chunk with the trace stream for replay.
-- The original draft assumed the script would be executed from Rust; in practice `runpy.run_path` stays inside the Python entrypoint, so the design must integrate with that lifecycle.
-- The runtime_tracing crate (`TraceWriter`, `TraceLowLevelEvent::Event`, `EventLogKind::*`) already provides primitives for persisting arbitrary IO events; we will rely on it rather than introducing a bespoke artifact.
-
----
-
-## Decision
-
-1. **Lifecycle & CLI compatibility**
-   Retain the existing Python launcher. `python -m codetracer_python_recorder script.py` continues to prepare `sys.argv`, call `start(...)`, execute `runpy.run_path`, and finally `stop()`. Capture plumbing lives entirely inside `start()`/`stop()`, keeping the public API stable.
-
-2. **FD-level capture component**
-   Within `start_tracing` we instantiate an `OutputCapture` controller that duplicates the original stdin/stdout/stderr descriptors (or Windows handles), installs pipes in their place, and spawns draining threads. The Python script still runs in-process, but every write to fd 1/2 is diverted through the controller. Each chunk receives a monotonic timestamp before being queued, and the bytes are simultaneously mirrored back to the preserved descriptors so users continue to see live console output.
-
-3. **stdin strategy**
-   The controller exposes a write-end that the CLI feeds. By default we mirror the user's real stdin into the pipe (teeing so we can persist what was provided) and close it on EOF. Later scripted input can use the same interface.
-
-4. **runtime_tracing integration**
-   `RuntimeTracer` keeps sole ownership of the `NonStreamingTraceWriter`. We wrap it in `Arc<Mutex<NonStreamingTraceWriter>>` so the FD reader threads can call `TraceWriter::add_event(TraceLowLevelEvent::Event(...))`. Each chunk becomes an `EventLogKind::Write` (stdout), `WriteOther` (stderr), or `Read` (stdin) record whose `metadata` is a JSON document (stream, thread ids when known, activation state, step reference, timestamps) and whose `content` carries the base64-encoded bytes. IO data therefore lives in the same `trace.bin`/`trace.json` file as step and call events.
-
-5. **Line attribution**
-   `RuntimeTracer` already handles `LINE` and `PY_*` monitoring callbacks. We extend it to track the most recent `Step` per Python thread and expose a `Snapshot` API returning `(path_id, line, call_key)`.
When an IO chunk arrives we fetch the latest snapshot for the emitting Python thread (or fall back to the global latest step if the writer thread is unknown) and include it in the metadata so replay tooling can align output with execution.
-
-6. **Recorder logging isolation**
-   Logger initialisation continues to route Rust `log` output to the preserved original stderr handle before redirection. Capture threads never use `println!`; they rely on the logger to avoid contaminating user streams.
-
-7. **Buffering & flushing**
-   After installing the pipes we request line buffering on `sys.stdout`/`sys.stderr` when possible. On `stop()` we drain pending chunks, restore the original descriptors, and finish the runtime_tracing writer (`finish_writing_trace_*`).
-
----
-
-## Alternatives Considered
-
-- Monkey-patching `sys.stdout`/`sys.stderr`: misses native writes and conflicts with user overrides.
-- Spawning a subprocess wrapper around the script: breaks the in-process monitoring story and changes CLI semantics.
-- Emitting IO to a bespoke JSON artifact: unnecessary because runtime_tracing already models IO events.
-- Deferring IO capture to a UI component: prevents parity with existing CodeTracer replay capabilities.
-
----
-
-## Consequences
-
-- **Pros**
-  - IO, monitoring events, and metadata share the runtime_tracing stream, so downstream tooling continues to work.
-  - Users keep invoking the CLI exactly the same way.
-  - FD duplication captures writes coming from both Python and native extensions; recorder logging stays isolated.
-
-- **Cons / Risks**
-  - `NonStreamingTraceWriter` must be guarded for cross-thread use; we need to validate performance and correctness when `add_event` is called from background threads.
-  - Mapping IO chunks to the exact Python thread is best-effort because file descriptors do not expose thread identity.
-  - Reader threads must stay ahead of producers to avoid filling pipe buffers; shutdown paths must handle long-running readers.
-
----
-
-## Detailed Design
-
-### A. Execution lifecycle
-
-1. **CLI (Python)**
-   - Parse recorder flags, resolve `script_path`, choose format/path, call `start(trace_dir, format, start_on_enter=script_path)`.
-
-2. **`start_tracing` (Rust)**
-   - Initialise logging and guard against nested sessions.
-   - Ensure the trace directory exists and choose event/meta/path filenames.
-   - Instantiate `RuntimeTracer` with the requested format and activation path.
-   - Wrap the tracer in `Arc<Mutex<RuntimeTracer>>` exposing `emit_io_chunk(...)`.
-   - Construct `OutputCapture`:
-     - Duplicate original fds/handles and store them for restoration and logger use.
-     - Create pipes (Unix: `pipe2`; Windows: `CreatePipe`).
-     - Redirect 0/1/2 via `dup2`/`SetStdHandle`.
-     - Spawn reader threads that block on the pipe, timestamp (`Instant::now()`), gather OS thread id when available, and push `IoChunk { stream, bytes, ts, os_thread }` into a channel while forwarding the same bytes to the saved descriptors to maintain passthrough console behaviour.
-
-3. **Tracing activation**
-   - Install `sys.monitoring` callbacks with `install_tracer(py, Box::new(runtime_tracer_clone))`.
-   - Start a draining thread that consumes `IoChunk`, resolves Python thread ids via a shared `DashMap` (mapping OS thread ids to Python thread ids) maintained by `RuntimeTracer` on `PY_START/PY_RESUME`, and calls `emit_io_chunk`.
-
-4. **Python script execution**
-   - Back in Python, `runpy.run_path(str(script_path), run_name="__main__")` runs the target script. IO already flows through the capture pipelines.
-
-5. **Shutdown**
-   - On normal completion or exception:
-     - Close writer handles to signal EOF.
-     - Join reader/draining threads with a timeout guard.
-     - Restore original descriptors/handles.
-     - Call `flush()`/`finish()` on `RuntimeTracer` and release the monitoring tool id.
-
-### B. Encoding IO chunks with runtime_tracing
-
-- `emit_io_chunk` composes a metadata JSON document similar to:
-
-```json
-{
-  "stream": "stdout",
-  "encoding": "base64",
-  "ts_ns": 1234567890123,
-  "os_thread": 140355679779840,
-  "py_thread": 123,
-  "step": { "path_id": 5, "line": 42, "call_key": 12 },
-  "activation": "active"
-}
-```
-
-- The payload is `base64::encode(chunk.bytes)`.
-- We invoke `TraceWriter::add_event(TraceLowLevelEvent::Event(RecordEvent { kind, metadata, content }))`, where `kind` is `EventLogKind::Write` for stdout, `WriteOther` for stderr, and `Read` for stdin.
-- This keeps IO data inside `trace.bin`/`trace.json`, allowing replay tools to process it alongside steps and returns.
-
-### C. Mapping algorithm
-
-- Maintain a per-thread `DashMap` of snapshots updated on every LINE event. A snapshot stores `(path_id, line, call_key, perf_counter_ns)`.
-- When an IO chunk arrives:
-  - Map OS thread id to Python thread id if possible; otherwise leave it null.
-  - Retrieve the latest snapshot for that Python thread. If the chunk timestamp predates the snapshot by more than a configurable threshold, fall back to the global latest snapshot.
-  - Embed the snapshot in the metadata JSON.
-- For multi-line outputs the drainers keep chunk boundaries (newline-aware if the writer flushes per line) so replay can group contiguous chunks with identical snapshots.
-
----
-
-## Implementation Notes (Unix/Windows)
-
-- **Unix**: rely on the `nix` crate for `pipe2`, `dup`, `dup2`, and `fcntl` (set CLOEXEC). Reader threads use blocking `read` and propagate EOF by pushing a terminal message.
-- **Windows**: use `CreatePipe`, `SetStdHandle`, `DuplicateHandle`, and `ReadFile` in overlapped mode if needed. Convert UTF-16 console output to UTF-8 before encoding. Ensure we re-register the original handles via `SetStdHandle` on teardown. Forwarding threads write mirrored bytes via the saved handles (`WriteFile`) so console output remains visible.
-- PTY support (for interactive shells) can be layered later with `openpty`/ConPTY; initial scope sticks to pipes.
-
----
-
-## Telemetry & Logging
-
-- Keep `env_logger` initialisation on module import. Default to debug for the recorder crate so developers can inspect capture lifecycle logs.
-- Emit debug logs (`install`, `chunk`, `drain_exit`, `restore`) to the preserved stderr handle. Emit failures as `EventLogKind::Error` events with diagnostic metadata when feasible.
-
----
-
-## Key Files
-
-- `codetracer-python-recorder/src/lib.rs`: orchestrates tracer startup/shutdown, trace directory provisioning, and will host creation of the `OutputCapture` component.
-- `codetracer-python-recorder/src/runtime_tracer.rs`: owns the `NonStreamingTraceWriter`, handles `sys.monitoring` callbacks, and will expose APIs for emitting IO events plus step snapshots.
-- `codetracer-python-recorder/src/tracer.rs`: manages registration with `sys.monitoring`; may require adjustments to share thread metadata with the capture pipeline.
-- `codetracer-python-recorder/codetracer_python_recorder/__main__.py`: CLI entrypoint that invokes `start()`/`stop()`; may need updates for new flags or environment toggles.
-- `codetracer-python-recorder/codetracer_python_recorder/api.py`: Python façade over the Rust extension, coordinating session lifecycle and flushing semantics. -- `codetracer-python-recorder/src/output_capture.rs` (new): encapsulates platform-specific descriptor duplication, pipe management, mirroring, and reader threads. -- `codetracer-python-recorder/tests/` (integration tests): will gain coverage asserting IO events appear in traces and that console passthrough remains functional. - ---- - -## Testing Plan - -1. **Unit / small integration** - - Scripts emitting stdout/stderr; assert generated traces contain `EventLogKind::Write`/`WriteOther` records with correct base64 payloads. - - Validate metadata JSON includes expected `path_id` and `line` for simple cases. - -2. **Concurrency** - - Multi-threaded printing to ensure no deadlocks and that chunks remain ordered. - - `asyncio` tasks writing concurrently; confirm snapshots continue to resolve. - -3. **Input** - - `input()` and `sys.stdin.read()` scenarios; ensure `EventLogKind::Read` captures bytes and EOF. - - Passthrough from the real stdin preserves exact bytes. - -4. **Large output & stress** - - Emit payloads larger than the pipe buffer to validate continuous draining. - - Rapid start/stop cycles ensure descriptors restore cleanly. - -5. **Windows** - - Mirror the above coverage on Windows CI runners, focusing on handle restoration and CRLF handling. - ---- - -## Rollout - -- Gate the feature behind an environment flag (`CODETRACER_CAPTURE_IO=1`) for early adopters, removing it after validation. -- Update CLI help text to mention stdout/stderr/stdin capture. -- Add regression tests driven by `just test` that assert on IO events in traces generated from the examples. - ---- - -## Open Questions / Future Work - -- Improve thread attribution by integrating with Python `threading` hooks if OS-level mapping proves insufficient. -- Allow configurable passthrough (e.g., disable mirroring when running headless or redirect to a file) once the default teeing behaviour is in place. -- Investigate PTY support for interactive applications that expect terminal semantics. -- Consider compressing large payloads before base64 encoding to reduce trace sizes. - ---- - -## Acceptance Criteria - -- Running `python -m codetracer_python_recorder script.py` produces runtime_tracing files containing IO events alongside existing step/call records. -- stdout, stderr, and stdin bytes are captured losslessly and attributed to the most relevant step snapshot. -- Original descriptors are restored even on exceptions or early exits. -- Reader threads terminate cleanly without leaks on Unix and Windows. diff --git a/design-docs/io-capture-line-proxy-implementation-plan.md b/design-docs/io-capture-line-proxy-implementation-plan.md new file mode 100644 index 0000000..0663a62 --- /dev/null +++ b/design-docs/io-capture-line-proxy-implementation-plan.md @@ -0,0 +1,58 @@ +# Line-Aware IO Capture Implementation Plan + +This plan replaces the old pipe-based capture plan. Sentences stay short for easy reading. Each stage ends with tests and exit checks. + +## Stage 0 – Prepare runtime hooks +- Add a `LineSnapshotStore` in the runtime crate if it does not already exist. It records `{path_id, line, frame_id, timestamp}` per thread. +- Expose a read-only view of the snapshot store so other modules can fetch the latest entry using a thread id. +- Extend `RuntimeTracer` to keep a handle to the snapshot store. 
Ensure the handle works from monitoring callbacks and from the IO layer.
+- Tests: unit tests for snapshot updates, eviction on thread exit, and concurrent read/write.
+- Exit: recorder still passes `just test`, tracing lifecycle unaffected.
+
+## Stage 1 – Build IO proxy classes
+- Create `runtime::io_capture::proxies` module with the PyO3 proxy structs (`LineAwareStdout`, `LineAwareStderr`, `LineAwareStdin`).
+- Each proxy holds the original stream, a weak pointer to the shared `IoEventSink`, and a reentrancy guard.
+- Implement the Python surface: `write`, `writelines`, `flush` for output; `read`, `readline`, `readinto`, `__iter__`, `__next__` for input. Methods delegate to the original stream.
+- Capture payloads before returning. Store them together with the current thread id.
+- Tests: Rust unit tests using `Python::with_gil` to ensure proxies mirror data, flush behaviour, and error handling. Python-level tests that monkeypatch `sys.stdout` and verify `print` still works.
+- Exit: No leaks when we install/uninstall proxies repeatedly inside tests.
+
+## Stage 2 – Implement IoEventSink and batching
+- Add `IoChunk` struct holding `{stream, payload, thread_id, snapshot, timestamp, flags}`.
+- `IoEventSink` groups writes by thread and stream. Batches flush when we hit newline, explicit flush, a Step boundary, or a 5 ms timer. Use `parking_lot::Mutex` and a `once_cell::sync::Lazy` timer wheel to keep locking simple.
+- Provide a `flush_before_step(thread_id)` API. The monitoring callbacks call it right before they emit a Step event, then record the Step, then update the snapshot store. This enforces the `Step -> IO -> next Step` ordering.
+- Convert chunks into runtime trace events right after batching. Feed raw payload bytes into `runtime_tracing` so consumers do not need to undo an extra base64 layer.
+- Integrate with the recorder error macros for faults (`usage!`, `ioerr!`).
+- Tests: unit tests for batching rules, timer flush, newline handling, guard on recursion, and the Step ordering API.
+- Exit: sink drops zero events during stress tests that flood stdout with short writes.
+
+## Stage 3 – Wire proxies into lifecycle
+- Extend policy with `io_capture` options plus env parsing (`CODETRACER_CAPTURE_IO`).
+- `start_tracing` installs proxies when policy allows. Update the `configure_policy_py` helper so Python callers can set the flags.
+- `stop_tracing` tears down proxies even if tracing never started. Hook teardown into panic/unwind paths.
+- Add `ScopedMuteIoCapture` and use it in logging and error reporting code.
+- Tests: integration test launching a script that prints from Python and a C extension (use `ctypes` calling `PySys_WriteStdout`). Ensure events carry file path and line numbers.
+- Exit: recorded IO events align with line events in the trace format.
+
+## Stage 4 – Optional FD mirror
+- Implement `FdMirror` that duplicates `stdout`/`stderr` descriptors and reads unseen bytes through a background thread.
+- Maintain a per-stream ledger: a FIFO byte buffer plus the next sequence id. Proxy callbacks append `{sequence, bytes}` entries while holding a short lock. The sequence uses an atomic counter so we can spot skipped entries during debugging.
+- Each mirror read walks a streaming diff: scan the chunk left-to-right, skip unmatched native bytes, and peel ledger entries whenever their bytes appear (even when native writes come first). Handle partial matches at chunk boundaries so long entries carry across reads. Whatever bytes remain become the mirror-only payload.
This keeps capture lossless even when native writers interleave unexpectedly.
+- Document why ordering holds in the common case: CPython keeps the GIL during `write`, so proxy order matches FD write order. Native writers run once the proxy releases the GIL, so their bytes appear after any proxy prefixes.
+- The reader tags leftover events as `source = FdMirror` and uses the latest snapshot per thread for attribution. Mark those chunks with a `mirror` flag in the event metadata so downstream tooling can distinguish fallback output.
+- Tie the mirror to `policy.io_capture.fd_fallback`. Default off. Skip on platforms where dup is unsupported.
+- Tests: stress test with `os.write(1, b"...")`, mixed proxy/mirror writes, mismatched sequences, and multiple Python threads. Ensure desync resets leave no duplicates and teardown restores descriptors.
+- Exit: mirror can be toggled at runtime without breaking the proxies.
+
+## Stage 5 – Hardening and docs
+- Add timeouts to teardown so we avoid deadlocks when threads hang.
+- Document the feature in the README and link to ADR 0008.
+- Provide troubleshooting notes (how to detect when another tool replaces `sys.stdout`).
+- Run cross-platform CI (Linux + Windows). Manual smoke: run `python -m codetracer_python_recorder examples/stdout_script.py`.
+- Exit: telemetry shows zero dropped chunks; docs merged; ADR status ready to flip to Accepted.
+
+## Verification Checklist
+- `just test` passes after each stage.
+- New unit and integration tests cover proxies, batching, policy toggles, and FD mirror.
+- Manual smoke checks confirm console output stays live and line references match expectations.
+- Logging guard stops recursive capture of recorder logs.
diff --git a/design-docs/io-capture-line-proxy-implementation-plan.status.md b/design-docs/io-capture-line-proxy-implementation-plan.status.md
new file mode 100644
index 0000000..6454708
--- /dev/null
+++ b/design-docs/io-capture-line-proxy-implementation-plan.status.md
@@ -0,0 +1,29 @@
+# IO Capture Line-Proxy Plan – Status
+
+## Relevant Design Docs
+- `design-docs/adr/0008-line-aware-io-capture.md`
+- `design-docs/io-capture-line-proxy-implementation-plan.md`
+- Prototype reference: `design-docs/prototypes/io_capture_ledger_mirror_prototype.py`
+
+## Key Source Files
+- `codetracer-python-recorder/src/runtime/line_snapshots.rs`
+- `codetracer-python-recorder/src/runtime/mod.rs`
+- `codetracer-python-recorder/src/runtime/io_capture/`
+- Runtime tests that exercise the snapshot lifecycle live in `codetracer-python-recorder/src/runtime/mod.rs`
+
+## Stage Progress
+- ✅ **Stage 0 – Prepare runtime hooks:** Added `LineSnapshotStore` with per-thread records (`path_id`, `line`, `frame_id`, timestamp), wired it into `RuntimeTracer::on_line`, exposed a read-only handle, and covered the store with unit plus integration tests. Cleanup paths clear the store on tracer finish.
+- ✅ **Stage 1 – Build IO proxy classes:** Brought `runtime::io_capture::proxies` into the build, ported the proxy implementations to the PyO3 0.25 Bound/IntoPyObject APIs, and restored the unit tests that verify stdout/stderr passthrough, stdin reads, and the reentrancy guard. `just test` now exercises the proxies end-to-end.
+- ✅ **Stage 2 – Implement IoEventSink and batching:** Added the `IoChunk` model plus an `IoEventSink` batching layer that groups stdout/stderr writes per thread, flushes on newline, explicit `flush()`, step boundaries, and 5 ms gaps, and emits stdin reads immediately.
Updated the proxies to surface flush events and introduced focused unit tests that cover batching, timer splits, step flushes, and stdin capture. `just test` runs the sink tests alongside the existing proxy coverage.
+- ✅ **Stage 3 – Wire proxies into lifecycle:** `RuntimeTracer::install_io_capture` now instantiates the sink, installs the proxies behind the policy flag, and drains and flushes buffered chunks at step and finish boundaries. `IoChunk` records path IDs, frame IDs, and thread IDs sourced from the `LineSnapshotStore`, with a Python stack fallback filling metadata when monitoring snapshots are not yet available. Metadata emitted by `RecordEvent` now includes `path_id`, `line`, and `frame_id` for stdout/stderr chunks, and the Stage 3 integration test passes end-to-end.
+- 🔄 **Stage 4 – Optional FD mirror:** Implemented the shared ledger (`runtime::io_capture::fd_mirror`), plumbed optional `MirrorLedgers`/`FdMirrorController` through `IoEventSink` and `RuntimeTracer::install_io_capture`, and added runtime tests that assert `os.write` payloads are captured only when `io_capture_fd_fallback` is enabled. Next actions: tighten metadata/telemetry (expose mirror stats, warn when descriptor duplication fails) and stress-test concurrent native writers.
+- 🔄 **Stage 5 – Hardening and docs:** Kickoff 2025-10-15. Focus areas: add teardown timeouts for the FD mirror threads, expand README coverage (include the ADR 0008 link plus troubleshooting steps for replaced `sys.stdout`), and capture manual/CI verification notes.
+  - ✅ Added an FD mirror shutdown timeout with a polling helper to detach stuck reader threads, plus unit coverage.
+  - ✅ Documented README guidance (ADR link, mirror flag notes, troubleshooting for replaced `sys.stdout`) and recorded the manual smoke command.
+  - ✅ Logged the full IO capture feature set and hardening updates in `codetracer-python-recorder/CHANGELOG.md`.
+  - ✅ Verification: `just dev test` (Linux) passes with the new mirror timeout tests; the Windows CI regression run is still queued.
+
+## Next Steps
+1. Stage 4: finish FD mirror wiring by adding mirror-only tests (`os.write` / mixed stdout/stderr), surfacing user-facing warnings on setup failure, and documenting the new `mirror` flag in chunk metadata.
+2. Stage 5 follow-up: monitor the queued Windows CI regression run and flip ADR 0008 to Accepted once cross-platform verification lands.
+3. Evaluate the performance impact of the Python stack fallback and gate it behind monitoring snapshots once `sys.monitoring` integration fully drives the snapshot store.
diff --git a/design-docs/prototypes/io_capture_ledger_mirror_prototype.py b/design-docs/prototypes/io_capture_ledger_mirror_prototype.py
new file mode 100644
index 0000000..b44a4ec
--- /dev/null
+++ b/design-docs/prototypes/io_capture_ledger_mirror_prototype.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python3
+"""
+Prototype that exercises the ledger + FD mirror algorithm with multiple Python
+threads and direct ``os.write`` calls. The goal is to show that the ledger lets
+the mirror drop bytes already captured by the proxies while still recording
+native writes that bypass the proxies.
+ +Run with: + + python design-docs/prototypes/io_capture_ledger_mirror_prototype.py +""" + +from __future__ import annotations + +import itertools +import os +import random +import sys +import threading +import time +from collections import deque +from dataclasses import dataclass +from typing import Deque, List, Sequence, Tuple + + +@dataclass +class LedgerEntry: + seq: int + data: bytes + offset: int = 0 + + @property + def remaining(self) -> bytes: + return self.data[self.offset :] + + def consume(self, size: int) -> bytes: + chunk = self.data[self.offset : self.offset + size] + self.offset += size + return chunk + + def is_spent(self) -> bool: + return self.offset >= len(self.data) + + +class Ledger: + """Thread-safe FIFO ledger that stores proxy writes.""" + + def __init__(self) -> None: + self._entries: Deque[LedgerEntry] = deque() + self._lock = threading.Lock() + self._seq = itertools.count() + + def push(self, payload: bytes) -> int: + seq = next(self._seq) + entry = LedgerEntry(seq=seq, data=payload) + with self._lock: + self._entries.append(entry) + return seq + + def subtract_from_chunk(self, chunk: bytes) -> Tuple[bytes, Sequence[Tuple[int, bytes]]]: + """Remove ledger bytes from ``chunk`` while preserving native leftovers. + + Returns (leftover, matched_entries). ``leftover`` contains the bytes that + did not match any ledger entry, in order. ``matched_entries`` keeps the + ledger sequences that were consumed for debugging. + """ + if not chunk: + return b"", () + + leftover = bytearray() + matched: List[Tuple[int, bytes]] = [] + idx = 0 + view = memoryview(chunk) + + with self._lock: + while idx < len(view): + # If the ledger is empty we can append the rest of the chunk. + if not self._entries: + leftover.extend(view[idx:]) + break + + entry = self._entries[0] + remaining = entry.remaining + if not remaining: + self._entries.popleft() + continue + + # Skip native bytes that precede the next ledger entry. + if view[idx] != remaining[0]: + leftover.append(view[idx]) + idx += 1 + continue + + # Full match fits inside the chunk. + full_len = len(remaining) + end_idx = idx + full_len + if end_idx <= len(view) and view[idx:end_idx].tobytes() == remaining: + consumed = entry.consume(full_len) + matched.append((entry.seq, consumed)) + idx = end_idx + if entry.is_spent(): + self._entries.popleft() + continue + + # Partial match at the end of the chunk. + tail = view[idx:].tobytes() + prefix = remaining[: len(tail)] + if tail == prefix: + consumed = entry.consume(len(tail)) + matched.append((entry.seq, consumed)) + idx = len(view) + if entry.is_spent(): + self._entries.popleft() + break + + # The byte matches the ledger start but diverges immediately. + # Treat it as native output. 
+ leftover.append(view[idx]) + idx += 1 + + return (bytes(leftover), tuple(matched)) + + def reset(self) -> None: + with self._lock: + self._entries.clear() + + def pending_bytes(self) -> int: + with self._lock: + return sum(len(entry.remaining) for entry in self._entries) + + +class ProxyStdout: + """Minimal stdout proxy that records writes into the ledger.""" + + def __init__(self, write_fd: int, ledger: Ledger, proxy_events: List[dict]) -> None: + self._write_fd = write_fd + self._ledger = ledger + self._events = proxy_events + self._lock = threading.RLock() + self.encoding = "utf-8" + self.errors = "strict" + self.closed = False + + def write(self, text: str) -> int: + if self.closed: + raise ValueError("write to closed proxy") + if not isinstance(text, str): + text = str(text) + if not text: + return 0 + data = text.encode(self.encoding, self.errors) + with self._lock: + seq = self._ledger.push(data) + self._events.append( + { + "thread": threading.get_ident(), + "seq": seq, + "text": text, + "bytes": data, + } + ) + os.write(self._write_fd, data) + return len(text) + + def writelines(self, lines: Sequence[str]) -> None: + for line in lines: + self.write(line) + + def flush(self) -> None: + # Pipe writes are already flushed. + return + + def fileno(self) -> int: + return self._write_fd + + def isatty(self) -> bool: + return False + + def close(self) -> None: + if not self.closed: + os.close(self._write_fd) + self.closed = True + + +class FdMirror(threading.Thread): + """Background thread that reads from the pipe and drops ledger matches.""" + + def __init__(self, read_fd: int, ledger: Ledger, mirror_events: List[dict]) -> None: + super().__init__(daemon=True) + self._read_fd = read_fd + self._ledger = ledger + self._events = mirror_events + self.matched_bytes = 0 + self.total_bytes = 0 + self._done = threading.Event() + + def run(self) -> None: + try: + while True: + chunk = os.read(self._read_fd, 1024) + if not chunk: + break + self.total_bytes += len(chunk) + leftover, matched = self._ledger.subtract_from_chunk(chunk) + self.matched_bytes += sum(len(bytes_) for _, bytes_ in matched) + if leftover: + payload = leftover.decode("utf-8", errors="replace") + self._events.append( + { + "thread": threading.get_ident(), + "payload": payload, + "raw_bytes": leftover, + "matched_sequences": [seq for seq, _ in matched], + "chunk_size": len(chunk), + "ledger_entries_consumed": len(matched), + } + ) + finally: + self._done.set() + + def wait(self, timeout: float) -> bool: + return self._done.wait(timeout) + + +def run_trial(trial_id: int, *, validate: bool = True) -> dict: + orig_stdout = sys.stdout + proxy_events: List[dict] = [] + mirror_events: List[dict] = [] + native_events: List[str] = [] + native_lock = threading.Lock() + + read_fd, write_fd = os.pipe() + ledger = Ledger() + proxy_stdout = ProxyStdout(write_fd=write_fd, ledger=ledger, proxy_events=proxy_events) + sys.stdout = proxy_stdout + + mirror = FdMirror(read_fd=read_fd, ledger=ledger, mirror_events=mirror_events) + mirror.start() + + random.seed(42 + trial_id) + proxy_threads: List[threading.Thread] = [] + native_threads: List[threading.Thread] = [] + + def proxy_worker(idx: int) -> None: + for i in range(25): + print(f"[proxy {idx}] message {i}") + time.sleep(random.uniform(0.0, 0.003)) + + def native_worker(idx: int) -> None: + for i in range(15): + payload = f"[native {idx}] chunk {i}\n" + with native_lock: + native_events.append(payload) + os.write(write_fd, payload.encode("utf-8")) + time.sleep(random.uniform(0.0, 0.004)) + + 
for idx in range(4): + t = threading.Thread(target=proxy_worker, args=(idx,), name=f"proxy-{idx}") + proxy_threads.append(t) + t.start() + + for idx in range(2): + t = threading.Thread(target=native_worker, args=(idx,), name=f"native-{idx}") + native_threads.append(t) + t.start() + + for t in proxy_threads + native_threads: + t.join() + + # Restore stdout before closing the pipe so that final prints go to the console. + sys.stdout = orig_stdout + + proxy_stdout.close() + mirror.wait(timeout=5.0) + os.close(read_fd) + mirror.join(timeout=0.1) + + native_payload = "".join(native_events) + mirror_payload = "".join(event["payload"] for event in mirror_events) + + result = { + "trial": trial_id, + "proxy_events": len(proxy_events), + "proxy_bytes": sum(len(event["bytes"]) for event in proxy_events), + "native_events": len(native_events), + "native_bytes": sum(len(payload.encode("utf-8")) for payload in native_events), + "mirror_events": len(mirror_events), + "mirror_bytes": sum(len(event["raw_bytes"]) for event in mirror_events), + "ledger_pending": ledger.pending_bytes(), + "mirror_matched_bytes": mirror.matched_bytes, + "mirror_total_bytes": mirror.total_bytes, + } + + if not validate: + result["native_payload"] = native_payload + result["mirror_payload"] = mirror_payload + result["mirror_events_detail"] = mirror_events + result["proxy_events_detail"] = proxy_events + return result + + if native_payload != mirror_payload: + raise AssertionError( + "mirror payload does not match native writes", + native_payload[:200], + mirror_payload[:200], + ) + + if ledger.pending_bytes(): + raise AssertionError(f"ledger still holds {ledger.pending_bytes()} bytes") + + if not native_events: + raise AssertionError("expected at least one native write") + + # Ensure proxy payloads never leak through the mirror events. + for event in proxy_events: + text = event["text"] + if "[proxy" in text and text in mirror_payload: + raise AssertionError("proxy payload leaked into mirror capture", event["text"]) + + return result + + +def main() -> None: + results = [run_trial(trial_id) for trial_id in range(5)] + print("Ledger + FD mirror prototype results:") + for entry in results: + print( + f" trial {entry['trial']}: " + f"proxy_bytes={entry['proxy_bytes']}, " + f"native_bytes={entry['native_bytes']}, " + f"mirror_bytes={entry['mirror_bytes']}, " + f"matched_bytes={entry['mirror_matched_bytes']}, " + f"total_bytes={entry['mirror_total_bytes']}" + ) + print("All trials passed without ledger mismatches.") + + +if __name__ == "__main__": + main() diff --git a/examples/README.md b/examples/README.md index e50b85a..1fb07bb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,7 +13,9 @@ Scripts - generators_async.py: A generator, async function, and async generator. - context_and_closures.py: A context manager and a nested closure. - threading.py: Two threads invoking traced functions and joining. -- imports_side_effects.py: Module‑level side effects vs main guard. +- imports_side_effects.py: Module-level side effects vs main guard. - kwargs_nested.py: Nested kwargs structure to validate structured encoding. +- io_capture.py: Mix of print/write/os.write to validate IO capture proxies. +- stdin_capture.py: Reads piped stdin via input(), readline(), read(). All scripts are deterministic and print minimal output. 
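
The prototype above exercises the ledger diff; the batching rules from the design (emit on newline, explicit `flush()`, a 5 ms idle gap, or a step boundary) are the other half of the sink. A self-contained Python sketch follows; `BatchSketch` is an illustrative name, not the recorder's API.

```python
import time


class BatchSketch:
    """Toy sink batch: emit on newline, explicit flush, or a 5 ms idle gap."""

    IDLE_GAP_S = 0.005  # the 5 ms threshold from the design

    def __init__(self, emit):
        self._emit = emit
        self._buf = []
        self._last = time.monotonic()

    def write(self, text: str) -> None:
        now = time.monotonic()
        if self._buf and now - self._last > self.IDLE_GAP_S:
            self.flush()              # an idle gap closes the previous batch
        self._buf.append(text)
        self._last = now
        if text.endswith("\n"):
            self.flush()              # newline-terminated writes emit at once

    def flush(self) -> None:          # also invoked on explicit flush() and step boundaries
        if self._buf:
            self._emit("".join(self._buf))
            self._buf.clear()


batches = []
sink = BatchSketch(batches.append)
sink.write("partial ")
sink.write("line\n")   # newline ends the first batch
sink.write("tail")
sink.flush()           # explicit flush ends the second batch
assert "".join(batches) == "partial line\ntail"
```

The real `IoEventSink` applies the same rules per thread and per stream, with the step-boundary flush driven by the monitoring callbacks.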
diff --git a/examples/io_capture.py b/examples/io_capture.py
new file mode 100644
index 0000000..d2ece76
--- /dev/null
+++ b/examples/io_capture.py
@@ -0,0 +1,30 @@
+"""Exercise IO capture proxies with mixed stdout/stderr writes."""
+from __future__ import annotations
+
+import os
+import sys
+
+
+def emit_high_level() -> None:
+    print("stdout via print()", flush=True)
+    sys.stdout.write("stdout via sys.stdout.write()\n")
+    sys.stdout.flush()
+
+    sys.stderr.write("stderr via sys.stderr.write()\n")
+    sys.stderr.flush()
+
+
+def emit_low_level() -> None:
+    os.write(1, b"stdout via os.write()\n")
+    os.write(2, b"stderr via os.write()\n")
+
+
+def main() -> None:
+    print("Demonstrating Codetracer IO capture behaviour")
+    emit_high_level()
+    emit_low_level()
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/stdin_capture.py b/examples/stdin_capture.py
new file mode 100644
index 0000000..c3c5b70
--- /dev/null
+++ b/examples/stdin_capture.py
@@ -0,0 +1,43 @@
+"""Exercise stdin capture with sequential reads."""
+from __future__ import annotations
+
+import sys
+
+
+def _describe(label: str, value: str | None) -> None:
+    if value is None:
+        print(f"{label}: <eof>")
+    elif value == "":
+        print(f"{label}: <empty>")
+    else:
+        print(f"{label}: {value!r}")
+
+
+def main() -> None:
+    if sys.stdin.isatty():
+        print("stdin is attached to a TTY; pipe data to exercise capture.")
+        print(
+            "Example: printf 'first\\nsecond\\nthird' | "
+            "python -m codetracer_python_recorder examples/stdin_capture.py"
+        )
+        return
+
+    print("Inspecting stdin via input()/readline()/read()")
+
+    try:
+        first = input()
+    except EOFError:
+        first = None
+    _describe("input()", first)
+
+    second = sys.stdin.readline()
+    _describe("sys.stdin.readline()", second if second else None)
+
+    remaining = sys.stdin.read()
+    _describe("sys.stdin.read()", remaining if remaining else None)
+
+    print("Done reading from stdin.")
+
+
+if __name__ == "__main__":
+    main()