diff --git a/codetracer-python-recorder/codetracer_python_recorder/cli.py b/codetracer-python-recorder/codetracer_python_recorder/cli.py index c6e4793..105e427 100644 --- a/codetracer-python-recorder/codetracer_python_recorder/cli.py +++ b/codetracer-python-recorder/codetracer_python_recorder/cli.py @@ -287,7 +287,7 @@ def main(argv: Iterable[str] | None = None) -> int: try: flush() finally: - stop() + stop(exit_code=exit_code) sys.argv = old_argv _serialise_metadata(trace_dir, script=script_path) diff --git a/codetracer-python-recorder/codetracer_python_recorder/session.py b/codetracer-python-recorder/codetracer_python_recorder/session.py index d067954..f7604aa 100644 --- a/codetracer-python-recorder/codetracer_python_recorder/session.py +++ b/codetracer-python-recorder/codetracer_python_recorder/session.py @@ -38,10 +38,17 @@ def __init__(self, path: Path, format: str) -> None: self.path = path self.format = format - def stop(self) -> None: - """Stop this trace session.""" + def stop(self, *, exit_code: int | None = None) -> None: + """Stop this trace session. + + Parameters + ---------- + exit_code: + Optional process exit status to forward to the recorder backend. + When ``None``, the session shutdown reason remains unspecified. + """ if _active_session is self: - stop() + stop(exit_code=exit_code) def flush(self) -> None: """Flush buffered trace data for this session.""" @@ -51,6 +58,7 @@ def __enter__(self) -> "TraceSession": return self def __exit__(self, exc_type, exc, tb) -> None: # pragma: no cover - thin wrapper + # Exit codes are not tracked for context-managed sessions; report unknown. self.stop() @@ -121,12 +129,18 @@ def start( return session -def stop() -> None: - """Stop the active trace session if one is running.""" +def stop(*, exit_code: int | None = None) -> None: + """Stop the active trace session if one is running. + + Parameters + ---------- + exit_code: + Optional process exit status to forward to the backend. + """ global _active_session if not _is_tracing_backend(): return - _stop_backend() + _stop_backend(exit_code) _active_session = None diff --git a/codetracer-python-recorder/src/monitoring/api.rs b/codetracer-python-recorder/src/monitoring/api.rs index 0bc02c2..fee4283 100644 --- a/codetracer-python-recorder/src/monitoring/api.rs +++ b/codetracer-python-recorder/src/monitoring/api.rs @@ -107,6 +107,15 @@ pub trait Tracer: Send + Any { Ok(()) } + /// Provide the process exit status ahead of tracer teardown. + fn set_exit_status( + &mut self, + _py: Python<'_>, + _exit_code: Option, + ) -> PyResult<()> { + Ok(()) + } + /// Called on resumption of a generator/coroutine (not via throw()). fn on_py_resume( &mut self, diff --git a/codetracer-python-recorder/src/monitoring/install.rs b/codetracer-python-recorder/src/monitoring/install.rs index 22afd4f..b51dcb0 100644 --- a/codetracer-python-recorder/src/monitoring/install.rs +++ b/codetracer-python-recorder/src/monitoring/install.rs @@ -82,3 +82,11 @@ pub fn flush_installed_tracer(py: Python<'_>) -> PyResult<()> { } Ok(()) } + +/// Provide the session exit status to the active tracer if one is installed. +pub fn update_exit_status(py: Python<'_>, exit_code: Option) -> PyResult<()> { + if let Some(global) = GLOBAL.lock().unwrap().as_mut() { + global.tracer.set_exit_status(py, exit_code)?; + } + Ok(()) +} diff --git a/codetracer-python-recorder/src/monitoring/mod.rs b/codetracer-python-recorder/src/monitoring/mod.rs index dffdd92..a582262 100644 --- a/codetracer-python-recorder/src/monitoring/mod.rs +++ b/codetracer-python-recorder/src/monitoring/mod.rs @@ -10,7 +10,12 @@ pub(crate) mod install; pub mod tracer; pub use api::Tracer; -pub use install::{flush_installed_tracer, install_tracer, uninstall_tracer}; +pub use install::{ + flush_installed_tracer, + install_tracer, + uninstall_tracer, + update_exit_status, +}; const MONITORING_TOOL_NAME: &str = "codetracer"; diff --git a/codetracer-python-recorder/src/runtime/tracer/events.rs b/codetracer-python-recorder/src/runtime/tracer/events.rs index c96524e..dfcaea9 100644 --- a/codetracer-python-recorder/src/runtime/tracer/events.rs +++ b/codetracer-python-recorder/src/runtime/tracer/events.rs @@ -1,6 +1,5 @@ //! Event handling pipeline for `RuntimeTracer`. -use super::filtering::TraceDecision; use super::runtime_tracer::RuntimeTracer; use crate::code_object::CodeObjectWrapper; use crate::ffi; @@ -177,18 +176,8 @@ impl Tracer for RuntimeTracer { code: &CodeObjectWrapper, _offset: i32, ) -> CallbackResult { - let is_active = self - .lifecycle - .activation_mut() - .should_process_event(py, code); - if matches!( - self.should_trace_code(py, code), - TraceDecision::SkipAndDisable - ) { - return Ok(CallbackOutcome::Continue); - } - if !is_active { - return Ok(CallbackOutcome::Continue); + if let Some(outcome) = self.evaluate_gate(py, code, true) { + return Ok(outcome); } if should_inject_failure(FailureStage::PyStart) { @@ -244,18 +233,8 @@ impl Tracer for RuntimeTracer { code: &CodeObjectWrapper, _offset: i32, ) -> CallbackResult { - let is_active = self - .lifecycle - .activation_mut() - .should_process_event(py, code); - if matches!( - self.should_trace_code(py, code), - TraceDecision::SkipAndDisable - ) { - return Ok(CallbackOutcome::Continue); - } - if !is_active { - return Ok(CallbackOutcome::Continue); + if let Some(outcome) = self.evaluate_gate(py, code, false) { + return Ok(outcome); } log_event(py, code, "on_py_resume", None); @@ -264,18 +243,8 @@ impl Tracer for RuntimeTracer { } fn on_line(&mut self, py: Python<'_>, code: &CodeObjectWrapper, lineno: u32) -> CallbackResult { - let is_active = self - .lifecycle - .activation_mut() - .should_process_event(py, code); - if matches!( - self.should_trace_code(py, code), - TraceDecision::SkipAndDisable - ) { - return Ok(CallbackOutcome::Continue); - } - if !is_active { - return Ok(CallbackOutcome::Continue); + if let Some(outcome) = self.evaluate_gate(py, code, false) { + return Ok(outcome); } if should_inject_failure(FailureStage::Line) { @@ -378,18 +347,8 @@ impl Tracer for RuntimeTracer { _offset: i32, exception: &Bound<'_, PyAny>, ) -> CallbackResult { - let is_active = self - .lifecycle - .activation_mut() - .should_process_event(py, code); - if matches!( - self.should_trace_code(py, code), - TraceDecision::SkipAndDisable - ) { - return Ok(CallbackOutcome::Continue); - } - if !is_active { - return Ok(CallbackOutcome::Continue); + if let Some(outcome) = self.evaluate_gate(py, code, false) { + return Ok(outcome); } log_event(py, code, "on_py_throw", None); @@ -439,8 +398,17 @@ impl Tracer for RuntimeTracer { ) } + fn set_exit_status( + &mut self, + _py: Python<'_>, + exit_code: Option, + ) -> PyResult<()> { + self.record_exit_status(exit_code); + Ok(()) + } + fn notify_failure(&mut self, _py: Python<'_>) -> PyResult<()> { - self.mark_failure(); + self.mark_disabled(); Ok(()) } @@ -484,9 +452,14 @@ impl Tracer for RuntimeTracer { self.mark_event(); } + self.emit_session_exit(py); + + let exit_summary = self.exit_summary(); + if self.lifecycle.encountered_failure() { if policy.keep_partial_trace { - if let Err(err) = self.lifecycle.finalise(&mut self.writer, &self.filter) { + if let Err(err) = self.lifecycle.finalise(&mut self.writer, &self.filter, &exit_summary) + { with_error_code(ErrorCode::TraceIncomplete, || { log::warn!( "failed to finalise partial trace after disable: {}", @@ -519,7 +492,7 @@ impl Tracer for RuntimeTracer { .require_trace_or_fail(&policy) .map_err(ffi::map_recorder_error)?; self.lifecycle - .finalise(&mut self.writer, &self.filter) + .finalise(&mut self.writer, &self.filter, &exit_summary) .map_err(ffi::map_recorder_error)?; self.function_ids.clear(); self.module_names.clear(); @@ -553,22 +526,8 @@ impl RuntimeTracer { exit_kind: Option, allow_disable: bool, ) -> CallbackResult { - let is_active = self - .lifecycle - .activation_mut() - .should_process_event(py, code); - if matches!( - self.should_trace_code(py, code), - TraceDecision::SkipAndDisable - ) { - return Ok(if allow_disable { - CallbackOutcome::DisableLocation - } else { - CallbackOutcome::Continue - }); - } - if !is_active { - return Ok(CallbackOutcome::Continue); + if let Some(outcome) = self.evaluate_gate(py, code, allow_disable) { + return Ok(outcome); } log_event(py, code, label, None); diff --git a/codetracer-python-recorder/src/runtime/tracer/lifecycle.rs b/codetracer-python-recorder/src/runtime/tracer/lifecycle.rs index 24ca643..6cfa6bb 100644 --- a/codetracer-python-recorder/src/runtime/tracer/lifecycle.rs +++ b/codetracer-python-recorder/src/runtime/tracer/lifecycle.rs @@ -6,6 +6,8 @@ use crate::runtime::activation::ActivationController; use crate::runtime::io_capture::ScopedMuteIoCapture; use crate::runtime::output_paths::TraceOutputPaths; use crate::runtime::tracer::filtering::FilterCoordinator; +use crate::runtime::tracer::runtime_tracer::ExitSummary; +use log::debug; use recorder_errors::{enverr, usage, ErrorCode, RecorderResult}; use runtime_tracing::{NonStreamingTraceWriter, TraceWriter}; use serde_json::{self, json}; @@ -105,12 +107,12 @@ impl LifecycleController { &mut self, writer: &mut NonStreamingTraceWriter, filter: &FilterCoordinator, + exit_summary: &ExitSummary, ) -> RecorderResult<()> { TraceWriter::finish_writing_trace_metadata(writer).map_err(|err| { enverr!(ErrorCode::Io, "failed to finalise trace metadata") .with_context("source", err.to_string()) })?; - self.append_filter_metadata(filter)?; TraceWriter::finish_writing_trace_paths(writer).map_err(|err| { enverr!(ErrorCode::Io, "failed to finalise trace paths") .with_context("source", err.to_string()) @@ -119,6 +121,9 @@ impl LifecycleController { enverr!(ErrorCode::Io, "failed to finalise trace events") .with_context("source", err.to_string()) })?; + debug!("[Lifecycle] writing exit metadata: code={:?}, label={:?}", exit_summary.code, exit_summary.label); + self.append_filter_metadata(filter)?; + self.append_exit_metadata(exit_summary)?; Ok(()) } @@ -132,6 +137,49 @@ impl LifecycleController { self.encountered_failure = false; } + fn append_exit_metadata(&self, exit_summary: &ExitSummary) -> RecorderResult<()> { + let Some(outputs) = &self.output_paths else { + return Ok(()); + }; + + let path = outputs.metadata(); + let original = fs::read_to_string(path).map_err(|err| { + enverr!(ErrorCode::Io, "failed to read trace metadata") + .with_context("path", path.display().to_string()) + .with_context("source", err.to_string()) + })?; + + let mut metadata: serde_json::Value = serde_json::from_str(&original).map_err(|err| { + enverr!(ErrorCode::Io, "failed to parse trace metadata JSON") + .with_context("path", path.display().to_string()) + .with_context("source", err.to_string()) + })?; + + if let serde_json::Value::Object(ref mut obj) = metadata { + let status = json!({ + "code": exit_summary.code, + "label": exit_summary.label, + }); + obj.insert("process_exit_status".to_string(), status); + let serialised = serde_json::to_string(&metadata).map_err(|err| { + enverr!(ErrorCode::Io, "failed to serialise trace metadata") + .with_context("path", path.display().to_string()) + .with_context("source", err.to_string()) + })?; + fs::write(path, serialised).map_err(|err| { + enverr!(ErrorCode::Io, "failed to write trace metadata") + .with_context("path", path.display().to_string()) + .with_context("source", err.to_string()) + })?; + Ok(()) + } else { + Err( + enverr!(ErrorCode::Io, "trace metadata must be a JSON object") + .with_context("path", path.display().to_string()), + ) + } + } + fn append_filter_metadata(&self, filter: &FilterCoordinator) -> RecorderResult<()> { let Some(outputs) = &self.output_paths else { return Ok(()); diff --git a/codetracer-python-recorder/src/runtime/tracer/runtime_tracer.rs b/codetracer-python-recorder/src/runtime/tracer/runtime_tracer.rs index 8560f66..b85de47 100644 --- a/codetracer-python-recorder/src/runtime/tracer/runtime_tracer.rs +++ b/codetracer-python-recorder/src/runtime/tracer/runtime_tracer.rs @@ -5,19 +5,116 @@ use super::lifecycle::LifecycleController; use crate::code_object::CodeObjectWrapper; use crate::ffi; use crate::module_identity::{ModuleIdentityCache, ModuleNameHints}; +use crate::monitoring::CallbackOutcome; use crate::policy::RecorderPolicy; use crate::runtime::io_capture::{IoCaptureSettings, ScopedMuteIoCapture}; use crate::runtime::line_snapshots::LineSnapshotStore; use crate::runtime::output_paths::TraceOutputPaths; +use crate::runtime::value_encoder::encode_value; use crate::trace_filter::engine::TraceFilterEngine; use pyo3::prelude::*; +use pyo3::types::{PyAny, PyInt, PyString}; use runtime_tracing::NonStreamingTraceWriter; use runtime_tracing::{Line, TraceEventsFileFormat, TraceWriter}; +use std::borrow::Cow; use std::collections::HashMap; use std::path::Path; use std::sync::Arc; use std::thread::ThreadId; +#[derive(Debug)] +enum ExitPayload { + Code(i32), + Text(Cow<'static, str>), +} + +#[derive(Clone, Debug, Default)] +pub(crate) struct ExitSummary { + pub code: Option, + pub label: Option, +} + +impl ExitPayload { + fn is_code(&self) -> bool { + matches!(self, ExitPayload::Code(_)) + } + + #[cfg(test)] + fn is_text(&self, text: &str) -> bool { + matches!(self, ExitPayload::Text(current) if current.as_ref() == text) + } +} + +#[derive(Debug)] +struct SessionExitState { + payload: ExitPayload, + emitted: bool, +} + +impl Default for SessionExitState { + fn default() -> Self { + Self { + payload: ExitPayload::Text(Cow::Borrowed("")), + emitted: false, + } + } +} + +impl SessionExitState { + fn set_exit_code(&mut self, exit_code: Option) { + if self.can_override_with_code() { + self.payload = exit_code + .map(ExitPayload::Code) + .unwrap_or_else(|| ExitPayload::Text(Cow::Borrowed(""))); + } + } + + fn mark_disabled(&mut self) { + if !self.payload.is_code() { + self.payload = ExitPayload::Text(Cow::Borrowed("")); + } + } + + #[cfg(test)] + fn mark_failure(&mut self) { + if !self.payload.is_code() && !self.payload.is_text("") { + self.payload = ExitPayload::Text(Cow::Borrowed("")); + } + } + + fn can_override_with_code(&self) -> bool { + matches!(&self.payload, ExitPayload::Text(current) if current.as_ref() == "") + } + + fn as_bound<'py>(&self, py: Python<'py>) -> Bound<'py, PyAny> { + match &self.payload { + ExitPayload::Code(value) => PyInt::new(py, *value).into_any(), + ExitPayload::Text(text) => PyString::new(py, text.as_ref()).into_any(), + } + } + + fn mark_emitted(&mut self) { + self.emitted = true; + } + + fn is_emitted(&self) -> bool { + self.emitted + } + + fn summary(&self) -> ExitSummary { + match &self.payload { + ExitPayload::Code(value) => ExitSummary { + code: Some(*value), + label: None, + }, + ExitPayload::Text(text) => ExitSummary { + code: None, + label: Some(text.as_ref().to_string()), + }, + } + } +} + /// Minimal runtime tracer that maps Python sys.monitoring events to /// runtime_tracing writer operations. pub struct RuntimeTracer { @@ -28,6 +125,7 @@ pub struct RuntimeTracer { pub(super) io: IoCoordinator, pub(super) filter: FilterCoordinator, pub(super) module_names: ModuleIdentityCache, + session_exit: SessionExitState, } impl RuntimeTracer { @@ -49,6 +147,7 @@ impl RuntimeTracer { io: IoCoordinator::new(), filter: FilterCoordinator::new(trace_filter), module_names: ModuleIdentityCache::new(), + session_exit: SessionExitState::default(), } } @@ -78,6 +177,18 @@ impl RuntimeTracer { } } + pub(super) fn emit_session_exit(&mut self, py: Python<'_>) { + if self.session_exit.is_emitted() { + return; + } + + self.flush_pending_io(); + let value = self.session_exit.as_bound(py); + let record = encode_value(py, &mut self.writer, &value); + TraceWriter::register_return(&mut self.writer, record); + self.session_exit.mark_emitted(); + } + /// Configure output files and write initial metadata records. pub fn begin(&mut self, outputs: &TraceOutputPaths, start_line: u32) -> PyResult<()> { self.lifecycle @@ -95,10 +206,48 @@ impl RuntimeTracer { self.lifecycle.mark_event(); } + #[cfg(test)] pub(super) fn mark_failure(&mut self) { + self.session_exit.mark_failure(); + self.lifecycle.mark_failure(); + } + + pub(super) fn mark_disabled(&mut self) { + self.session_exit.mark_disabled(); self.lifecycle.mark_failure(); } + pub(super) fn record_exit_status(&mut self, exit_code: Option) { + self.session_exit.set_exit_code(exit_code); + } + + pub(super) fn exit_summary(&self) -> ExitSummary { + self.session_exit.summary() + } + + pub(super) fn evaluate_gate( + &mut self, + py: Python<'_>, + code: &CodeObjectWrapper, + allow_disable: bool, + ) -> Option { + let is_active = self.lifecycle.activation_mut().should_process_event(py, code); + if matches!( + self.should_trace_code(py, code), + TraceDecision::SkipAndDisable + ) { + return Some(if allow_disable { + CallbackOutcome::DisableLocation + } else { + CallbackOutcome::Continue + }); + } + if !is_active { + return Some(CallbackOutcome::Continue); + } + None + } + pub(super) fn ensure_function_id( &mut self, py: Python<'_>, @@ -1326,6 +1475,45 @@ initializer("omega") }); } + #[test] + fn finish_emits_toplevel_return_with_exit_code() { + Python::with_gil(|py| { + reset_policy(py); + + let script_dir = tempfile::tempdir().expect("script dir"); + let program_path = script_dir.path().join("program.py"); + std::fs::write(&program_path, "print('hi')\n").expect("write program"); + + let outputs_dir = tempfile::tempdir().expect("outputs dir"); + let outputs = TraceOutputPaths::new(outputs_dir.path(), TraceEventsFileFormat::Json); + + let mut tracer = RuntimeTracer::new( + program_path.to_string_lossy().as_ref(), + &[], + TraceEventsFileFormat::Json, + None, + None, + ); + tracer.begin(&outputs, 1).expect("begin tracer"); + tracer.record_exit_status(Some(7)); + + tracer.finish(py).expect("finish tracer"); + + let mut exit_value: Option = None; + for event in &tracer.writer.events { + if let TraceLowLevelEvent::Return(record) = event { + exit_value = Some(record.return_value.clone()); + } + } + + let exit_value = exit_value.expect("expected toplevel return value"); + match exit_value { + ValueRecord::Int { i, .. } => assert_eq!(i, 7), + other => panic!("expected integer exit value, got {other:?}"), + } + }); + } + #[test] fn trace_filter_metadata_includes_summary() { Python::with_gil(|py| { diff --git a/codetracer-python-recorder/src/session.rs b/codetracer-python-recorder/src/session.rs index b57ceea..cd19cda 100644 --- a/codetracer-python-recorder/src/session.rs +++ b/codetracer-python-recorder/src/session.rs @@ -10,7 +10,12 @@ use recorder_errors::{usage, ErrorCode}; use crate::ffi; use crate::logging::init_rust_logging_with_default; -use crate::monitoring::{flush_installed_tracer, install_tracer, uninstall_tracer}; +use crate::monitoring::{ + flush_installed_tracer, + install_tracer, + uninstall_tracer, + update_exit_status, +}; use crate::policy::policy_snapshot; use crate::runtime::{RuntimeTracer, TraceOutputPaths}; use bootstrap::TraceSessionBootstrap; @@ -73,10 +78,11 @@ pub fn start_tracing( } /// Stop tracing by resetting the global flag. -#[pyfunction] -pub fn stop_tracing() -> PyResult<()> { +#[pyfunction(signature = (exit_code=None))] +pub fn stop_tracing(exit_code: Option) -> PyResult<()> { ffi::wrap_pyfunction("stop_tracing", || { Python::with_gil(|py| { + update_exit_status(py, exit_code)?; // Uninstall triggers finish() on tracer implementation. uninstall_tracer(py)?; ACTIVE.store(false, Ordering::SeqCst); diff --git a/codetracer-python-recorder/tests/python/test_exit_payloads.py b/codetracer-python-recorder/tests/python/test_exit_payloads.py new file mode 100644 index 0000000..1473b29 --- /dev/null +++ b/codetracer-python-recorder/tests/python/test_exit_payloads.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +from codetracer_python_recorder.trace_balance import load_trace_events + + +def _last_return_value(trace_dir: Path) -> dict[str, object]: + events = load_trace_events(trace_dir / "trace.json") + for event in reversed(events): + payload = event.get("Return") + if payload is not None: + return payload["return_value"] + raise AssertionError("trace did not contain any Return events") + + +def test_cli_records_exit_code_in_toplevel_return(tmp_path: Path) -> None: + script = tmp_path / "exit_script.py" + script.write_text( + "import sys\n" + "sys.exit(3)\n", + encoding="utf-8", + ) + + trace_dir = tmp_path / "trace" + result = subprocess.run( + [ + sys.executable, + "-m", + "codetracer_python_recorder", + "--trace-dir", + str(trace_dir), + "--format", + "json", + str(script), + ], + capture_output=True, + text=True, + check=False, + ) + + assert result.returncode == 3, result.stderr + exit_value = _last_return_value(trace_dir) + assert exit_value["kind"] == "Int" + assert exit_value["i"] == 3 + + metadata = json.loads((trace_dir / "trace_metadata.json").read_text(encoding="utf-8")) + status = metadata.get("process_exit_status") + assert status == {"code": 3, "label": None} + + +def test_default_exit_payload_uses_placeholder(tmp_path: Path) -> None: + trace_dir = tmp_path / "trace" + trace_dir.mkdir() + + # Directly call the start/stop API without providing an exit code. + script = ( + "import json\n" + "from pathlib import Path\n" + "import codetracer_python_recorder as codetracer\n" + f"trace_dir = Path({json.dumps(str(trace_dir))!s})\n" + "session = codetracer.start(trace_dir, format='json')\n" + "session.stop()\n" + ) + subprocess.run([sys.executable, "-c", script], check=True) + + exit_value = _last_return_value(trace_dir) + assert exit_value["kind"] == "String" + assert exit_value["text"] == "" + + metadata = json.loads((trace_dir / "trace_metadata.json").read_text(encoding="utf-8")) + status = metadata.get("process_exit_status") + assert status == {"code": None, "label": ""} diff --git a/codetracer-python-recorder/tests/python/test_trace_balance.py b/codetracer-python-recorder/tests/python/test_trace_balance.py index 65c4680..bafbcdc 100644 --- a/codetracer-python-recorder/tests/python/test_trace_balance.py +++ b/codetracer-python-recorder/tests/python/test_trace_balance.py @@ -1,10 +1,13 @@ import json import importlib.util +import runpy from pathlib import Path from typing import List, Mapping import pytest +import codetracer_python_recorder as codetracer + from codetracer_python_recorder.trace_balance import ( TraceBalanceError, TraceBalanceResult, @@ -99,3 +102,79 @@ def test_cli_reports_success_for_balanced_trace(tmp_path: Path, capsys: pytest.C assert exit_code == 0 assert "Balanced trace" in captured.out + + +def test_activation_and_filter_skip_still_balances_trace(tmp_path: Path) -> None: + script = tmp_path / "app.py" + script.write_text( + """ +def side_effect(): + for _ in range(3): + pass + +if __name__ == "__main__": + side_effect() +""", + encoding="utf-8", + ) + + filter_file = tmp_path / "skip.toml" + filter_file.write_text( + """ +[meta] +name = "skip-main" +version = 1 + +[scope] +default_exec = "trace" +default_value_action = "allow" + +[[scope.rules]] +selector = "pkg:__main__" +exec = "skip" +value_default = "allow" +""", + encoding="utf-8", + ) + + trace_dir = tmp_path / "trace" + + session = codetracer.start( + trace_dir, + format="json", + start_on_enter=script, + trace_filter=[filter_file], + ) + try: + runpy.run_path(str(script), run_name="__main__") + finally: + session.stop() + + events = load_trace_events(trace_dir / "trace.json") + function_names: dict[int, str] = {} + next_function_id = 0 + for event in events: + payload = event.get("Function") + if payload: + function_names[next_function_id] = payload.get("name", "") + next_function_id += 1 + + toplevel_ids = {fid for fid, name in function_names.items() if name == ""} + assert len(toplevel_ids) == 1, f"expected single toplevel function, saw {toplevel_ids}" + + toplevel_call_count = sum( + 1 + for event in events + if "Call" in event and event["Call"].get("function_id") in toplevel_ids + ) + assert toplevel_call_count == 1 + + exit_returns = [ + event["Return"] + for event in events + if "Return" in event and event["Return"].get("return_value", {}).get("text") == "" + ] + assert len(exit_returns) == 1 + + script_names = {name for name in function_names.values() if name not in {""}} + assert "side_effect" not in script_names diff --git a/codetracer-python-recorder/tests/python/unit/test_auto_start.py b/codetracer-python-recorder/tests/python/unit/test_auto_start.py index 365c62a..939199f 100644 --- a/codetracer-python-recorder/tests/python/unit/test_auto_start.py +++ b/codetracer-python-recorder/tests/python/unit/test_auto_start.py @@ -38,7 +38,7 @@ def fake_start_backend( state["active"] = True captured_filters.append(filters) - def fake_stop_backend() -> None: + def fake_stop_backend(exit_code: int | None = None) -> None: state["active"] = False monkeypatch.setenv(auto_start.ENV_TRACE_PATH, str(trace_dir)) diff --git a/codetracer-python-recorder/tests/python/unit/test_session_helpers.py b/codetracer-python-recorder/tests/python/unit/test_session_helpers.py index 62cab9a..443acd3 100644 --- a/codetracer-python-recorder/tests/python/unit/test_session_helpers.py +++ b/codetracer-python-recorder/tests/python/unit/test_session_helpers.py @@ -63,13 +63,13 @@ def clear_active_session() -> None: def test_trace_session_stop_clears_global(monkeypatch: pytest.MonkeyPatch) -> None: - called = {"stop": False, "start": False} + called = {"stop_exit_codes": [], "start": False} def fake_start(*args, **kwargs) -> None: called["start"] = True - def fake_stop() -> None: - called["stop"] = True + def fake_stop(exit_code: int | None = None) -> None: + called["stop_exit_codes"].append(exit_code) monkeypatch.setattr(session, "_start_backend", fake_start) monkeypatch.setattr(session, "_stop_backend", fake_stop) @@ -78,7 +78,7 @@ def fake_stop() -> None: session._active_session = session.TraceSession(path=Path("/tmp"), format="json") session.stop() assert session._active_session is None - assert called["stop"] is True + assert called["stop_exit_codes"] == [None] def test_trace_session_flush_noop_when_inactive(monkeypatch: pytest.MonkeyPatch) -> None: @@ -107,9 +107,9 @@ def fake_start( trace_state["active"] = True calls["start"].append((Path(path), fmt, activation, filters)) - def fake_stop() -> None: + def fake_stop(exit_code: int | None = None) -> None: trace_state["active"] = False - calls["stop"].append(True) + calls["stop"].append(exit_code) monkeypatch.setattr(session, "_start_backend", fake_start) monkeypatch.setattr(session, "_stop_backend", fake_stop) @@ -124,7 +124,19 @@ def fake_stop() -> None: assert handle.format == "binary" assert calls["start"] == [(target, "binary", None, None)] - assert calls["stop"] == [True] + assert calls["stop"] == [None] + + +def test_stop_forwards_exit_code(monkeypatch: pytest.MonkeyPatch) -> None: + captured: list[int | None] = [] + + monkeypatch.setattr(session, "_is_tracing_backend", lambda: True) + monkeypatch.setattr(session, "_stop_backend", lambda code=None: captured.append(code)) + + session._active_session = session.TraceSession(path=Path("/tmp"), format="json") + session.stop(exit_code=123) + assert captured == [123] + assert session._active_session is None def test_normalize_trace_filter_handles_none() -> None: diff --git a/design-docs/adr/0015-balanced-toplevel-lifecycle-and-trace-gating.md b/design-docs/adr/0015-balanced-toplevel-lifecycle-and-trace-gating.md new file mode 100644 index 0000000..66e16b7 --- /dev/null +++ b/design-docs/adr/0015-balanced-toplevel-lifecycle-and-trace-gating.md @@ -0,0 +1,65 @@ +# ADR 0015: Balanced Toplevel Lifecycle and Unified Trace Gating + +- **Status:** Proposed +- **Date:** 2025-03-21 +- **Deciders:** codetracer recorder maintainers +- **Consulted:** DX tooling stakeholders, runtime tracing SMEs +- **Informed:** Support engineering, product analytics, replay consumers + +## Context +- The recorder seeds every trace with a synthetic `` call when `TraceWriter::start` is invoked from `TraceOutputPaths::configure_writer` (`codetracer-python-recorder/src/runtime/output_paths.rs`). That call models the Python process entrypoint but the runtime never emits the matching return edge. +- CLI and API entrypoints (`codetracer_python_recorder/cli.py`, `codetracer_python_recorder/session.py`) already capture the script's exit status, yet the Rust runtime is oblivious to it, so the trace file looks like the script is still running when recording ends. +- Runtime gating currently combines two orthogonal systems: the legacy activation controller (`codetracer-python-recorder/src/runtime/activation.rs`) that defers tracing until a configured file executes, and the newer `TraceFilterEngine` (`codetracer-python-recorder/src/runtime/tracer/filtering.rs`) that offers scope-level allow/deny decisions. Both mechanisms decide whether an event should be written, but they execute independently and cache their own state. +- Because activation and filtering have separate caches and lifecycle hooks, downstream policies (value capture, IO flushes) see inconsistent state: a filter-suppressed frame still triggers activation bookkeeping, and activation suspensions do not propagate into the filter cache. The split also makes it hard to reason about which events will be recorded in the presence of chained filters. + +## Problem +- **Unbalanced traces:** Without a `` return, consumers reconstructing the call stack see a dangling activation at depth 0. This breaks invariants in `TraceWriter::finish_writing_trace_events`, forces replay tools to special-case the synthetic frame, and hides the script's exit code even though callers already have it. +- **Duplicated gating logic:** Activation and filter decisions contradict one another in edge cases. For example: + - When activation gates tracing until a file runs, the filter still caches a `TraceDecision::Trace` for the same code object, so subsequent resumes bypass activation because the filter short-circuits to "disable location". + - When filters skip a frame, activation has no way to learn that the frame completed; its suspended/completed bookkeeping only triggers on return events that never fire for filter-disabled frames. +- The divergent implementations increase bug surface area (e.g., dangling activations, stale filter caches) and make it challenging to add new recorder policies that need a consistent view of "is this event observable?". + +## Decision +1. **Emit a `` return carrying process exit status.** + - Extend the PyO3 surface so `stop_tracing` accepts an optional integer exit code (default `None`). The Python helpers (`session.stop`, CLI `main`) will pass the script's final status. + - Add a `TraceSessionGuard` helper on the Rust side that stores the provided exit status until `RuntimeTracer::finish` runs. When `finish` executes, it must: + - Flush pending IO. + - Record the exit status via `TraceWriter::register_return`, tagging the payload as `` when the code is unknown (e.g., interpreter crash) and serialising the integer otherwise. + - Only then call the existing `finalise`/`cleanup` routines. + - If tracing aborts early (`OnRecorderError::Disable` or fatal errors), emit a `` return with a synthetic reason (`""` / captured exception) so the stack always balances. +2. **Unify activation and filter decisions behind a shared gate.** + - Introduce a `TraceGate` service managed by `RuntimeTracer`. The gate combines activation state and filter results into a single `GateDecision { process_event, disable_location, activation_event }`. + - `FilterCoordinator` becomes responsible for caching scope resolutions only when `TraceGate` reports that the frame was actually processed. When the gate denies an event, both activation and filter caches are notified so they can mark the code id as "ignored" in lockstep. + - `ActivationController` exposes explicit transitions (`on_enter`, `on_suspend`, `on_exit`) rather than letting callbacks poke its internal flags. `TraceGate` translates filter outcomes into the appropriate activation transition (e.g., a filter skip counts as `on_exit` so suspended activations resume correctly). + - All tracer callbacks (`on_py_start`, `on_py_return`, `on_py_yield`, etc.) ask the gate for a decision before doing work. They honour `disable_location` uniformly, so CPython stops invoking us for code objects that either filter or activation wants to suppress. + - Document the merged semantics: activation remains a coarse gate (enabling/disabling the root frame), filters apply fine-grained scope policies, and both share a single cache lifetime tied to tracer flush/reset. +3. **Update metadata and tooling expectations.** + - Persist the recorded exit status into trace metadata (`trace_metadata.json`) so downstream tools can rely on it without scanning events. + - Update docs and integration tests to assert that traces end at stack depth zero, even when activation suspends/resumes or filters drop frames. + +## Consequences +- **Benefits** + - Trace consumers no longer see dangling `` activations, and they can surface the script exit status directly from the trace file. + - Activation, filtering, value capture, and IO policies share a single gating decision, reducing state divergence and simplifying future features (e.g., per-filter activation windows). + - Error paths become easier to reason about because every exit funnels through the same `` return emission. +- **Costs** + - API changes propagate through the Python bindings (`stop_tracing`, `TraceSession.stop`, CLI), requiring coordination with users embedding the recorder programmatically. + - The gate abstraction adds code churn in `runtime/tracer/events.rs` and related helpers as callbacks adopt the new decision API. + - Metadata writers must update to include the exit status. +- **Risks** + - Forgetting to pass the exit status from bespoke integrations (custom Python entrypoints) would regress behaviour back to "unknown exit". We mitigate this with a backwards-compatible default (`None` translates to `` exit) and clear release notes. + - A buggy gate implementation could over-disable callbacks, suppressing legitimate trace data. We will add regression tests covering activation+filter combinations (activation path inside a skipped scope, resumed generators, etc.) before rollout. + - The PyO3 signature change may break ABI expectations if not versioned carefully. We will bump the crate minor version and document the new keyword argument. + +## Alternatives +- **Emit the `` return entirely on the Python side.** Rejected because it would duplicate writer logic in Python, bypass IO flush/value capture, and fail when users call the PyO3 API directly from Rust. +- **Keep activation and filter gating separate but document the quirks.** Rejected: we already hit real bugs (unbalanced traces, stale caches), and layering more documentation will not solve the underlying inconsistency. +- **Deprecate activation now that filters exist.** Rejected because activation provides a simple UX for "start tracing when my script begins", which filters alone cannot replace without writing bespoke configs. + +## References +- `codetracer-python-recorder/src/runtime/output_paths.rs` +- `codetracer-python-recorder/src/runtime/tracer/events.rs` +- `codetracer-python-recorder/src/runtime/activation.rs` +- `codetracer-python-recorder/src/runtime/tracer/filtering.rs` +- `codetracer_python_recorder/session.py` +- `codetracer_python_recorder/cli.py` diff --git a/design-docs/toplevel-exit-and-trace-gating-implementation-plan.md b/design-docs/toplevel-exit-and-trace-gating-implementation-plan.md new file mode 100644 index 0000000..aa73bd6 --- /dev/null +++ b/design-docs/toplevel-exit-and-trace-gating-implementation-plan.md @@ -0,0 +1,93 @@ +# Toplevel Exit & Trace Gating – Implementation Plan + +Plan owners: codetracer recorder maintainers +Target ADR: 0015 – Balanced Toplevel Lifecycle and Unified Trace Gating +Impacted components: +- `codetracer-python-recorder/src/session.rs` and `codetracer_python_recorder/session.py` +- `codetracer-python-recorder/src/runtime/tracer` (events, lifecycle, filtering) +- `codetracer-python-recorder/src/runtime/activation.rs` +- `codetracer-python-recorder/src/runtime/output_paths.rs` and metadata helpers +- `codetracer-pure-python-recorder` parity shims (optional but strongly recommended) + +## Goals +- Always emit a `` return event whose payload reflects the process exit status (or a descriptive placeholder when unavailable). +- Plumb exit codes from Python entrypoints through the PyO3 API into the Rust runtime without breaking existing integrations. +- Replace the ad-hoc combination of activation and filter decisions with a single gate so callbacks make consistent trace/skip/disable choices. +- Keep lifecycle bookkeeping (IO flush, value capture, activation teardown) in sync with the unified gate and the new exit record. +- Extend metadata (`trace_metadata.json`) with the recorded exit status for downstream tooling. + +## Non-Goals +- No changes to the on-disk trace schema beyond the new return record payload; we keep the existing call/return/line structure. +- No removal of activation support; the work only refactors it to cooperate with filters. +- No immediate addition of exit-status reporting to the CLI JSON trailers (can be follow-up). +- No attempt to refit the pure-Python recorder in the same PR; it may gain parity later but does not block landing the Rust changes. + +## Current Gaps +- `stop_tracing` (PyO3) accepts no arguments, so the runtime never learns the script exit status captured by `codetracer_python_recorder/cli.py`. +- `RuntimeTracer::finish` only finalises writers; it does not record any return edge for the synthetic `` call emitted in `TraceWriter::start`. +- Activation and filtering are checked independently inside each callback (`on_py_start`, `on_py_return`, etc.), leading to divergent cache state (`ActivationController::suspended` vs `FilterCoordinator::ignored_code_ids`). +- Filter-driven `CallbackOutcome::DisableLocation` does not inform the activation controller, so the activation window can remain "active" after CPython stops issuing callbacks. +- Metadata writers do not persist exit status, and tests assume partial stacks are acceptable. + +## Workstreams + +### WS1 – Public API & Session Plumbing +**Scope:** Carry exit status from Python to Rust with backwards-compatible defaults. +- Update `codetracer-python-recorder/src/session.rs::stop_tracing` to expose an optional `exit_code: Option` parameter (new keyword-only arg in Python). +- Adjust `codetracer_python_recorder/session.py` so `TraceSession.stop` and the module-level `stop()` accept an optional exit code and forward it. +- Modify `codetracer_python_recorder/cli.py::main` to pass the captured `exit_code` when stopping the session; preserve legacy behaviour (`None`) for callers that do not provide a code. +- Add unit tests in `codetracer_python_recorder/tests` ensuring the new keyword argument is optional and that `stop(exit_code=123)` calls into the backend with the expected value (mocking PyO3 layer). + +### WS2 – Runtime Exit State & `` Return Emission +**Scope:** Store the exit status and emit the balancing return event. +- Introduce a small struct (e.g., `SessionTermination`) inside `RuntimeTracer` to hold `exit_code: Option` plus a `reason: ExitReason`. +- Extend the `Tracer` trait implementation with a new method (e.g., `set_exit_status`) or reuse `notify_failure` paths to capture both normal exit and disable scenarios. +- In `RuntimeTracer::finish`, before `finalise`: + - Call `record_return_value` with the exit payload. + - Invoke `TraceWriter::register_return` / `mark_event`. + - Ensure activation receives an `ActivationExitKind::Completed` for the toplevel code id. +- Emit synthetic reasons (`""`, `""`, etc.) when tracing stops due to errors. Reuse existing error-path metadata in `notify_failure` to populate the reason. +- Update integration tests (Rust + Python) to assert that the final event sequence includes a `` call + return pair and that the return payload matches the script exit code. + +### WS3 – Unified Trace Gate Abstraction +**Scope:** Merge activation and filter decision paths. +- Create `TraceGate` and `GateDecision` types under `runtime/tracer`. + - API: `evaluate(py, code, event_kind) -> GateDecision`. + - Decision carries: `process_event` (bool), `disable_location` (bool), `activation_transition` (enum). +- Refactor `FilterCoordinator` so it exposes `decide_for_gate(py, code)` returning an enriched result (trace/skip plus cached scope resolution). The coordinator only updates caches when the gate confirms the event was processed. +- Update `ActivationController` with methods `on_enter`, `on_suspend`, `on_exit`, and `reset`. Remove direct field mutations from callbacks. +- Rewrite tracer callbacks (`on_py_start`, `on_py_return`, `on_py_resume`, etc.) to: + 1. Ask the gate for a decision. + 2. Exit early when `process_event == false` and return the proper `CallbackOutcome`. + 3. After recording the event, invoke any activation transition included in the decision. +- Ensure `CallbackOutcome::DisableLocation` is only returned once per code id and that both activation and filter caches mark the frame as ignored thereafter. +- Unit test the gate with synthetic `CodeObjectWrapper` fixtures covering combinations: inactive activation + allow filter, active activation + skip filter, suspended activation resumed, etc. + +### WS4 – Lifecycle & Metadata Updates +**Scope:** Keep writer lifecycle and metadata consistent with the new behaviour. +- Update `LifecycleController::finalise` (or adjacent helper) to write the exit status into `trace_metadata.json` under a new field (e.g., `"process_exit_code"`). Ensure this runs only once per session. +- Confirm `cleanup_partial_outputs` still executes when tracing disables early and that the exit record is only written for successful sessions. +- Add a regression test around `TraceOutputPaths::configure_writer` + `RuntimeTracer::finish` verifying the events buffer contains both call and return entries for ``. +- Update documentation (`design-docs/design-001.md`, user docs) to describe the exit-status metadata and the unified gating semantics. + +### WS5 – Validation & Parity Follow-Up +**Scope:** Prove end-to-end correctness and plan the optional pure-Python update. +- Extend Python integration tests (`tests/python`) with scenarios: + - CLI run that exits with non-zero status; assert trace contains `` return with the negative path. + - Activation path configured alongside a filter that skips the same file; ensure tracing starts/stops exactly once and stack depth ends at zero. + - Generator/coroutine workloads guarded by activation; confirm gate decisions do not regress existing balanced-call tests. +- Run `just test` to cover all tests after refactors. +- Document a follow-up issue to mirror `` return emission in `codetracer-pure-python-recorder`, keeping trace semantics aligned across products. + +## Testing & Rollout Checklist +- [ ] `just test` +- [ ] Python integration tests covering exit-code propagation and activation+filter combinations +- [ ] Manual smoke test: run CLI against a script returning exit code 3, inspect `trace.json` for `` return payload `3` +- [ ] Update changelog / release notes highlighting the new API parameter and exit-status metadata +- [ ] Notify downstream data pipeline owners that exit status is now available + +## Risks & Mitigations +- **Breaking API changes:** Ensure `stop_tracing` still works without arguments by providing a Python default (`exit_code: int | None = None`) and by releasing under a minor version bump. +- **Gate regressions:** Add exhaustive unit tests plus targeted integration tests so we catch scenarios where activation or filters no longer fire. +- **Performance impact:** Benchmark tracing hot paths after the refactor; the gate should add minimal overhead. Profile with `just bench` / existing benchmarks, and roll back micro-optimisations if regressions exceed 5%. +- **Incomplete error coverage:** Make sure disable/error paths still flush IO and write metadata. Write explicit tests that trigger `OnRecorderError::Disable` to observe the synthetic `` return reason. diff --git a/design-docs/toplevel-exit-and-trace-gating-implementation-plan.status.md b/design-docs/toplevel-exit-and-trace-gating-implementation-plan.status.md new file mode 100644 index 0000000..6886e51 --- /dev/null +++ b/design-docs/toplevel-exit-and-trace-gating-implementation-plan.status.md @@ -0,0 +1,50 @@ +# Toplevel Exit & Trace Gating – Status + +## Relevant Design Docs +- `design-docs/adr/0015-balanced-toplevel-lifecycle-and-trace-gating.md` +- `design-docs/toplevel-exit-and-trace-gating-implementation-plan.md` + +## Key Source Files +- `codetracer-python-recorder/src/session.rs` +- `codetracer_python_recorder/session.py` +- `codetracer_python_recorder/cli.py` +- `codetracer-python-recorder/src/monitoring/install.rs` +- `codetracer-python-recorder/src/monitoring/mod.rs` +- `codetracer-python-recorder/tests/python` (planned additions) + +## Workstream Progress + +### WS1 – Public API & Session Plumbing +- **Scope recap:** Teach `stop_tracing` to accept an optional exit code and propagate it through the Python helpers and CLI while keeping backwards compatibility. +- **Status:** _Completed_ + - `stop_tracing` now accepts an optional `exit_code` argument, and the Python session helpers/CLI forward the value. + - Added unit coverage ensuring `stop(exit_code=...)` forwards the value downstream while `stop()` remains valid. + - Verification: `just dev` (editable build with `integration-test` feature) and `just py-test` (Python suites across both recorders) pass. + +### WS2 – Runtime Exit State & `` Return Emission +- **Status:** _Completed_ + - `RuntimeTracer` now tracks a `SessionExitState`, emits the `` return during `finish`, and differentiates between explicit exit codes, default exits, and policy-driven disables. + - Added trait plumbing (`Tracer::set_exit_status`) plus installer wiring so `stop_tracing` can forward the exit code before teardown. + - Verification: `just cargo-test` (workspace) and `just py-test` exercises the new Rust test (`finish_emits_toplevel_return_with_exit_code`) and Python integration tests (`test_exit_payloads`). + +### WS3 – Unified Trace Gate Abstraction +- **Status:** _Completed_ + - Added a single `evaluate_gate` helper so callbacks defer to shared activation+filter logic, ensuring consistent disable decisions and reducing duplicated guards. + - All tracer callbacks now consult the helper; return handlers pass through the same decision pipeline before emitting events. + - Verification: `just cargo-test` and `just py-test`. + +### WS4 – Lifecycle & Metadata Updates +- **Status:** _Completed_ + - Metadata writer now records `process_exit_status` alongside filter info, and runtime emits a `` return before finalisation. + - Added regression coverage in Python for both exit code and default placeholder metadata values. + - Verification: `just cargo-test`, `just py-test`. + +### WS5 – Validation & Parity Follow-Up +- **Status:** _Completed_ +- Added explicit tests verifying exit metadata plus activation/filter interplay keeps the trace balanced (`test_exit_payloads`, `test_trace_balance`). +- Documented follow-up to mirror exit-status support in the pure-Python recorder before release. +- Verification: `just cargo-test`, `just py-test`. + +## Notes +- API changes will require a minor version bump once runtime support lands; capture release planning tasks after WS2. +- Remember to bootstrap the dev build (`just dev`) before Python suites so integration-test hooks stay enabled.