diff --git a/.gitignore b/.gitignore index de5bba0..71629a1 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ **/target/ build *~ +.idea/ \ No newline at end of file diff --git a/codetracer-python-recorder/Cargo.lock b/codetracer-python-recorder/Cargo.lock index 124ac77..755285a 100644 --- a/codetracer-python-recorder/Cargo.lock +++ b/codetracer-python-recorder/Cargo.lock @@ -2,6 +2,65 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + 
[[package]] name = "autocfg" version = "1.5.0" @@ -70,11 +129,19 @@ version = "0.1.0" dependencies = [ "bitflags", "dashmap", + "env_logger", + "log", "once_cell", "pyo3", "runtime_tracing", ] +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + [[package]] name = "dashmap" version = "5.5.3" @@ -94,6 +161,29 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + [[package]] name = "fscommon" version = "0.1.1" @@ -133,12 +223,42 @@ version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itoa" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jiff" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +dependencies = [ + 
"jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "jobserver" version = "0.1.33" @@ -212,6 +332,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + [[package]] name = "parking_lot_core" version = "0.9.11" @@ -222,7 +348,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -237,6 +363,15 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "proc-macro2" version = "1.0.97" @@ -332,6 +467,35 @@ dependencies = [ "bitflags", ] +[[package]] +name = "regex" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" + [[package]] name = "runtime_tracing" version = "0.14.0" @@ -447,6 +611,12 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -456,20 +626,52 @@ dependencies = [ "wit-bindgen-rt", ] +[[package]] +name = "windows-link" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.3", +] + [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + 
"windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -478,48 +680,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = 
"windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "wit-bindgen-rt" version = "0.39.0" diff --git a/codetracer-python-recorder/Cargo.toml b/codetracer-python-recorder/Cargo.toml index 
61bb5a9..76ea493 100644 --- a/codetracer-python-recorder/Cargo.toml +++ b/codetracer-python-recorder/Cargo.toml @@ -20,6 +20,8 @@ runtime_tracing = "0.14.0" bitflags = "2.4" once_cell = "1.19" dashmap = "5.5" +log = "0.4" +env_logger = "0.11" [dev-dependencies] pyo3 = { version = "0.25.1", features = ["auto-initialize"] } diff --git a/codetracer-python-recorder/codetracer_python_recorder/__init__.py b/codetracer-python-recorder/codetracer_python_recorder/__init__.py index c4ea7f1..600e890 100644 --- a/codetracer-python-recorder/codetracer_python_recorder/__init__.py +++ b/codetracer-python-recorder/codetracer_python_recorder/__init__.py @@ -7,7 +7,7 @@ maintains placeholder state and performs no actual tracing. """ -from .api import * - -__all__ = api.__all__ +from . import api as _api +from .api import * # re-export public API symbols +__all__ = _api.__all__ diff --git a/codetracer-python-recorder/codetracer_python_recorder/__main__.py b/codetracer-python-recorder/codetracer_python_recorder/__main__.py new file mode 100644 index 0000000..9700ebf --- /dev/null +++ b/codetracer-python-recorder/codetracer_python_recorder/__main__.py @@ -0,0 +1,90 @@ +"""CLI to record a trace while running a Python script. + +Usage: + python -m codetracer_python_recorder [codetracer options] <script.py> [script args...] + +Codetracer options (must appear before the script path): + --codetracer-trace PATH Output events file (default: trace.bin or trace.json) + --codetracer-format {binary,json} Output format (default: binary) + --codetracer-capture-values BOOL Whether to capture values (default: true; NOTE: not currently registered with the argument parser) + +Examples: + python -m codetracer_python_recorder --codetracer-format=json app.py --flag=1 + python -m codetracer_python_recorder --codetracer-trace=out.bin script.py --x=2 + python -m codetracer_python_recorder --codetracer-capture-values=false script.py +""" +from __future__ import annotations + +import runpy +import sys +from pathlib import Path + +from . 
import DEFAULT_FORMAT, start, stop +import argparse + + +def _default_trace_path(fmt: str) -> Path: + # Keep a simple filename; Rust side derives sidecars (metadata/paths) + if fmt == "json": + return Path.cwd() / "trace.json" + return Path.cwd() / "trace.bin" + + +def main(argv: list[str] | None = None) -> int: + if argv is None: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser(add_help=True) + parser.add_argument( + "--codetracer-trace", + dest="trace", + default=None, + help="Path to trace folder. If omitted, defaults to trace.bin or trace.json in the current directory based on --codetracer-format.", + ) + parser.add_argument( + "--codetracer-format", + dest="format", + choices=["binary", "json"], + default=DEFAULT_FORMAT, + help="Output format for trace events. 'binary' is compact; 'json' is human-readable. Default: %(default)s.", + ) + # Only parse our options; leave script and script args in unknown + ns, unknown = parser.parse_known_args(argv) + + # Validate that the first unknown token is a script path; otherwise show usage. + if not unknown or not Path(unknown[0]).exists(): + sys.stderr.write("Usage: python -m codetracer_python_recorder [codetracer options] [args...]\n") + return 2 + + script_path = Path(unknown[0]) + script_args = unknown[1:] + + fmt = ns.format or DEFAULT_FORMAT + trace_path = Path(ns.trace) if ns.trace else _default_trace_path(fmt) + + old_argv = sys.argv + sys.argv = [str(script_path)] + script_args + # Activate tracing only after entering the target script file. 
+ session = start( + trace_path, + format=fmt, + start_on_enter=script_path, + ) + try: + runpy.run_path(str(script_path.resolve()), run_name="__main__") + return 0 + except SystemExit as e: + # Preserve script's exit code + code = e.code if isinstance(e.code, int) else 1 + return code + finally: + # Ensure tracer stops and files are flushed + try: + session.flush() + finally: + stop() + sys.argv = old_argv + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/codetracer-python-recorder/codetracer_python_recorder/api.py b/codetracer-python-recorder/codetracer_python_recorder/api.py index 1b598e4..c8fca0b 100644 --- a/codetracer-python-recorder/codetracer_python_recorder/api.py +++ b/codetracer-python-recorder/codetracer_python_recorder/api.py @@ -2,16 +2,15 @@ This module exposes a minimal interface for starting and stopping runtime traces. The heavy lifting is delegated to the -`codetracer_python_recorder` Rust extension which will eventually hook -into `runtime_tracing` and `sys.monitoring`. For now the Rust side only -maintains placeholder state and performs no actual tracing. +`codetracer_python_recorder` Rust extension which hooks +into `runtime_tracing` and `sys.monitoring`. 
""" from __future__ import annotations import contextlib import os from pathlib import Path -from typing import Iterable, Iterator, Optional +from typing import Iterator, Optional from .codetracer_python_recorder import ( flush_tracing as _flush_backend, @@ -27,31 +26,34 @@ _active_session: Optional["TraceSession"] = None -def _normalize_source_roots(source_roots: Iterable[os.PathLike | str] | None) -> Optional[list[str]]: - if source_roots is None: - return None - return [str(Path(p)) for p in source_roots] - - def start( path: os.PathLike | str, *, format: str = DEFAULT_FORMAT, - capture_values: bool = True, - source_roots: Iterable[os.PathLike | str] | None = None, + start_on_enter: os.PathLike | str | None = None, ) -> "TraceSession": """Start a global trace session. - Parameters mirror the design document. The current implementation - merely records the active state on the Rust side and performs no - tracing. + - ``path``: Target directory where trace files will be written. + Files created: ``trace.json``/``trace.bin``, ``trace_metadata.json``, ``trace_paths.json``. + - ``format``: Either ``binary`` or ``json`` (controls events file name/format). + - ``start_on_enter``: Optional file path; when provided, tracing remains + paused until the tracer observes execution entering this file. Useful to + avoid recording interpreter and import startup noise when launching a + script via the CLI. + + The current implementation records trace data through a Rust backend. 
""" global _active_session if _is_tracing_backend(): raise RuntimeError("tracing already active") trace_path = Path(path) - _start_backend(str(trace_path), format, capture_values, _normalize_source_roots(source_roots)) + _start_backend( + str(trace_path), + format, + str(Path(start_on_enter)) if start_on_enter is not None else None, + ) session = TraceSession(path=trace_path, format=format) _active_session = session return session @@ -86,15 +88,11 @@ def trace( path: os.PathLike | str, *, format: str = DEFAULT_FORMAT, - capture_values: bool = True, - source_roots: Iterable[os.PathLike | str] | None = None, ) -> Iterator["TraceSession"]: """Context manager helper for scoped tracing.""" session = start( path, format=format, - capture_values=capture_values, - source_roots=source_roots, ) try: yield session @@ -133,11 +131,7 @@ def _auto_start_from_env() -> None: if not path: return fmt = os.getenv("CODETRACER_FORMAT", DEFAULT_FORMAT) - capture_env = os.getenv("CODETRACER_CAPTURE_VALUES") - capture = True - if capture_env is not None: - capture = capture_env.lower() not in {"0", "false", "no"} - start(path, format=fmt, capture_values=capture) + start(path, format=fmt) _auto_start_from_env() diff --git a/codetracer-python-recorder/src/lib.rs b/codetracer-python-recorder/src/lib.rs index e7d9b99..3120a77 100644 --- a/codetracer-python-recorder/src/lib.rs +++ b/codetracer-python-recorder/src/lib.rs @@ -1,36 +1,127 @@ +use std::fs; +use std::path::{PathBuf, Path}; use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Once; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; +use std::fmt; pub mod code_object; pub mod tracer; +mod runtime_tracer; pub use crate::code_object::{CodeObjectRegistry, CodeObjectWrapper}; pub use crate::tracer::{install_tracer, uninstall_tracer, EventSet, Tracer}; /// Global flag tracking whether tracing is active. static ACTIVE: AtomicBool = AtomicBool::new(false); -/// Start tracing. 
Placeholder implementation that simply flips the -/// global active flag and ignores all parameters. +// Initialize Rust logging once per process. Defaults to debug for this crate +// unless overridden by RUST_LOG. This helps surface debug! output during dev. +static INIT_LOGGER: Once = Once::new(); + +fn init_rust_logging_with_default(default_filter: &str) { + INIT_LOGGER.call_once(|| { + let env = env_logger::Env::default().default_filter_or(default_filter); + // Use a compact format with timestamps and targets to aid debugging. + let mut builder = env_logger::Builder::from_env(env); + builder + .format_timestamp_micros() + .format_target(true); + let _ = builder.try_init(); + }); +} + +/// Start tracing using sys.monitoring and runtime_tracing writer. #[pyfunction] fn start_tracing( - _path: &str, - _format: &str, - _capture_values: bool, - _source_roots: Option>, + path: &str, + format: &str, + activation_path: Option<&str>, ) -> PyResult<()> { - if ACTIVE.swap(true, Ordering::SeqCst) { + // Ensure logging is ready before any tracer logs might be emitted. + // Default only our crate to debug to avoid excessive verbosity from deps. + init_rust_logging_with_default("codetracer_python_recorder=debug"); + if ACTIVE.load(Ordering::SeqCst) { return Err(PyRuntimeError::new_err("tracing already active")); } - Ok(()) + + // Interpret `path` as a directory where trace files will be written. + let out_dir = Path::new(path); + if out_dir.exists() && !out_dir.is_dir() { + return Err(PyRuntimeError::new_err("trace path exists and is not a directory")); + } + if !out_dir.exists() { + // Best-effort create the directory tree + fs::create_dir_all(&out_dir) + .map_err(|e| PyRuntimeError::new_err(format!("failed to create trace directory: {}", e)))?; + } + + // Map format string to enum + let fmt = match format.to_lowercase().as_str() { + "json" => runtime_tracing::TraceEventsFileFormat::Json, + // Use BinaryV0 for "binary" to avoid streaming writer here. 
+ "binary" | "binaryv0" | "binary_v0" | "b0" => runtime_tracing::TraceEventsFileFormat::BinaryV0, + //TODO AI! We need to assert! that the format is among the known values. + other => { + eprintln!("Unknown format '{}', defaulting to binary (v0)", other); + runtime_tracing::TraceEventsFileFormat::BinaryV0 + } + }; + + // Build output file paths inside the directory. + let (events_path, meta_path, paths_path) = match fmt { + runtime_tracing::TraceEventsFileFormat::Json => ( + out_dir.join("trace.json"), + out_dir.join("trace_metadata.json"), + out_dir.join("trace_paths.json"), + ), + _ => ( + out_dir.join("trace.bin"), + out_dir.join("trace_metadata.json"), + out_dir.join("trace_paths.json"), + ), + }; + + // Activation path: when set, tracing starts only after entering it. + let activation_path = activation_path.map(|s| Path::new(s)); + + Python::with_gil(|py| { + // Program and args: keep minimal; Python-side API stores full session info if needed + let sys = py.import("sys")?; + let argv = sys.getattr("argv")?; + let program: String = argv + .get_item(0)? + .extract::()?; + //TODO: Error-handling. What to do if argv is empty? Does this ever happen? + + let mut tracer = runtime_tracer::RuntimeTracer::new( + &program, + &[], + fmt, + activation_path, + ); + + // Start location: prefer activation path, otherwise best-effort argv[0] + let start_path: &Path = activation_path.unwrap_or(Path::new(&program)); + tracer.begin(&meta_path, &paths_path, &events_path, start_path, 1)?; + + // Install callbacks + install_tracer(py, Box::new(tracer))?; + ACTIVE.store(true, Ordering::SeqCst); + Ok(()) + }) } /// Stop tracing by resetting the global flag. #[pyfunction] fn stop_tracing() -> PyResult<()> { - ACTIVE.store(false, Ordering::SeqCst); - Ok(()) + Python::with_gil(|py| { + // Uninstall triggers finish() on tracer implementation. + uninstall_tracer(py)?; + ACTIVE.store(false, Ordering::SeqCst); + Ok(()) + }) } /// Query whether tracing is currently active. 
@@ -39,15 +130,18 @@ fn is_tracing() -> PyResult { Ok(ACTIVE.load(Ordering::SeqCst)) } -/// Flush buffered trace data. No-op placeholder for now. +/// Flush buffered trace data (best-effort, non-streaming formats only). #[pyfunction] fn flush_tracing() -> PyResult<()> { - Ok(()) + Python::with_gil(|py| crate::tracer::flush_installed_tracer(py)) } /// Python module definition. #[pymodule] fn codetracer_python_recorder(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + // Initialize logging on import so users see logs without extra setup. + // Respect RUST_LOG if present; otherwise default to debug for this crate. + init_rust_logging_with_default("codetracer_python_recorder=debug"); m.add_function(wrap_pyfunction!(start_tracing, m)?)?; m.add_function(wrap_pyfunction!(stop_tracing, m)?)?; m.add_function(wrap_pyfunction!(is_tracing, m)?)?; diff --git a/codetracer-python-recorder/src/runtime_tracer.rs b/codetracer-python-recorder/src/runtime_tracer.rs new file mode 100644 index 0000000..7a18cca --- /dev/null +++ b/codetracer-python-recorder/src/runtime_tracer.rs @@ -0,0 +1,208 @@ +use std::path::{Path, PathBuf}; + +use pyo3::prelude::*; +use pyo3::types::PyAny; + +use runtime_tracing::{Line, TraceEventsFileFormat, TraceWriter, TypeKind, ValueRecord, NONE_VALUE}; +use runtime_tracing::NonStreamingTraceWriter; + +use crate::code_object::CodeObjectWrapper; +use crate::tracer::{events_union, EventSet, MonitoringEvents, Tracer}; + +// Logging is handled via the `log` crate macros (e.g., log::debug!). + +/// Minimal runtime tracer that maps Python sys.monitoring events to +/// runtime_tracing writer operations. +pub struct RuntimeTracer { + writer: NonStreamingTraceWriter, + format: TraceEventsFileFormat, + // Activation control: when set, events are ignored until we see + // a code object whose filename matches this path. Once triggered, + // tracing becomes active for the remainder of the session. 
+ activation_path: Option, + // Code object id that triggered activation, used to stop on return + activation_code_id: Option, + // Whether we've already completed a one-shot activation window + activation_done: bool, + started: bool, +} + +impl RuntimeTracer { + pub fn new( + program: &str, + args: &[String], + format: TraceEventsFileFormat, + activation_path: Option<&Path>, + ) -> Self { + let mut writer = NonStreamingTraceWriter::new(program, args); + writer.set_format(format); + let activation_path = activation_path.map(|p| std::path::absolute(p).unwrap()); + // If activation path is specified, start in paused mode; otherwise start immediately. + let started = activation_path.is_none(); + Self { + writer, + format, + activation_path, + activation_code_id: None, + activation_done: false, + started, + } + } + + /// Configure output files and write initial metadata records. + pub fn begin(&mut self, meta_path: &Path, paths_path: &Path, events_path: &Path, start_path: &Path, start_line: u32) -> PyResult<()> { + TraceWriter::begin_writing_trace_metadata(&mut self.writer, meta_path).map_err(to_py_err)?; + TraceWriter::begin_writing_trace_paths(&mut self.writer, paths_path).map_err(to_py_err)?; + TraceWriter::begin_writing_trace_events(&mut self.writer, events_path).map_err(to_py_err)?; + TraceWriter::start(&mut self.writer, start_path, Line(start_line as i64)); + Ok(()) + } + + /// Activate tracing if it is pending; `started` may become true on first event + /// from the activation file if configured. Returns nothing. + fn ensure_started<'py>(&mut self, py: Python<'py>, code: &CodeObjectWrapper) { + if self.started || self.activation_done { return; } + if let Some(activation) = &self.activation_path { + if let Ok(filename) = code.filename(py) { + let f = Path::new(filename); + //NOTE(Tzanko): We expect that code.filename contains an absolute path. If it turns out that this is sometimes not the case + //we will investigate. For now we won't do additional conversions here. 
+ // If there are issues the fool-proof solution is to use fs::canonicalize which needs to do syscalls + if f == activation { + self.started = true; + self.activation_code_id = Some(code.id()); + log::debug!("[RuntimeTracer] activated on enter: {}", activation.display()); + } + } + } + } + + fn encode_value<'py>(&mut self, _py: Python<'py>, v: &Bound<'py, PyAny>) -> ValueRecord { + // None + if v.is_none() { + return NONE_VALUE; + } + // bool must be checked before int in Python + if let Ok(b) = v.extract::() { + let ty = TraceWriter::ensure_type_id(&mut self.writer, TypeKind::Bool, "Bool"); + return ValueRecord::Bool { b, type_id: ty }; + } + if let Ok(i) = v.extract::() { + let ty = TraceWriter::ensure_type_id(&mut self.writer, TypeKind::Int, "Int"); + return ValueRecord::Int { i, type_id: ty }; + } + if let Ok(s) = v.extract::() { + let ty = TraceWriter::ensure_type_id(&mut self.writer, TypeKind::String, "String"); + return ValueRecord::String { text: s, type_id: ty }; + } + + // Fallback to Raw string representation + let ty = TraceWriter::ensure_type_id(&mut self.writer, TypeKind::Raw, "Object"); + match v.str() { + Ok(s) => ValueRecord::Raw { r: s.to_string_lossy().into_owned(), type_id: ty }, + Err(_) => ValueRecord::Error { msg: "".to_string(), type_id: ty }, + } + } + + fn ensure_function_id(&mut self, py: Python<'_>, code: &CodeObjectWrapper) -> PyResult { + //TODO AI! current runtime_tracer logic expects that `name` is unique and is used as a key for the function. + //This is wrong. 
We need to write a test that exposes this issue + let name = code.qualname(py)?; + let filename = code.filename(py)?; + let first_line = code.first_line(py)?; + Ok(TraceWriter::ensure_function_id(&mut self.writer, name, Path::new(filename), Line(first_line as i64))) + } +} + +fn to_py_err(e: Box) -> pyo3::PyErr { + pyo3::exceptions::PyRuntimeError::new_err(e.to_string()) +} + +impl Tracer for RuntimeTracer { + fn interest(&self, events: &MonitoringEvents) -> EventSet { + // Minimal set: function start, step lines, and returns + events_union(&[events.PY_START, events.LINE, events.PY_RETURN]) + } + + fn on_py_start(&mut self, py: Python<'_>, code: &CodeObjectWrapper, _offset: i32) { + // Activate lazily if configured; ignore until then + self.ensure_started(py, code); + if !self.started { return; } + // Trace event entry + match (code.filename(py), code.qualname(py)) { + (Ok(fname), Ok(qname)) => { + log::debug!("[RuntimeTracer] on_py_start: {} ({})", qname, fname) + } + _ => log::debug!("[RuntimeTracer] on_py_start"), + } + if let Ok(fid) = self.ensure_function_id(py, code) { + TraceWriter::register_call(&mut self.writer, fid, Vec::new()); + } + } + + fn on_line(&mut self, py: Python<'_>, code: &CodeObjectWrapper, lineno: u32) { + // Activate lazily if configured; ignore until then + self.ensure_started(py, code); + if !self.started { return; } + // Trace event entry + if let Ok(fname) = code.filename(py) { + log::debug!("[RuntimeTracer] on_line: {}:{}", fname, lineno); + } else { + log::debug!("[RuntimeTracer] on_line: :{}", lineno); + } + if let Ok(filename) = code.filename(py) { + TraceWriter::register_step(&mut self.writer, Path::new(filename), Line(lineno as i64)); + } + } + + fn on_py_return( + &mut self, + py: Python<'_>, + code: &CodeObjectWrapper, + _offset: i32, + retval: &Bound<'_, PyAny>, + ) { + // Activate lazily if configured; ignore until then + self.ensure_started(py, code); + if !self.started { return; } + // Trace event entry + match 
(code.filename(py), code.qualname(py)) { + (Ok(fname), Ok(qname)) => log::debug!("[RuntimeTracer] on_py_return: {} ({})", qname, fname), + _ => log::debug!("[RuntimeTracer] on_py_return"), + } + // Determine whether this is the activation owner's return + let is_activation_return = self.activation_code_id.map(|id| id == code.id()).unwrap_or(false); + // Return value is optional per configuration + let val = self.encode_value(py, retval); + TraceWriter::register_return(&mut self.writer, val); + if is_activation_return { + self.started = false; + self.activation_done = true; + log::debug!("[RuntimeTracer] deactivated on activation return"); + } + } + + fn flush(&mut self, _py: Python<'_>) -> PyResult<()> { + // Trace event entry + log::debug!("[RuntimeTracer] flush"); + // For non-streaming formats we can update the events file. + match self.format { + TraceEventsFileFormat::Json | TraceEventsFileFormat::BinaryV0 => { + TraceWriter::finish_writing_trace_events(&mut self.writer).map_err(to_py_err)?; + } + TraceEventsFileFormat::Binary => { + // Streaming writer: no partial flush to avoid closing the stream. 
+ } + } + Ok(()) + } + + fn finish(&mut self, _py: Python<'_>) -> PyResult<()> { + // Trace event entry + log::debug!("[RuntimeTracer] finish"); + TraceWriter::finish_writing_trace_metadata(&mut self.writer).map_err(to_py_err)?; + TraceWriter::finish_writing_trace_paths(&mut self.writer).map_err(to_py_err)?; + TraceWriter::finish_writing_trace_events(&mut self.writer).map_err(to_py_err)?; + Ok(()) + } +} diff --git a/codetracer-python-recorder/src/tracer.rs b/codetracer-python-recorder/src/tracer.rs index e09aaed..387edab 100644 --- a/codetracer-python-recorder/src/tracer.rs +++ b/codetracer-python-recorder/src/tracer.rs @@ -1,3 +1,4 @@ +use std::any::Any; use std::sync::{Mutex, OnceLock}; use pyo3::{ exceptions::PyRuntimeError, @@ -141,7 +142,11 @@ pub fn free_tool_id(py: Python<'_>, tool: &ToolId) -> PyResult<()> { /// Each method corresponds to an event from `sys.monitoring`. Default /// implementations allow implementers to only handle the events they care /// about. -pub trait Tracer: Send { +pub trait Tracer: Send + Any { + /// Downcast support for implementations that need to be accessed + /// behind a `Box` (e.g., for flushing/finishing). + fn as_any(&mut self) -> &mut dyn Any where Self: 'static, Self: Sized { self } + /// Return the set of events the tracer wants to receive. fn interest(&self, _events: &MonitoringEvents) -> EventSet { NO_EVENTS @@ -295,6 +300,12 @@ pub trait Tracer: Send { _arg0: Option<&Bound<'_, PyAny>>, ) { } + + /// Flush any buffered state to storage. Default is a no-op. + fn flush(&mut self, _py: Python<'_>) -> PyResult<()> { Ok(()) } + + /// Finish and close any underlying writers. Default is a no-op. + fn finish(&mut self, _py: Python<'_>) -> PyResult<()> { Ok(()) } } struct Global { @@ -404,7 +415,10 @@ pub fn install_tracer(py: Python<'_>, tracer: Box) -> PyResult<()> { /// Remove the installed tracer if any. 
pub fn uninstall_tracer(py: Python<'_>) -> PyResult<()> { let mut guard = GLOBAL.lock().unwrap(); - if let Some(global) = guard.take() { + if let Some(mut global) = guard.take() { + // Give the tracer a chance to finish underlying writers before + // unregistering callbacks. + let _ = global.tracer.finish(py); let events = monitoring_events(py)?; if global.mask.contains(&events.CALL) { register_callback(py, &global.tool, &events.CALL, None)?; @@ -466,6 +480,14 @@ pub fn uninstall_tracer(py: Python<'_>) -> PyResult<()> { Ok(()) } +/// Flush the currently installed tracer if any. +pub fn flush_installed_tracer(py: Python<'_>) -> PyResult<()> { + if let Some(global) = GLOBAL.lock().unwrap().as_mut() { + global.tracer.flush(py)?; + } + Ok(()) +} + #[pyfunction] fn callback_call( py: Python<'_>, diff --git a/codetracer-python-recorder/test/test_codetracer_api.py b/codetracer-python-recorder/test/test_codetracer_api.py index 353caee..0614873 100644 --- a/codetracer-python-recorder/test/test_codetracer_api.py +++ b/codetracer-python-recorder/test/test_codetracer_api.py @@ -14,11 +14,11 @@ def setUp(self) -> None: # ensure clean state before each test def test_start_stop_and_status(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: - trace_path = Path(tmpdir) / "trace.bin" - session = codetracer.start(trace_path) + trace_dir = Path(tmpdir) + session = codetracer.start(trace_dir) self.assertTrue(codetracer.is_tracing()) self.assertIsInstance(session, codetracer.TraceSession) - self.assertEqual(session.path, trace_path) + self.assertEqual(session.path, trace_dir) self.assertEqual(session.format, codetracer.DEFAULT_FORMAT) codetracer.flush() # should not raise session.flush() # same @@ -27,8 +27,8 @@ def test_start_stop_and_status(self) -> None: def test_context_manager(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: - trace_path = Path(tmpdir) / "trace.bin" - with codetracer.trace(trace_path) as session: + trace_dir = Path(tmpdir) + with 
codetracer.trace(trace_dir) as session: self.assertTrue(codetracer.is_tracing()) self.assertIsInstance(session, codetracer.TraceSession) self.assertFalse(codetracer.is_tracing()) @@ -37,7 +37,7 @@ def test_environment_auto_start(self) -> None: script = "import codetracer_python_recorder as codetracer, sys; sys.stdout.write(str(codetracer.is_tracing()))" with tempfile.TemporaryDirectory() as tmpdir: env = os.environ.copy() - env["CODETRACER_TRACE"] = str(Path(tmpdir) / "trace.bin") + env["CODETRACER_TRACE"] = str(Path(tmpdir)) out = subprocess.check_output([sys.executable, "-c", script], env=env) self.assertEqual(out.decode(), "True") diff --git a/codetracer-python-recorder/test/test_monitoring_events.py b/codetracer-python-recorder/test/test_monitoring_events.py new file mode 100644 index 0000000..ba2e6f4 --- /dev/null +++ b/codetracer-python-recorder/test/test_monitoring_events.py @@ -0,0 +1,127 @@ +import json +import runpy +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import pytest + +import codetracer_python_recorder as codetracer + + +@dataclass +class ParsedTrace: + paths: List[str] + functions: List[Dict[str, Any]] # index is function_id + calls: List[int] # sequence of function_id values + returns: List[Dict[str, Any]] # raw Return payloads (order preserved) + steps: List[Tuple[int, int]] # (path_id, line) + + +def _parse_trace(out_dir: Path) -> ParsedTrace: + events_path = out_dir / "trace.json" + paths_path = out_dir / "trace_paths.json" + + events = json.loads(events_path.read_text()) + paths: List[str] = json.loads(paths_path.read_text()) + + functions: List[Dict[str, Any]] = [] + calls: List[int] = [] + returns: List[Dict[str, Any]] = [] + steps: List[Tuple[int, int]] = [] + + for item in events: + if "Function" in item: + functions.append(item["Function"]) + elif "Call" in item: + calls.append(int(item["Call"]["function_id"])) + elif "Return" in item: + 
returns.append(item["Return"]) # keep raw payload for value checks + elif "Step" in item: + s = item["Step"] + steps.append((int(s["path_id"]), int(s["line"]))) + + return ParsedTrace(paths=paths, functions=functions, calls=calls, returns=returns, steps=steps) + + +def _write_script(tmp: Path) -> Path: + # Keep lines compact and predictable to assert step line numbers + code = ( + "# simple script\n\n" + "def foo():\n" + " x = 1\n" + " y = 2\n" + " return x + y\n\n" + "if __name__ == '__main__':\n" + " r = foo()\n" + " print(r)\n" + ) + p = tmp / "script.py" + p.write_text(code) + return p + + +def test_py_start_line_and_return_events_are_recorded(tmp_path: Path) -> None: + # Arrange: create a script and start tracing with activation restricted to that file + script = _write_script(tmp_path) + out_dir = tmp_path / "trace_out" + out_dir.mkdir() + + session = codetracer.start(out_dir, format=codetracer.TRACE_JSON, start_on_enter=script) + + try: + # Act: execute the script as __main__ under tracing + runpy.run_path(str(script), run_name="__main__") + finally: + # Ensure files are flushed and tracer is stopped even on error + codetracer.flush() + codetracer.stop() + + # Assert: expected files exist and contain valid JSON + assert (out_dir / "trace.json").exists() + assert (out_dir / "trace_metadata.json").exists() + assert (out_dir / "trace_paths.json").exists() + + parsed = _parse_trace(out_dir) + + # The script path must be present (activation gating starts there, but + # other helper modules like codecs may also appear during execution). 
+ assert str(script) in parsed.paths + script_path_id = parsed.paths.index(str(script)) + + # One function named 'foo' should be registered for the script + foo_fids = [i for i, f in enumerate(parsed.functions) if f["name"] == "foo" and f["path_id"] == script_path_id] + assert foo_fids, "Expected function entry for foo()" + foo_fid = foo_fids[0] + + # A call to foo() must be present (PY_START) and matched by a later return (PY_RETURN) + assert foo_fid in parsed.calls, "Expected a call to foo() to be recorded" + + # Returns are emitted in order; the first Return in this script should be the result of foo() + # and carry the concrete integer value 3 encoded by the writer + first_return = parsed.returns[0] + rv = first_return.get("return_value", {}) + assert rv.get("kind") == "Int" and rv.get("i") == 3 + + # LINE events: confirm that the key lines within foo() were stepped + # Compute concrete line numbers by scanning the file content + lines = script.read_text().splitlines() + want_lines = { + next(i + 1 for i, t in enumerate(lines) if t.strip() == "x = 1"), + next(i + 1 for i, t in enumerate(lines) if t.strip() == "y = 2"), + next(i + 1 for i, t in enumerate(lines) if t.strip() == "return x + y"), + } + seen_lines = {ln for pid, ln in parsed.steps if pid == script_path_id} + assert want_lines.issubset(seen_lines), f"Missing expected step lines: {want_lines - seen_lines}" + + +def test_start_while_active_raises(tmp_path: Path) -> None: + out_dir = tmp_path / "trace_out" + out_dir.mkdir() + session = codetracer.start(out_dir, format=codetracer.TRACE_JSON) + try: + with pytest.raises(RuntimeError): + codetracer.start(out_dir, format=codetracer.TRACE_JSON) + finally: + codetracer.stop() diff --git a/design-docs/py-api-001.md b/design-docs/py-api-001.md index 5043824..797b2f0 100644 --- a/design-docs/py-api-001.md +++ b/design-docs/py-api-001.md @@ -14,7 +14,7 @@ This document describes the user-facing Python API for the `codetracer` module b - Start a global trace; 
returns a `TraceSession`. ```py def start(path: str | os.PathLike, *, format: str = DEFAULT_FORMAT, - capture_values: bool = True, source_roots: Iterable[str | os.PathLike] | None = None) -> TraceSession + start_on_enter: str | os.PathLike | None = None) -> TraceSession ``` - Stop the active trace if any. ```py @@ -27,8 +27,7 @@ This document describes the user-facing Python API for the `codetracer` module b - Context manager helper for scoped tracing. ```py @contextlib.contextmanager - def trace(path: str | os.PathLike, *, format: str = DEFAULT_FORMAT, - capture_values: bool = True, source_roots: Iterable[str | os.PathLike] | None = None): + def trace(path: str | os.PathLike, *, format: str = DEFAULT_FORMAT): ... ``` - Flush buffered data to disk without ending the session. @@ -50,15 +49,25 @@ class TraceSession: def __exit__(self, exc_type, exc, tb) -> None: ... ``` +### Start Behavior +- `start_on_enter`: Optional path; when provided, tracing starts only after execution first enters this file (useful to avoid interpreter/import noise when launching via CLI). + +### Output Location +- `path` is a directory. The tracer writes three files inside it: + - `trace.json` when `format == "json"` or `trace.bin` when `format == "binary"` + - `trace_metadata.json` + - `trace_paths.json` + ## Environment Integration -- Auto-start tracing when `CODETRACER_TRACE` is set; the value is interpreted as the output path. +- Auto-start tracing when `CODETRACER_TRACE` is set; the value is interpreted as the output directory. - When `CODETRACER_FORMAT` is provided, it overrides the default output format. -- `CODETRACER_CAPTURE_VALUES` toggles value recording. 
## Usage Example ```py import codetracer +from pathlib import Path -with codetracer.trace("trace.bin"): +out_dir = Path("./traces/run-001") +with codetracer.trace(out_dir, format=codetracer.TRACE_JSON): run_application() ``` diff --git a/issues.md b/issues.md new file mode 100644 index 0000000..f45379f --- /dev/null +++ b/issues.md @@ -0,0 +1,19 @@ +# Issues Breaking Declared Relations + +This document lists concrete mismatches that cause the relations in `relations.md` to fail. + +It should be structured like so: +```md +## REL-001 +### ISSUE-001-001 +#### Description +Blah blah blah +#### Proposed solution +Blah blah bleh + +### ISSUE-001-002 +... + +## REL-002 +... +```