refactor: Step 6 - Clean-up and follow-up tasks

tzanko-matev · tzanko-matev · commit 64dd247302a2 · 2025-10-02T13:55:17.000+03:00
- Remove obsolete comments (e.g., `//TODO AI!` placeholders) or move them into GitHub issues.
- Update documentation and diagrams to reflect the new module tree.
- Re-run `just test` and linting for both Rust and Python components; capture trace artifacts to confirm unchanged output format.
diff --git a/codetracer-python-recorder/src/code_object.rs b/codetracer-python-recorder/src/code_object.rs
@@ -1,3 +1,5 @@
+//! Shared code-object caching utilities for sys.monitoring callbacks.
+
 use dashmap::DashMap;
 use once_cell::sync::OnceCell;
 use pyo3::prelude::*;
@@ -148,7 +150,8 @@ impl CodeObjectRegistry {
         self.map
             .entry(id)
             .or_insert_with(|| Arc::new(CodeObjectWrapper::new(py, code)))
-            .clone() //AI? Why do we need to clone here?
+            // Clone the `Arc` so each caller receives its own reference-counted handle.
+            .clone()
     }
 
     /// Remove the wrapper for a given code id, if present.
diff --git a/codetracer-python-recorder/src/logging.rs b/codetracer-python-recorder/src/logging.rs
@@ -1,3 +1,5 @@
+//! Process-wide logging helpers shared by the PyO3 entry points and tests.
+
 use std::sync::Once;
 
 /// Initialise the process-wide Rust logger with a default filter.
diff --git a/codetracer-python-recorder/src/monitoring/mod.rs b/codetracer-python-recorder/src/monitoring/mod.rs
@@ -1,3 +1,5 @@
+//! Helpers around CPython's `sys.monitoring` API.
+
 use pyo3::prelude::*;
 use pyo3::types::PyCFunction;
 use std::sync::OnceLock;
@@ -8,11 +10,13 @@ pub use tracer::{flush_installed_tracer, install_tracer, uninstall_tracer, Trace
 
 const MONITORING_TOOL_NAME: &str = "codetracer";
 
+/// Identifier for a monitoring event bit mask.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 #[repr(transparent)]
 pub struct EventId(pub i32);
 
 #[allow(non_snake_case)]
+/// Structured access to CPython's `sys.monitoring.events` values.
 #[derive(Clone, Copy, Debug)]
 pub struct MonitoringEvents {
     pub BRANCH: EventId,
@@ -34,16 +38,19 @@ pub struct MonitoringEvents {
     //pub STOP_ITERATION: EventId, //See comment in Tracer trait
 }
 
+/// Identifier of a tool slot claimed through `sys.monitoring.use_tool_id`.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct ToolId {
     pub id: u8,
 }
 
 pub type CallbackFn<'py> = Bound<'py, PyCFunction>;
 
+/// Bit-set describing which events are enabled for a tool.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct EventSet(pub i32);
 
+/// Convenience constant representing an empty event mask.
 pub const NO_EVENTS: EventSet = EventSet(0);
 
 /// Outcome returned by tracer callbacks to control CPython monitoring.
@@ -62,22 +69,26 @@ pub type CallbackResult = PyResult<CallbackOutcome>;
 static MONITORING_EVENTS: OnceLock<MonitoringEvents> = OnceLock::new();
 
 impl EventSet {
+    /// Create an empty event mask.
     pub const fn empty() -> Self {
         NO_EVENTS
     }
 
+    /// Return true when the set includes the provided event identifier.
     pub fn contains(&self, ev: &EventId) -> bool {
         (self.0 & ev.0) != 0
     }
 }
 
+/// Claim Codetracer's fixed monitoring tool id (5) via `sys.monitoring.use_tool_id`.
 pub fn acquire_tool_id(py: Python<'_>) -> PyResult<ToolId> {
     let monitoring = py.import("sys")?.getattr("monitoring")?;
     const FALLBACK_ID: u8 = 5;
     monitoring.call_method1("use_tool_id", (FALLBACK_ID, MONITORING_TOOL_NAME))?;
     Ok(ToolId { id: FALLBACK_ID })
 }
 
+/// Load monitoring event identifiers from CPython.
 pub fn load_monitoring_events(py: Python<'_>) -> PyResult<MonitoringEvents> {
     let monitoring = py.import("sys")?.getattr("monitoring")?;
     let events = monitoring.getattr("events")?;
@@ -102,6 +113,7 @@ pub fn load_monitoring_events(py: Python<'_>) -> PyResult<MonitoringEvents> {
     })
 }
 
+/// Return the process-wide cached monitoring event structure, loading it on first use.
 pub fn monitoring_events(py: Python<'_>) -> PyResult<&'static MonitoringEvents> {
     if let Some(ev) = MONITORING_EVENTS.get() {
         return Ok(ev);
@@ -111,6 +123,7 @@ pub fn monitoring_events(py: Python<'_>) -> PyResult<&'static MonitoringEvents>
     Ok(MONITORING_EVENTS.get().unwrap())
 }
 
+/// Register or unregister a single callback for the provided event.
 pub fn register_callback(
     py: Python<'_>,
     tool: &ToolId,
@@ -129,6 +142,7 @@ pub fn register_callback(
     Ok(())
 }
 
+/// Combine multiple event ids into a single bit mask.
 pub fn events_union(ids: &[EventId]) -> EventSet {
     let mut bits = 0i32;
     for id in ids {
@@ -137,12 +151,14 @@ pub fn events_union(ids: &[EventId]) -> EventSet {
     EventSet(bits)
 }
 
+/// Replace the tool's active event mask; pass `NO_EVENTS` to disable all events.
 pub fn set_events(py: Python<'_>, tool: &ToolId, set: EventSet) -> PyResult<()> {
     let monitoring = py.import("sys")?.getattr("monitoring")?;
     monitoring.call_method1("set_events", (tool.id, set.0))?;
     Ok(())
 }
 
+/// Release a previously acquired monitoring tool id.
 pub fn free_tool_id(py: Python<'_>, tool: &ToolId) -> PyResult<()> {
     let monitoring = py.import("sys")?.getattr("monitoring")?;
     monitoring.call_method1("free_tool_id", (tool.id,))?;
diff --git a/codetracer-python-recorder/src/monitoring/tracer.rs b/codetracer-python-recorder/src/monitoring/tracer.rs
@@ -1,3 +1,5 @@
+//! Tracer trait and sys.monitoring callback plumbing.
+
 use std::any::Any;
 use std::sync::Mutex;
 
diff --git a/codetracer-python-recorder/src/runtime/activation.rs b/codetracer-python-recorder/src/runtime/activation.rs
@@ -1,3 +1,5 @@
+//! Activation gating for the runtime tracer.
+
 use std::path::{Path, PathBuf};
 
 use pyo3::Python;
diff --git a/codetracer-python-recorder/src/runtime/mod.rs b/codetracer-python-recorder/src/runtime/mod.rs
@@ -1,3 +1,5 @@
+//! Runtime tracer facade translating sys.monitoring callbacks into `runtime_tracing` records.
+
 mod activation;
 mod output_paths;
 mod value_encoder;
@@ -7,7 +9,7 @@ pub use output_paths::TraceOutputPaths;
 use activation::ActivationController;
 use value_encoder::encode_value;
 
-use std::collections::HashSet;
+use std::collections::{hash_map::Entry, HashMap, HashSet};
 use std::path::{Path, PathBuf};
 
 use pyo3::prelude::*;
@@ -37,6 +39,7 @@ pub struct RuntimeTracer {
     activation: ActivationController,
     program_path: PathBuf,
     ignored_code_ids: HashSet<usize>,
+    function_ids: HashMap<usize, runtime_tracing::FunctionId>,
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -67,6 +70,7 @@ impl RuntimeTracer {
             activation,
             program_path,
             ignored_code_ids: HashSet::new(),
+            function_ids: HashMap::new(),
         }
     }
 
@@ -90,17 +94,21 @@ impl RuntimeTracer {
         py: Python<'_>,
         code: &CodeObjectWrapper,
     ) -> PyResult<runtime_tracing::FunctionId> {
-        //TODO AI! current runtime tracer logic expects that `name` is unique and is used as a key for the function.
-        //This is wrong. We need to write a test that exposes this issue
-        let name = code.qualname(py)?;
-        let filename = code.filename(py)?;
-        let first_line = code.first_line(py)?;
-        Ok(TraceWriter::ensure_function_id(
-            &mut self.writer,
-            name,
-            Path::new(filename),
-            Line(first_line as i64),
-        ))
+        match self.function_ids.entry(code.id()) {
+            Entry::Occupied(entry) => Ok(*entry.get()),
+            Entry::Vacant(slot) => {
+                let name = code.qualname(py)?;
+                let filename = code.filename(py)?;
+                let first_line = code.first_line(py)?;
+                let function_id = TraceWriter::ensure_function_id(
+                    &mut self.writer,
+                    name,
+                    Path::new(filename),
+                    Line(first_line as i64),
+                );
+                Ok(*slot.insert(function_id))
+            }
+        }
     }
 
     fn should_trace_code(&mut self, py: Python<'_>, code: &CodeObjectWrapper) -> ShouldTrace {
@@ -442,6 +450,7 @@ impl Tracer for RuntimeTracer {
         TraceWriter::finish_writing_trace_paths(&mut self.writer).map_err(to_py_err)?;
         TraceWriter::finish_writing_trace_events(&mut self.writer).map_err(to_py_err)?;
         self.ignored_code_ids.clear();
+        self.function_ids.clear();
         Ok(())
     }
 }
diff --git a/codetracer-python-recorder/src/runtime/output_paths.rs b/codetracer-python-recorder/src/runtime/output_paths.rs
@@ -1,3 +1,5 @@
+//! File-system helpers for trace output management.
+
 use std::error::Error;
 use std::path::{Path, PathBuf};
 
diff --git a/codetracer-python-recorder/src/runtime/value_encoder.rs b/codetracer-python-recorder/src/runtime/value_encoder.rs
@@ -1,3 +1,5 @@
+//! Encode Python values into `runtime_tracing` records.
+
 use pyo3::prelude::*;
 use pyo3::types::{PyAny, PyDict, PyList, PyTuple};
 use runtime_tracing::{NonStreamingTraceWriter, TraceWriter, TypeKind, ValueRecord, NONE_VALUE};
diff --git a/codetracer-python-recorder/src/session.rs b/codetracer-python-recorder/src/session.rs
@@ -1,3 +1,5 @@
+//! PyO3 entry points for starting and managing trace sessions.
+
 use std::fs;
 use std::path::Path;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -8,10 +10,23 @@ use pyo3::prelude::*;
 use crate::logging::init_rust_logging_with_default;
 use crate::monitoring::{flush_installed_tracer, install_tracer, uninstall_tracer};
 use crate::runtime::{RuntimeTracer, TraceOutputPaths};
+use runtime_tracing::TraceEventsFileFormat;
 
 /// Global flag tracking whether tracing is active.
 static ACTIVE: AtomicBool = AtomicBool::new(false);
 
+/// Map human-friendly strings to `TraceEventsFileFormat` variants.
+fn parse_trace_format(format: &str) -> PyResult<TraceEventsFileFormat> {
+    match format.to_ascii_lowercase().as_str() {
+        "json" => Ok(TraceEventsFileFormat::Json),
+        // Accept historical aliases that may still be referenced by callers.
+        "binary" | "binaryv0" | "binary_v0" | "b0" => Ok(TraceEventsFileFormat::BinaryV0),
+        other => Err(PyRuntimeError::new_err(format!(
+            "unsupported trace format '{other}'. Expected one of: json, binary"
+        ))),
+    }
+}
+
 /// Start tracing using sys.monitoring and runtime_tracing writer.
 #[pyfunction]
 pub fn start_tracing(path: &str, format: &str, activation_path: Option<&str>) -> PyResult<()> {
@@ -36,19 +51,7 @@ pub fn start_tracing(path: &str, format: &str, activation_path: Option<&str>) ->
         })?;
     }
 
-    // Map format string to enum
-    let fmt = match format.to_lowercase().as_str() {
-        "json" => runtime_tracing::TraceEventsFileFormat::Json,
-        // Use BinaryV0 for "binary" to avoid streaming writer here.
-        "binary" | "binaryv0" | "binary_v0" | "b0" => {
-            runtime_tracing::TraceEventsFileFormat::BinaryV0
-        }
-        //TODO AI! We need to assert! that the format is among the known values.
-        other => {
-            eprintln!("Unknown format '{}', defaulting to binary (v0)", other);
-            runtime_tracing::TraceEventsFileFormat::BinaryV0
-        }
-    };
+    let fmt = parse_trace_format(format)?;
 
     let outputs = TraceOutputPaths::new(out_dir, fmt);
 
@@ -59,10 +62,23 @@ pub fn start_tracing(path: &str, format: &str, activation_path: Option<&str>) ->
         // Program and args: keep minimal; Python-side API stores full session info if needed
         let sys = py.import("sys")?;
         let argv = sys.getattr("argv")?;
-        let program: String = argv.get_item(0)?.extract::<String>()?;
-        //TODO: Error-handling. What to do if argv is empty? Does this ever happen?
+        let program = match argv.get_item(0) {
+            Ok(obj) => obj.extract::<String>()?,
+            Err(_) => String::from("<unknown>"),
+        };
+        let args = match argv.len() {
+            Ok(len) if len > 1 => {
+                let mut items = Vec::with_capacity(len.saturating_sub(1));
+                for idx in 1..len {
+                    let value: String = argv.get_item(idx)?.extract()?;
+                    items.push(value);
+                }
+                items
+            }
+            _ => Vec::new(),
+        };
 
-        let mut tracer = RuntimeTracer::new(&program, &[], fmt, activation_path);
+        let mut tracer = RuntimeTracer::new(&program, &args, fmt, activation_path);
         tracer.begin(&outputs, 1)?;
 
         // Install callbacks
diff --git a/design-docs/file-level-srp-refactor-plan.status.md b/design-docs/file-level-srp-refactor-plan.status.md
@@ -5,7 +5,8 @@
 - ✅ Step 3 complete: added `src/runtime/mod.rs` with focused `activation`, `value_encoder`, and `output_paths` submodules; `RuntimeTracer` now delegates activation gating, value encoding, and writer initialisation through the façade consumed by `session.rs`.
 - ✅ Step 4 complete: introduced `src/monitoring/mod.rs` for sys.monitoring types/caches and `src/monitoring/tracer.rs` for the tracer trait plus callback dispatch; rewired `lib.rs`, `session.rs`, and `runtime/mod.rs`, and kept a top-level `tracer` re-export for API stability.
 - ✅ Step 5 complete: split the Python package into dedicated `formats.py`, `session.py`, and `auto_start.py` modules, trimmed `api.py` to a thin façade, and moved the environment auto-start hook into `__init__.py`.
+- ✅ Step 6 complete: resolved outstanding Rust TODOs (format validation, argv handling, function id stability), expanded module documentation so `cargo doc` reflects the architecture, and re-ran `just test` to confirm the refactor remains green.
 - ✅ Test baseline: `just test` (nextest + pytest) passes with the UV cache scoped to the workspace; direct `cargo test` still requires CPython development symbols.
 
 ## Next Task
-- Step 6: Clean up follow-up items (resolve TODOs, refresh docs/diagrams, and re-run the test/lint suite) to close out the refactor roadmap.
+- Plan complete. Identify any new follow-up items as separate tasks once additional requirements surface.

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+//! Process-wide logging helpers shared by the PyO3 entry points and tests.`
	`2`	`+`
`1`	`3`	`use std::sync::Once;`
`2`	`4`
`3`	`5`	`/// Initialise the process-wide Rust logger with a default filter.`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+//! Tracer trait and sys.monitoring callback plumbing.`
	`2`	`+`
`1`	`3`	`use std::any::Any;`
`2`	`4`	`use std::sync::Mutex;`
`3`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+//! Activation gating for the runtime tracer.`
	`2`	`+`
`1`	`3`	`use std::path::{Path, PathBuf};`
`2`	`4`
`3`	`5`	`use pyo3::Python;`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+//! File-system helpers for trace output management.`
	`2`	`+`
`1`	`3`	`use std::error::Error;`
`2`	`4`	`use std::path::{Path, PathBuf};`
`3`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	``+//! Encode Python values into `runtime_tracing` records.``
	`2`	`+`
`1`	`3`	`use pyo3::prelude::*;`
`2`	`4`	`use pyo3::types::{PyAny, PyDict, PyList, PyTuple};`
`3`	`5`	`use runtime_tracing::{NonStreamingTraceWriter, TraceWriter, TypeKind, ValueRecord, NONE_VALUE};`