Make output assertions more explicit (#4784)

pakrym-oai · web-flow · commit b2d81a7cacde · 2025-10-05T16:01:38.000-07:00
Match using precise regexes.
diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock
diff --git a/codex-rs/core/tests/common/Cargo.toml b/codex-rs/core/tests/common/Cargo.toml
@@ -10,6 +10,7 @@ path = "lib.rs"
 anyhow = { workspace = true }
 assert_cmd = { workspace = true }
 codex-core = { workspace = true }
+regex-lite = { workspace = true }
 serde_json = { workspace = true }
 tempfile = { workspace = true }
 tokio = { workspace = true, features = ["time"] }
diff --git a/codex-rs/core/tests/common/lib.rs b/codex-rs/core/tests/common/lib.rs
@@ -6,6 +6,7 @@ use codex_core::CodexConversation;
 use codex_core::config::Config;
 use codex_core::config::ConfigOverrides;
 use codex_core::config::ConfigToml;
+use regex_lite::Regex;
 
 #[cfg(target_os = "linux")]
 use assert_cmd::cargo::cargo_bin;
@@ -14,6 +15,28 @@ pub mod responses;
 pub mod test_codex;
 pub mod test_codex_exec;
 
+/// Asserts that `pattern` matches `actual` and returns the resulting captures.
+///
+/// The match is a search via `Regex::captures`; callers anchor the pattern
+/// with `^`/`$` when the whole string must match.
+///
+/// # Panics
+///
+/// Panics if `pattern` fails to compile or does not match `actual`. The panics
+/// are raised directly in this `#[track_caller]` function rather than inside
+/// closures, so failures are reported at the calling test's line.
+#[track_caller]
+pub fn assert_regex_match<'s>(pattern: &str, actual: &'s str) -> regex_lite::Captures<'s> {
+    let regex = match Regex::new(pattern) {
+        Ok(regex) => regex,
+        Err(err) => panic!("failed to compile regex {pattern:?}: {err}"),
+    };
+    match regex.captures(actual) {
+        Some(captures) => captures,
+        None => panic!("regex {pattern:?} did not match {actual:?}"),
+    }
+}
+
 /// Returns a default `Config` whose on-disk state is confined to the provided
 /// temporary directory. Using a per-test directory keeps tests hermetic and
 /// avoids clobbering a developer’s real `~/.codex`.
diff --git a/codex-rs/core/tests/suite/shell_serialization.rs b/codex-rs/core/tests/suite/shell_serialization.rs
@@ -8,6 +8,7 @@ use codex_core::protocol::InputItem;
 use codex_core::protocol::Op;
 use codex_core::protocol::SandboxPolicy;
 use codex_protocol::config_types::ReasoningSummary;
+use core_test_support::assert_regex_match;
 use core_test_support::responses::ev_assistant_message;
 use core_test_support::responses::ev_completed;
 use core_test_support::responses::ev_function_call;
@@ -131,10 +132,7 @@ async fn shell_output_stays_json_without_freeform_apply_patch() -> Result<()> {
         .get("output")
         .and_then(Value::as_str)
         .unwrap_or_default();
-    assert!(
-        stdout.contains("shell json"),
-        "expected stdout to include command output, got {stdout:?}"
-    );
+    assert_regex_match(r"(?s)^shell json\n?$", stdout);
 
     Ok(())
 }
@@ -190,18 +188,12 @@ async fn shell_output_is_structured_with_freeform_apply_patch() -> Result<()> {
         serde_json::from_str::<Value>(output).is_err(),
         "expected structured shell output to be plain text",
     );
-    assert!(
-        output.starts_with("Exit code: 0\n"),
-        "expected exit code prefix, got {output:?}",
-    );
-    assert!(
-        output.contains("\nOutput:\n"),
-        "expected Output section, got {output:?}"
-    );
-    assert!(
-        output.contains("freeform shell"),
-        "expected stdout content, got {output:?}"
-    );
+    let expected_pattern = r"(?s)^Exit code: 0
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
+Output:
+freeform shell
+?$";
+    assert_regex_match(expected_pattern, output);
 
     Ok(())
 }
@@ -259,18 +251,27 @@ async fn shell_output_reserializes_truncated_content() -> Result<()> {
         serde_json::from_str::<Value>(output).is_err(),
         "expected truncated shell output to be plain text",
     );
-    assert!(
-        output.starts_with("Exit code: 0\n"),
-        "expected exit code prefix, got {output:?}",
-    );
-    assert!(
-        output.lines().any(|line| line == "Total output lines: 400"),
-        "expected total output lines marker, got {output:?}",
-    );
-    assert!(
-        output.contains("[... omitted"),
-        "expected truncated marker, got {output:?}",
-    );
+    let truncated_pattern = r#"(?s)^Exit code: 0
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
+Total output lines: 400
+Output:
+Total output lines: 400
+
+1
+2
+3
+4
+5
+6
+.*\[\.{3} omitted \d+ of 400 lines \.{3}\]
+
+.*\n396
+397
+398
+399
+400
+$"#;
+    assert_regex_match(truncated_pattern, output);
 
     Ok(())
 }
diff --git a/codex-rs/core/tests/suite/tool_harness.rs b/codex-rs/core/tests/suite/tool_harness.rs
@@ -9,6 +9,7 @@ use codex_core::protocol::Op;
 use codex_core::protocol::SandboxPolicy;
 use codex_protocol::config_types::ReasoningSummary;
 use codex_protocol::plan_tool::StepStatus;
+use core_test_support::assert_regex_match;
 use core_test_support::responses;
 use core_test_support::responses::ev_apply_patch_function_call;
 use core_test_support::responses::ev_assistant_message;
@@ -116,10 +117,7 @@ async fn shell_tool_executes_command_and_streams_output() -> anyhow::Result<()>
     let exec_output: Value = serde_json::from_str(output_text)?;
     assert_eq!(exec_output["metadata"]["exit_code"], 0);
     let stdout = exec_output["output"].as_str().expect("stdout field");
-    assert!(
-        stdout.contains("tool harness"),
-        "expected stdout to contain command output, got {stdout:?}"
-    );
+    assert_regex_match(r"(?s)^tool harness\n?$", stdout);
 
     Ok(())
 }
diff --git a/codex-rs/core/tests/suite/tools.rs b/codex-rs/core/tests/suite/tools.rs
@@ -9,6 +9,7 @@ use codex_core::protocol::InputItem;
 use codex_core::protocol::Op;
 use codex_core::protocol::SandboxPolicy;
 use codex_protocol::config_types::ReasoningSummary;
+use core_test_support::assert_regex_match;
 use core_test_support::responses::ev_assistant_message;
 use core_test_support::responses::ev_completed;
 use core_test_support::responses::ev_custom_tool_call;
@@ -21,6 +22,7 @@ use core_test_support::skip_if_no_network;
 use core_test_support::test_codex::TestCodex;
 use core_test_support::test_codex::test_codex;
 use core_test_support::wait_for_event;
+use regex_lite::Regex;
 use serde_json::Value;
 use serde_json::json;
 use wiremock::Request;
@@ -254,10 +256,8 @@ async fn shell_escalated_permissions_rejected_then_ok() -> Result<()> {
         "expected exit code 0 after rerunning without escalation",
     );
     let stdout = output_json["output"].as_str().unwrap_or_default();
-    assert!(
-        stdout.contains("shell ok"),
-        "expected stdout to include command output, got {stdout:?}"
-    );
+    let stdout_pattern = r"(?s)^shell ok\n?$";
+    assert_regex_match(stdout_pattern, stdout);
 
     Ok(())
 }
@@ -437,30 +437,24 @@ async fn shell_timeout_includes_timeout_prefix_and_metadata() -> Result<()> {
         );
 
         let stdout = output_json["output"].as_str().unwrap_or_default();
-        assert!(
-            stdout.contains("command timed out after "),
-            "expected timeout prefix, got {stdout:?}"
-        );
-        let third_line = stdout.lines().nth(2).unwrap_or_default();
-        let duration_ms = third_line
-            .strip_prefix("command timed out after ")
-            .and_then(|line| line.strip_suffix(" milliseconds"))
-            .and_then(|value| value.parse::<u64>().ok())
+        let timeout_pattern = r"(?s)^Total output lines: \d+
+
+command timed out after (?P<ms>\d+) milliseconds
+line
+.*$";
+        let captures = assert_regex_match(timeout_pattern, stdout);
+        let duration_ms = captures
+            .name("ms")
+            .and_then(|m| m.as_str().parse::<u64>().ok())
             .unwrap_or_default();
         assert!(
             duration_ms >= timeout_ms,
             "expected duration >= configured timeout, got {duration_ms} (timeout {timeout_ms})"
         );
     } else {
         // Fallback: accept the signal classification path to deflake the test.
-        assert!(
-            output_str.contains("execution error"),
-            "unexpected non-JSON output: {output_str:?}"
-        );
-        assert!(
-            output_str.contains("Signal(") || output_str.to_lowercase().contains("signal"),
-            "expected signal classification in error output, got {output_str:?}"
-        );
+        let signal_pattern = r"(?is)^execution error:.*signal.*$";
+        assert_regex_match(signal_pattern, output_str);
     }
 
     Ok(())
@@ -518,30 +512,25 @@ async fn shell_sandbox_denied_truncates_error_output() -> Result<()> {
         .and_then(Value::as_str)
         .expect("denied output string");
 
-    assert!(
-        output.contains("failed in sandbox: "),
-        "expected sandbox error prefix, got {output:?}"
-    );
-    assert!(
-        output.contains("[... omitted"),
-        "expected truncated marker, got {output:?}"
-    );
-    assert!(
-        output.contains(long_line),
-        "expected truncated stderr sample, got {output:?}"
-    );
-    // Linux distributions may surface sandbox write failures as different errno messages
-    // depending on the underlying mechanism (e.g., EPERM, EACCES, or EROFS). Accept a
-    // small set of common variants to keep this cross-platform.
-    let denial_markers = [
-        "Operation not permitted", // EPERM
-        "Permission denied",       // EACCES
-        "Read-only file system",   // EROFS
-    ];
-    assert!(
-        denial_markers.iter().any(|m| output.contains(m)),
-        "expected sandbox denial message, got {output:?}"
-    );
+    let sandbox_pattern = r#"(?s)^Exit code: -?\d+
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
+Total output lines: \d+
+Output:
+Total output lines: \d+
+
+failed in sandbox: .*?(?:Operation not permitted|Permission denied|Read-only file system).*?
+\[\.{3} omitted \d+ of \d+ lines \.{3}\]
+.*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
+\n?$"#;
+    let sandbox_regex = Regex::new(sandbox_pattern)?;
+    if !sandbox_regex.is_match(output) {
+        let fallback_pattern = r#"(?s)^Total output lines: \d+
+
+failed in sandbox: this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz
+.*this is a long stderr line that should trigger truncation 0123456789abcdefghijklmnopqrstuvwxyz.*
+.*(?:Operation not permitted|Permission denied|Read-only file system).*$"#;
+        assert_regex_match(fallback_pattern, output);
+    }
 
     Ok(())
 }
@@ -604,10 +593,23 @@ async fn shell_spawn_failure_truncates_exec_error() -> Result<()> {
         .and_then(Value::as_str)
         .expect("spawn failure output string");
 
-    assert!(
-        output.contains("execution error:"),
-        "expected execution error prefix, got {output:?}"
-    );
+    let spawn_error_pattern = r#"(?s)^Exit code: -?\d+
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
+Output:
+execution error: .*$"#;
+    let spawn_truncated_pattern = r#"(?s)^Exit code: -?\d+
+Wall time: [0-9]+(?:\.[0-9]+)? seconds
+Total output lines: \d+
+Output:
+Total output lines: \d+
+
+execution error: .*$"#;
+    let spawn_error_regex = Regex::new(spawn_error_pattern)?;
+    let spawn_truncated_regex = Regex::new(spawn_truncated_pattern)?;
+    if !spawn_error_regex.is_match(output) && !spawn_truncated_regex.is_match(output) {
+        let fallback_pattern = r"(?s)^execution error: .*$";
+        assert_regex_match(fallback_pattern, output);
+    }
     assert!(output.len() <= 10 * 1024);
 
     Ok(())