
Commit 310ec57

fix: Handle non-tty scenario correctly for aks agent evals

1 parent: e761a75

3 files changed, +146 −21 lines changed

src/aks-agent/azext_aks_agent/agent/agent.py

Lines changed: 21 additions & 8 deletions

@@ -166,15 +166,28 @@ def aks_agent(
     os.environ[CONST_AGENT_CONFIG_PATH_DIR_ENV_KEY] = get_config_dir()
     os.environ[CONST_AGENT_NAME_ENV_KEY] = CONST_AGENT_NAME
 
-    # Detect and read piped input
+    # Detect and read piped input with timeout protection
     piped_data = None
-    if not sys.stdin.isatty():
-        piped_data = sys.stdin.read().strip()
-        if interactive:
-            console.print(
-                "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]"
-            )
-            interactive = False
+    # In non-interactive mode with a prompt, we shouldn't try to read stdin
+    # as it may hang in CI/CD environments. Only read stdin if:
+    # 1. Not a TTY (indicating piped input)
+    # 2. Interactive mode is enabled (allows stdin reading)
+    should_check_stdin = not sys.stdin.isatty() and interactive
+
+    if should_check_stdin:
+        try:
+            # Use select with timeout to avoid hanging
+            import select
+            # Check if data is available with 100ms timeout
+            if select.select([sys.stdin], [], [], 0.1)[0]:
+                piped_data = sys.stdin.read().strip()
+                console.print(
+                    "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]"
+                )
+                interactive = False
+        except Exception:
+            # Continue without piped data if stdin reading fails
+            pass
 
     # Determine MCP mode and smart refresh logic
     use_aks_mcp = bool(use_aks_mcp)
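
Note: the new stdin probe condenses to the pattern below. This is a minimal sketch, not the committed function; the helper name and timeout parameter are illustrative, and it assumes a POSIX platform, since select.select() only supports sockets on Windows, which is exactly why the broad except-and-continue fallback matters.

# Sketch of the stdin probe pattern (illustrative helper, not from the commit).
import select
import sys
from typing import Optional


def read_piped_input(interactive: bool, timeout: float = 0.1) -> Optional[str]:
    """Return piped stdin data, or None if there is nothing to read."""
    # Same gate as the commit: only probe when stdin is piped AND
    # interactive mode would otherwise sit waiting on input.
    if sys.stdin.isatty() or not interactive:
        return None
    try:
        # select() reports readability without blocking past the timeout,
        # so a CI runner with an open-but-silent stdin cannot hang here.
        ready, _, _ = select.select([sys.stdin], [], [], timeout)
        if ready:
            return sys.stdin.read().strip()
    except Exception:
        # Mirror the commit: fall through without piped data on failure
        # (e.g. platforms where select() does not support stdin).
        pass
    return None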

src/aks-agent/azext_aks_agent/tests/evals/test_ask_agent.py

Lines changed: 117 additions & 13 deletions

@@ -8,7 +8,10 @@
 import os
 import shlex
 import subprocess
+import textwrap
+from datetime import datetime, timezone
 from pathlib import Path
+from time import perf_counter
 from typing import Iterable
 
 
@@ -40,6 +43,26 @@
 ITERATIONS = int(os.environ.get("ITERATIONS", "1"))
 BRAINTRUST_UPLOADER = BraintrustUploader(os.environ)
 
+
+def _log(message: str) -> None:
+    """Emit a timestamped log line that pytest `-s` will surface immediately."""
+    timestamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
+    print(f"[{timestamp}] {message}", flush=True)
+
+
+def _summarise_command(parts: Iterable[str]) -> str:
+    """Return a shell-style command string for debugging output."""
+    sequence = parts if isinstance(parts, list) else list(parts)
+    if hasattr(shlex, "join"):
+        return shlex.join(sequence)
+    # ``shlex.join`` was added in Python 3.8; keep a safe fallback just in case.
+    return " ".join(shlex.quote(part) for part in sequence)
+
+
+def _preview_output(output: str, *, limit: int = 400) -> str:
+    """Provide a trimmed preview of command output for quick debugging."""
+    return textwrap.shorten(output.strip(), width=limit, placeholder=" …")
+
 pytestmark = [
     pytest.mark.skipif(
         not RUN_LIVE,
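
Roughly, the three new helpers behave like this (sample values illustrative; timestamps depend on the run):

# Illustrative outputs only, not part of the commit.
_log("starting")
# prints: [2025-01-01T00:00:00+00:00] starting

_summarise_command(["az", "aks", "show", "-n", "my cluster"])
# returns: az aks show -n 'my cluster'   (shell-style quoting via shlex)

_preview_output("word " * 200)
# returns the first ~400 characters, ending with the " …" placeholder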
@@ -90,14 +113,62 @@ def _build_command(prompt: str, model: str, resource_group: str, cluster_name: s
 
 
 def _run_cli(command: Iterable[str], env: dict[str, str]) -> str:
+    command_list = list(command)
+    command_display = _summarise_command(command_list)
+    _log(f"Invoking AKS Agent CLI: {command_display}")
+    start = perf_counter()
+
+    # Add timeout and real-time stderr display for debugging
+    import sys
+    import threading
+    timeout_seconds = 600  # 10 minutes timeout
+
     try:
-        result = subprocess.run(  # noqa: S603
-            list(command),
-            check=True,
-            capture_output=True,
+        # Use Popen for real-time output visibility
+        process = subprocess.Popen(  # noqa: S603
+            command_list,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
             text=True,
             env=env,
         )
+
+        # Thread to print stderr in real-time
+        stderr_lines = []
+        def print_stderr():
+            for line in iter(process.stderr.readline, ''):
+                if line:
+                    print(f"[STDERR] {line.rstrip()}", file=sys.stderr, flush=True)
+                    stderr_lines.append(line)
+
+        stderr_thread = threading.Thread(target=print_stderr, daemon=True)
+        stderr_thread.start()
+
+        # Wait with timeout
+        try:
+            stdout, _ = process.communicate(timeout=timeout_seconds)
+            stderr_thread.join(timeout=1)
+            stderr = ''.join(stderr_lines)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            stdout, stderr_remainder = process.communicate()
+            stderr = ''.join(stderr_lines) + (stderr_remainder or '')
+            _log(f"[ERROR] CLI command timed out after {timeout_seconds}s")
+            pytest.fail(
+                f"AKS Agent CLI call timed out after {timeout_seconds}s\n"
+                f"Command: {command_display}\n"
+                f"Stdout: {stdout}\n"
+                f"Stderr: {stderr}"
+            )
+
+        if process.returncode != 0:
+            raise subprocess.CalledProcessError(
+                process.returncode, command_list, stdout, stderr
+            )
+
+        result = subprocess.CompletedProcess(
+            command_list, process.returncode, stdout, stderr
+        )
     except subprocess.CalledProcessError as exc:  # pragma: no cover - live failure path
         output = exc.stdout or ""
         stderr = exc.stderr or ""
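
The core of this rewrite is the "stream stderr live, but never wait forever" pattern. A self-contained sketch of that pattern follows (function and variable names are illustrative, not the committed helper); this variant gives each pipe its own reader thread instead of mixing communicate() with a stderr-reading thread, which keeps two readers from competing for the same pipe:

# Minimal sketch: stream subprocess output with a hard timeout.
import subprocess
import sys
import threading


def run_streaming(cmd: list[str], timeout: float = 600.0) -> str:
    process = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    stdout_parts: list[str] = []
    stderr_parts: list[str] = []

    def pump(pipe, parts, echo):
        # readline() returns '' at EOF, which ends the loop.
        for line in iter(pipe.readline, ""):
            parts.append(line)
            if echo:
                print(f"[STDERR] {line.rstrip()}", file=sys.stderr, flush=True)

    readers = [
        threading.Thread(target=pump, args=(process.stdout, stdout_parts, False), daemon=True),
        threading.Thread(target=pump, args=(process.stderr, stderr_parts, True), daemon=True),
    ]
    for reader in readers:
        reader.start()
    try:
        process.wait(timeout=timeout)  # raises TimeoutExpired when exceeded
    except subprocess.TimeoutExpired:
        process.kill()  # closing the pipes lets the reader threads hit EOF
        process.wait()
        raise
    finally:
        for reader in readers:
            reader.join(timeout=1)
    if process.returncode != 0:
        raise subprocess.CalledProcessError(
            process.returncode, cmd, "".join(stdout_parts), "".join(stderr_parts)
        )
    return "".join(stdout_parts)

The committed version additionally reports timeouts and non-zero exits through pytest.fail with the captured output, as shown in the hunks above and below.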
@@ -109,13 +180,28 @@ def _run_cli(command: Iterable[str], env: dict[str, str]) -> str:
             f"Stdout: {output}\n"
             f"Stderr: {stderr}"
         )
+    duration = perf_counter() - start
+    stdout_preview = _preview_output(result.stdout)
+    stderr_preview = _preview_output(result.stderr) if result.stderr else None
+    _log(
+        f"AKS Agent CLI completed in {duration:.1f}s with stdout preview: {stdout_preview}"
+    )
+    if stderr_preview:
+        _log(
+            f"AKS Agent CLI stderr preview: {stderr_preview}"
+        )
     return result.stdout
 
 
 def _run_commands(
     commands: list[str], env: dict[str, str], label: str, scenario: Scenario
 ) -> None:
+    if not commands:
+        _log(f"[{label}] {scenario.name}: no commands to run")
+        return
     for cmd in commands:
+        _log(f"[{label}] {scenario.name}: running shell command: {cmd}")
+        start = perf_counter()
         try:
             completed = subprocess.run(  # noqa: S603
                 cmd,
@@ -137,9 +223,25 @@ def _run_commands(
                 f"Stderr: {stderr}"
             )
         else:
+            duration = perf_counter() - start
             # Provide quick visibility into command results when debugging failures.
             if completed.stdout:
-                print(f"[{label}] {scenario.name}: {completed.stdout.strip()}")
+                stdout_preview = _preview_output(completed.stdout)
+                _log(
+                    f"[{label}] {scenario.name}: succeeded in {duration:.1f}s; stdout preview: {stdout_preview}"
+                )
+            else:
+                _log(
+                    f"[{label}] {scenario.name}: succeeded in {duration:.1f}s; no stdout produced"
+                )
+            if completed.stderr:
+                stderr_preview = _preview_output(completed.stderr)
+                _log(
+                    f"[{label}] {scenario.name}: stderr preview: {stderr_preview}"
+                )
+    _log(
+        f"[{label}] {scenario.name}: completed {len(commands)} command(s)"
+    )
 
 
 def _scenario_params() -> list:
@@ -165,6 +267,7 @@ def test_ask_agent_live(
     request: pytest.FixtureRequest,
 ) -> None:
     iteration_label = f"[iteration {iteration + 1}/{ITERATIONS}]"
+    _log(f"{iteration_label} starting scenario {scenario.name}")
     if RUN_LIVE:
         env = _load_env()
 
@@ -178,7 +281,7 @@
         env.update(scenario.env_overrides)
 
         if iteration == 0 and scenario.before_commands and not aks_skip_setup:
-            print(f"{iteration_label} running setup commands for {scenario.name}")
+            _log(f"{iteration_label} running setup commands for {scenario.name}")
             _run_commands(scenario.before_commands, env, "setup", scenario)
 
         command = _build_command(
@@ -188,7 +291,7 @@
             cluster_name=cluster_name,
         )
 
-        print(f"{iteration_label} invoking AKS Agent CLI for {scenario.name}")
+        _log(f"{iteration_label} invoking AKS Agent CLI for {scenario.name}")
         try:
             raw_output = _run_cli(command, env)
             answer = ""
@@ -216,11 +319,11 @@
                 classifier_rationale = classifier_result.metadata.get(
                     "rationale", ""
                 )
-                print(
+                _log(
                     f"{iteration_label} classifier score for {scenario.name}: {classifier_score}"
                 )
                 if classifier_score is None:
-                    print(
+                    _log(
                         f"{iteration_label} classifier returned no score for {scenario.name}; falling back to substring checks"
                     )
                 else:
@@ -230,7 +333,7 @@
                     if not error_message:
                         error_message = "Classifier judged answer incorrect"
             else:
-                print(
+                _log(
                     f"{iteration_label} classifier unavailable for {scenario.name}; falling back to substring checks"
                 )
 
@@ -280,21 +383,21 @@
 
             if GENERATE_MOCKS:
                 mock_path = save_mock_answer(scenario.mock_path, answer)
-                print(f"{iteration_label} [mock] wrote response to {mock_path}")
+                _log(f"{iteration_label} [mock] wrote response to {mock_path}")
         finally:
             if (
                 iteration == ITERATIONS - 1
                 and scenario.after_commands
                 and not aks_skip_cleanup
             ):
-                print(f"{iteration_label} running cleanup commands for {scenario.name}")
+                _log(f"{iteration_label} running cleanup commands for {scenario.name}")
                 _run_commands(scenario.after_commands, env, "cleanup", scenario)
     else:
         if GENERATE_MOCKS:
            pytest.fail("GENERATE_MOCKS requires RUN_LIVE=true")
         try:
             answer = load_mock_answer(scenario.mock_path)
-            print(f"{iteration_label} replayed mock response for {scenario.name}")
+            _log(f"{iteration_label} replayed mock response for {scenario.name}")
         except FileNotFoundError:
             pytest.skip(f"Mock response missing for scenario {scenario.name}; rerun with RUN_LIVE=true GENERATE_MOCKS=true")
 
@@ -328,5 +431,6 @@ def test_ask_agent_live(
         _set_user_property(request, 'braintrust_root_span_id', str(root_span_id))
     if url:
         _set_user_property(request, 'braintrust_experiment_url', str(url))
+    _log(f"{iteration_label} completed scenario {scenario.name} (passed={passed})")
     if not passed:
         pytest.fail(f"Scenario {scenario.name}: {error}\nAI answer:\n{answer}")

src/aks-agent/setup.cfg

Lines changed: 8 additions & 0 deletions

@@ -1,2 +1,10 @@
 [bdist_wheel]
 universal=1
+
+[tool:pytest]
+markers =
+    easy: Regression AKS Agent evals that should always pass
+    medium: Stretch AKS Agent evals that may fail occasionally
+    hard: Challenging AKS Agent evals reserved for complex scenarios
+    kubernetes: AKS Agent evals that exercise Kubernetes-focused flows
+    aks_eval: AKS Agent evaluation tests
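
Registering markers here keeps pytest from emitting unknown-mark warnings (and from failing under --strict-markers) when tests tag themselves. A hypothetical test using these markers, selectable with e.g. pytest -m "aks_eval and easy":

# Hypothetical example, not from this commit.
import pytest


@pytest.mark.aks_eval
@pytest.mark.easy
@pytest.mark.kubernetes
def test_cluster_show_smoke():
    ...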
