
Commit 50739f9

Add LLM token usage reporting and improve test script
Enhanced ShinyTestGenerator to print LLM token usage, cost, and elapsed time after each chat call. Updated run-test-evaluation.sh to allow a configurable number of attempts, improved logging, and performed minor cleanup for better maintainability and clarity.
Parent: 7d420e7

2 files changed: +59 additions, -25 deletions
shiny/pytest/_generate/_main.py (39 additions, 1 deletion)

@@ -3,11 +3,12 @@
 import os
 import re
 import sys
+import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, Optional, Tuple, Union
 
-from chatlas import ChatAnthropic, ChatOpenAI
+from chatlas import ChatAnthropic, ChatOpenAI, token_usage
 from dotenv import load_dotenv
 
 __all__ = [
@@ -196,7 +197,44 @@ def get_llm_response(self, prompt: str, model: Optional[str] = None) -> str:
         else:
             raise ValueError(f"Unsupported provider: {self.provider}")
 
+        start_time = time.perf_counter()
         response = chat.chat(prompt)
+        elapsed = time.perf_counter() - start_time
+        usage = token_usage()
+        try:
+
+            def _fmt_tokens(n):
+                try:
+                    n_int = int(n)
+                except Exception:
+                    return str(n)
+                if n_int >= 1_000_000:
+                    return f"{n_int / 1_000_000:.1f}M"
+                if n_int >= 1_000:
+                    return f"{n_int / 1_000:.1f}k"
+                return str(n_int)
+
+            entries = usage
+            if isinstance(entries, dict):
+                entries = [entries]
+
+            if isinstance(entries, (list, tuple)) and entries:
+                print("LLM token usage and cost:")
+                for e in entries:
+                    name = e.get("name", "N/A")
+                    model_name = e.get("model", "N/A")
+                    input_tokens = int(e.get("input", 0) or 0)
+                    output_tokens = int(e.get("output", 0) or 0)
+                    cost = float(e.get("cost", 0.0) or 0.0)
+                    print(
+                        f"{name} ({model_name}): {_fmt_tokens(input_tokens)} input, {_fmt_tokens(output_tokens)} output | Cost ${cost:.2f} | Time taken: {elapsed:.2f}s\n"
+                    )
+            else:
+                print(f"Token usage: {usage}\n")
+                print(f"Time taken: {elapsed:.2f}s")
+        except Exception:
+            print(f"Token usage: {usage}")
+            print(f"Time taken: {elapsed:.2f}s")
 
         if hasattr(response, "content"):
             return response.content
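For reference, here is a minimal standalone sketch of what the new reporting prints. The entry shape (keys name, model, input, output, cost) is inferred from the .get() calls in the diff above; the sample values and model name are invented, and the actual structure returned by chatlas's token_usage() may differ by version.

# Sketch only: replays the formatting logic added to get_llm_response.
def _fmt_tokens(n):
    try:
        n_int = int(n)
    except Exception:
        return str(n)
    if n_int >= 1_000_000:
        return f"{n_int / 1_000_000:.1f}M"
    if n_int >= 1_000:
        return f"{n_int / 1_000:.1f}k"
    return str(n_int)

# Hypothetical usage entry: keys inferred from the diff, values invented.
entry = {"name": "Anthropic", "model": "claude-sonnet", "input": 12345, "output": 987, "cost": 0.0512}
elapsed = 3.21  # seconds, as measured via time.perf_counter() in the diff

print(
    f"{entry['name']} ({entry['model']}): {_fmt_tokens(entry['input'])} input, "
    f"{_fmt_tokens(entry['output'])} output | Cost ${entry['cost']:.2f} | Time taken: {elapsed:.2f}s"
)
# Prints: Anthropic (claude-sonnet): 12.3k input, 987 output | Cost $0.05 | Time taken: 3.21s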
run-test-evaluation.sh (20 additions, 24 deletions)

@@ -1,36 +1,32 @@
 #!/bin/bash
 
-set -e # Exit immediately if a command fails
+set -e
 
-# CI fast-fail defaults (override via env)
-: "${SHINY_TEST_TIMEOUT_SECS:=10}" # App startup fast-fail (seconds)
-: "${PYTEST_PER_TEST_TIMEOUT:=60}" # Per-test timeout (seconds)
-: "${PYTEST_SUITE_TIMEOUT:=6m}" # Whole pytest run timeout
-: "${PYTEST_MAXFAIL:=1}" # Fail fast on first failure
-: "${PYTEST_XDIST_WORKERS:=auto}" # Parallel workers for pytest-xdist
+# Defaults (override via env)
+: "${SHINY_TEST_TIMEOUT_SECS:=10}"
+: "${PYTEST_PER_TEST_TIMEOUT:=60}"
+: "${PYTEST_SUITE_TIMEOUT:=6m}"
+: "${PYTEST_MAXFAIL:=1}"
+: "${PYTEST_XDIST_WORKERS:=auto}"
+: "${ATTEMPTS:=3}"
 export SHINY_TEST_TIMEOUT_SECS
 
-# Function to log with timestamp
 log_with_timestamp() {
     echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
 }
 
-# Function to cleanup hanging processes
 cleanup_processes() {
     log_with_timestamp "Cleaning up any hanging processes..."
     pkill -f "playwright" || true
     pkill -f "chromium" || true
     pkill -f "pytest" || true
 }
 
-# Set up trap to cleanup on exit
 trap cleanup_processes EXIT
 
-for i in {1..3}
-do
-    log_with_timestamp "Starting Attempt $i of 3"
+for i in $(seq 1 "$ATTEMPTS"); do
+    log_with_timestamp "Starting attempt $i of $ATTEMPTS"
 
-    # Clean up results from previous attempt to ensure a clean slate
     rm -rf results/
     mkdir -p results/
     rm -f test-results.xml
@@ -43,9 +39,8 @@ do
         --log-dir results/ \
         --log-format json
 
-    log_with_timestamp "[Attempt $i] Running Tests..."
+    log_with_timestamp "[Attempt $i] Running tests..."
     test_exit_code=0
-    # Disable exit on error just for the pytest command to check the exit code
     set +e
     timeout "$PYTEST_SUITE_TIMEOUT" pytest tests/inspect-ai/apps \
         -n "$PYTEST_XDIST_WORKERS" --dist loadfile \
@@ -57,28 +52,29 @@ do
         --timeout="$PYTEST_PER_TEST_TIMEOUT" \
         --timeout-method=signal \
         -v || test_exit_code=$?
-    # Re-enable exit on error immediately
     set -e
 
-    # Check if timeout occurred
     if [ "${test_exit_code:-0}" -eq 124 ]; then
-        log_with_timestamp "Tests timed out on attempt $i - this may indicate hanging tests"
+        log_with_timestamp "Tests timed out on attempt $i (possible hang)"
         cleanup_processes
         exit 1
     fi
 
-    # Check if tests failed and how many failures occurred
     if [ "${test_exit_code:-0}" -ne 0 ]; then
-        failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | grep -o '[0-9]*' || echo "0")
+        if [ -f test-results.xml ]; then
+            failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | grep -o '[0-9]*' || echo "0")
+        else
+            failure_count=0
+        fi
         log_with_timestamp "Found $failure_count test failures on attempt $i"
 
-        # Fail the workflow if more than 1 test failed
         if [ "$failure_count" -gt 1 ]; then
            log_with_timestamp "More than 1 test failed on attempt $i - failing CI"
            exit 1
        fi
    fi
-    log_with_timestamp "Attempt $i of 3 Succeeded"
+
+    log_with_timestamp "Attempt $i of $ATTEMPTS succeeded"
 done
 
-log_with_timestamp "All 3 evaluation and test runs passed successfully."
+log_with_timestamp "All $ATTEMPTS evaluation and test runs passed successfully."
