
Commit 50739f9

Add LLM token usage reporting and improve test script
Enhanced ShinyTestGenerator to print LLM token usage, cost, and elapsed time after each chat call. Updated run-test-evaluation.sh to allow a configurable number of attempts, improved logging, and performed minor cleanup for better maintainability and clarity.
Parent: 7d420e7

2 files changed: +59 additions, -25 deletions
shiny/pytest/_generate/_main.py (39 additions, 1 deletion)

@@ -3,11 +3,12 @@
 import os
 import re
 import sys
+import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Literal, Optional, Tuple, Union
 
-from chatlas import ChatAnthropic, ChatOpenAI
+from chatlas import ChatAnthropic, ChatOpenAI, token_usage
 from dotenv import load_dotenv
 
 __all__ = [
@@ -196,7 +197,44 @@ def get_llm_response(self, prompt: str, model: Optional[str] = None) -> str:
         else:
             raise ValueError(f"Unsupported provider: {self.provider}")
 
+        start_time = time.perf_counter()
         response = chat.chat(prompt)
+        elapsed = time.perf_counter() - start_time
+        usage = token_usage()
+        try:
+
+            def _fmt_tokens(n):
+                try:
+                    n_int = int(n)
+                except Exception:
+                    return str(n)
+                if n_int >= 1_000_000:
+                    return f"{n_int / 1_000_000:.1f}M"
+                if n_int >= 1_000:
+                    return f"{n_int / 1_000:.1f}k"
+                return str(n_int)
+
+            entries = usage
+            if isinstance(entries, dict):
+                entries = [entries]
+
+            if isinstance(entries, (list, tuple)) and entries:
+                print("LLM token usage and cost:")
+                for e in entries:
+                    name = e.get("name", "N/A")
+                    model_name = e.get("model", "N/A")
+                    input_tokens = int(e.get("input", 0) or 0)
+                    output_tokens = int(e.get("output", 0) or 0)
+                    cost = float(e.get("cost", 0.0) or 0.0)
+                    print(
+                        f"{name} ({model_name}): {_fmt_tokens(input_tokens)} input, {_fmt_tokens(output_tokens)} output | Cost ${cost:.2f} | Time taken: {elapsed:.2f}s\n"
+                    )
+            else:
+                print(f"Token usage: {usage}\n")
+                print(f"Time taken: {elapsed:.2f}s")
+        except Exception:
+            print(f"Token usage: {usage}")
+            print(f"Time taken: {elapsed:.2f}s")
 
         if hasattr(response, "content"):
             return response.content
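For reference, here is a minimal standalone sketch of what the new reporting prints. The entry shape (keys name, model, input, output, cost) is inferred from the .get() calls in the diff above; the sample values and model name are invented, and the actual structure returned by chatlas's token_usage() may differ by version.

# Sketch only: replays the formatting logic added to get_llm_response.
def _fmt_tokens(n):
    try:
        n_int = int(n)
    except Exception:
        return str(n)
    if n_int >= 1_000_000:
        return f"{n_int / 1_000_000:.1f}M"
    if n_int >= 1_000:
        return f"{n_int / 1_000:.1f}k"
    return str(n_int)

# Hypothetical usage entry: keys inferred from the diff, values invented.
entry = {"name": "Anthropic", "model": "claude-sonnet", "input": 12345, "output": 987, "cost": 0.0512}
elapsed = 3.21  # seconds, as measured via time.perf_counter() in the diff

print(
    f"{entry['name']} ({entry['model']}): {_fmt_tokens(entry['input'])} input, "
    f"{_fmt_tokens(entry['output'])} output | Cost ${entry['cost']:.2f} | Time taken: {elapsed:.2f}s"
)
# Prints: Anthropic (claude-sonnet): 12.3k input, 987 output | Cost $0.05 | Time taken: 3.21s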
run-test-evaluation.sh (20 additions, 24 deletions)

@@ -1,36 +1,32 @@
 #!/bin/bash
 
-set -e # Exit immediately if a command fails
+set -e
 
-# CI fast-fail defaults (override via env)
-: "${SHINY_TEST_TIMEOUT_SECS:=10}" # App startup fast-fail (seconds)
-: "${PYTEST_PER_TEST_TIMEOUT:=60}" # Per-test timeout (seconds)
-: "${PYTEST_SUITE_TIMEOUT:=6m}" # Whole pytest run timeout
-: "${PYTEST_MAXFAIL:=1}" # Fail fast on first failure
-: "${PYTEST_XDIST_WORKERS:=auto}" # Parallel workers for pytest-xdist
+# Defaults (override via env)
+: "${SHINY_TEST_TIMEOUT_SECS:=10}"
+: "${PYTEST_PER_TEST_TIMEOUT:=60}"
+: "${PYTEST_SUITE_TIMEOUT:=6m}"
+: "${PYTEST_MAXFAIL:=1}"
+: "${PYTEST_XDIST_WORKERS:=auto}"
+: "${ATTEMPTS:=3}"
 export SHINY_TEST_TIMEOUT_SECS
 
-# Function to log with timestamp
 log_with_timestamp() {
     echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
 }
 
-# Function to cleanup hanging processes
 cleanup_processes() {
     log_with_timestamp "Cleaning up any hanging processes..."
     pkill -f "playwright" || true
     pkill -f "chromium" || true
     pkill -f "pytest" || true
 }
 
-# Set up trap to cleanup on exit
 trap cleanup_processes EXIT
 
-for i in {1..3}
-do
-    log_with_timestamp "Starting Attempt $i of 3"
+for i in $(seq 1 "$ATTEMPTS"); do
+    log_with_timestamp "Starting attempt $i of $ATTEMPTS"
 
-    # Clean up results from previous attempt to ensure a clean slate
     rm -rf results/
     mkdir -p results/
     rm -f test-results.xml
@@ -43,9 +39,8 @@ do
         --log-dir results/ \
         --log-format json
 
-    log_with_timestamp "[Attempt $i] Running Tests..."
+    log_with_timestamp "[Attempt $i] Running tests..."
     test_exit_code=0
-    # Disable exit on error just for the pytest command to check the exit code
     set +e
     timeout "$PYTEST_SUITE_TIMEOUT" pytest tests/inspect-ai/apps \
         -n "$PYTEST_XDIST_WORKERS" --dist loadfile \
@@ -57,28 +52,29 @@ do
         --timeout="$PYTEST_PER_TEST_TIMEOUT" \
         --timeout-method=signal \
         -v || test_exit_code=$?
-    # Re-enable exit on error immediately
     set -e
 
-    # Check if timeout occurred
     if [ "${test_exit_code:-0}" -eq 124 ]; then
-        log_with_timestamp "Tests timed out on attempt $i - this may indicate hanging tests"
+        log_with_timestamp "Tests timed out on attempt $i (possible hang)"
         cleanup_processes
         exit 1
     fi
 
-    # Check if tests failed and how many failures occurred
     if [ "${test_exit_code:-0}" -ne 0 ]; then
-        failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | grep -o '[0-9]*' || echo "0")
+        if [ -f test-results.xml ]; then
+            failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | grep -o '[0-9]*' || echo "0")
+        else
+            failure_count=0
+        fi
         log_with_timestamp "Found $failure_count test failures on attempt $i"
 
-        # Fail the workflow if more than 1 test failed
         if [ "$failure_count" -gt 1 ]; then
            log_with_timestamp "More than 1 test failed on attempt $i - failing CI"
            exit 1
        fi
    fi
-    log_with_timestamp "Attempt $i of 3 Succeeded"
+
+    log_with_timestamp "Attempt $i of $ATTEMPTS succeeded"
 done
 
-log_with_timestamp "All 3 evaluation and test runs passed successfully."
+log_with_timestamp "All $ATTEMPTS evaluation and test runs passed successfully."
