Improve test execution flow and output visibility

nikomatsakis · claude · nikomatsakis · commit 78e47cb418b1 · 2025-07-05T12:23:38.000-04:00
Test runner improvements:

- Add fail-fast behavior: stop executing the remaining conversation steps when one fails
- Implement real-time streaming display of the response instead of a truncated preview
- Show tool calls on separate lines with full parameter details
- Add a clear indication of skipped steps when a test fails early

These changes make test execution more efficient and provide better visibility into what Claude is actually doing during tests. The fail-fast approach prevents testing invalid conversation states, and streaming output helps debug failures.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/dialectic/dialectic.py b/dialectic/dialectic.py
@@ -211,18 +211,24 @@ async def run_conversation_step(self, step: ConversationStep) -> TestResult:
         response_text = ""
         tools_used = []
         
+        print(f"🤖 Assistant: ", end="", flush=True)
         async for message in query(prompt=step.user_message):
             if isinstance(message, AssistantMessage):
                 for block in message.content:
                     if isinstance(block, TextBlock):
                         response_text += block.text
+                        print(block.text, end="", flush=True)
                     elif isinstance(block, ToolUseBlock):
                         tools_used.append({
                             'tool': block.name,
                             'parameters': block.input
                         })
+                        print(f"\n🔧 Tool: {block.name}")
+                        if block.input:
+                            print(f"   Parameters: {block.input}")
+                        print(f"🤖 Assistant: ", end="", flush=True)
         
-        print(f"🤖 Assistant: {response_text[:200]}{'...' if len(response_text) > 200 else ''}")
+        print()  # New line after streaming response
         
         # Validate response content
         found_phrases = []
@@ -322,6 +328,12 @@ async def run_test_case(self, test_case: TestCase) -> bool:
             else:
                 print(f"❌ Step {i} FAILED")
                 all_steps_passed = False
+                
+                # Stop executing remaining steps - conversation state is now wrong
+                remaining_steps = len(test_case.conversation) - i
+                if remaining_steps > 0:
+                    print(f"⏭️  Skipping {remaining_steps} remaining step(s) due to failure")
+                break
         
         print(f"\n🎯 Test Case Result: {'PASSED' if all_steps_passed else 'FAILED'}")
         return all_steps_passed