1919
2020@dataclass
2121class ToolExpectation :
22- """Expected tool usage with parameter validation rules."""
22+ """
23+ Expected tool usage with parameter validation rules.
24+
25+ Attributes:
26+ tool: Name of the tool that should be called (e.g., 'Bash', 'mcp__socratic-shell__consolidate')
27+ parameters: Optional validation rules for tool parameters. Can be:
28+ - Dict with 'should_contain' key for substring matching
29+ - Dict with 'should_be' key for exact matching
30+ - Direct value for simple equality check
31+ Example: {'content': {'should_contain': ['JWT', 'tokens']},
32+ 'category': 'technical-insight'}
33+ """
2334 tool : str
24- parameters : Optional [Dict [str , Any ]] = None # Parameter validation rules
35+ parameters : Optional [Dict [str , Any ]] = None
2536
2637
2738@dataclass
2839class TestCase :
29- """A complete test case loaded from YAML."""
40+ """
41+ A complete test case loaded from YAML.
42+
43+ Attributes:
44+ name: Short descriptive name for the test case
45+ description: Longer explanation of what the test validates
46+ tags: Optional list of tags for categorizing tests (e.g., ['memory', 'consolidation'])
47+ conversation: List of conversation steps to execute in order
48+ """
3049 name : str
3150 description : str
3251 tags : List [str ]
@@ -35,10 +54,25 @@ class TestCase:
3554
3655@dataclass
3756class ConversationStep :
38- """A single step in a test conversation loaded from YAML."""
57+ """
58+ A single step in a test conversation loaded from YAML.
59+
60+ Each step represents one user message and the expected LLM behavior.
61+
62+ Attributes:
63+ user_message: The message sent to the LLM in this conversation step
64+ response_should_contain: Phrases that must appear in the LLM's response
65+ Example: ['listening', 'tell me more']
66+ response_should_not_contain: Phrases that must NOT appear in the LLM's response
67+ Example: ['I'll help you set up', 'Let me check']
68+ expected_tools: Tools we expect the LLM to call (or not call) in this step.
69+ Empty list means no tools should be called.
70+ Each ToolExpectation can specify parameter validation rules.
71+ """
3972 user_message : str
40- expected_response : Dict [str , List [str ]]
41- expected_tools : List [ToolExpectation ]
73+ response_should_contain : List [str ] = None
74+ response_should_not_contain : List [str ] = None
75+ expected_tools : List [ToolExpectation ] = None
4276
4377
4478@dataclass
@@ -49,25 +83,46 @@ class TestResult:
4983 This captures what happened when we sent a message to the LLM and
5084 validated the response against our YAML test expectations. It's the
5185 "report card" for one step in a conversation test.
86+
87+ Attributes:
88+ success: Did the LLM response pass all validations?
89+ found_phrases: Required phrases that were found in response
90+ missing_phrases: Required phrases that were missing from response
91+ unexpected_phrases: Forbidden phrases that appeared in response
92+ found_tools: Expected tools that were called with correct parameters
93+ invalid_tools: Expected tools that were called with wrong parameters
94+ missing_tools: Expected tools that were not called
95+ unexpected_tools: Tools that were called but not expected
96+ (includes full tool info: {'tool': name, 'parameters': {...}})
97+ response_text: Full LLM response text for debugging
98+ response_length: Length of response in characters
5299 """
53- success : bool # Did the LLM response pass all validations?
54- found_phrases : List [str ] # Required phrases that were found in response
55- missing_phrases : List [str ] # Required phrases that were missing from response
56- unexpected_phrases : List [str ] # Forbidden phrases that appeared in response
57- found_tools : List [ToolExpectation ] # Expected tools that were called with correct parameters
58- invalid_tools : List [ToolExpectation ] # Expected tools that were called with wrong parameters
59- missing_tools : List [ToolExpectation ] # Expected tools that were not called
60- unexpected_tools : List [Dict [str , Any ]] # Tools that were called but not expected
61- response_text : str # Full LLM response text for debugging
62- response_length : int # Length of response in characters
100+ success : bool
101+ found_phrases : List [str ]
102+ missing_phrases : List [str ]
103+ unexpected_phrases : List [str ]
104+ found_tools : List [ToolExpectation ]
105+ invalid_tools : List [ToolExpectation ]
106+ missing_tools : List [ToolExpectation ]
107+ unexpected_tools : List [Dict [str , Any ]]
108+ response_text : str
109+ response_length : int
63110
64111
65112class DialecticRunner :
66- """YAML-based test runner for prompt engineering validation."""
113+ """
114+ YAML-based test runner for prompt engineering validation.
115+
116+ Attributes:
117+ results: Accumulates TestResult objects from all conversation steps executed.
118+ Used for final summary statistics and debugging.
119+ """
120+
121+ results : List [TestResult ]
67122
68123 def __init__ (self ):
69124 """Initialize the test runner."""
70- self .results : List [ TestResult ] = []
125+ self .results = []
71126
72127 def load_test_case (self , yaml_path : Path ) -> TestCase :
73128 """Load a test case from a YAML file."""
@@ -93,7 +148,8 @@ def load_test_case(self, yaml_path: Path) -> TestCase:
93148
94149 step = ConversationStep (
95150 user_message = step_data ['user' ],
96- expected_response = expected_response ,
151+ response_should_contain = expected_response .get ('should_contain' , []),
152+ response_should_not_contain = expected_response .get ('should_not_contain' , []),
97153 expected_tools = expected_tools
98154 )
99155 conversation .append (step )
@@ -169,15 +225,12 @@ async def run_conversation_step(self, step: ConversationStep) -> TestResult:
169225 print (f"🤖 Assistant: { response_text [:200 ]} { '...' if len (response_text ) > 200 else '' } " )
170226
171227 # Validate response content
172- should_contain = step .expected_response .get ('should_contain' , [])
173- should_not_contain = step .expected_response .get ('should_not_contain' , [])
174-
175228 found_phrases = []
176229 missing_phrases = []
177230 unexpected_phrases = []
178231
179232 # Check required phrases
180- for phrase in should_contain :
233+ for phrase in ( step . response_should_contain or []) :
181234 if phrase .lower () in response_text .lower ():
182235 found_phrases .append (phrase )
183236 print (f"✅ Found required: '{ phrase } '" )
@@ -186,7 +239,7 @@ async def run_conversation_step(self, step: ConversationStep) -> TestResult:
186239 print (f"❌ Missing required: '{ phrase } '" )
187240
188241 # Check forbidden phrases
189- for phrase in should_not_contain :
242+ for phrase in ( step . response_should_not_contain or []) :
190243 if phrase .lower () in response_text .lower ():
191244 unexpected_phrases .append (phrase )
192245 print (f"❌ Found forbidden: '{ phrase } '" )
0 commit comments