@@ -25,7 +25,7 @@ def get_app_specific_instructions(app_name: str) -> str:
2525 - Whether the test creates an instance of the InputSlider controller with id "my_plot_module-n_points"
2626 - Ensure that the slider component is verified for its label, min, max, and value attributes.
2727 - Ensure that the test checks by moving the slider to different values and verify the slider values accordingly
28-
28+
2929 IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
3030 """ ,
3131 "app_07_modules" : """
@@ -35,15 +35,15 @@ def get_app_specific_instructions(app_name: str) -> str:
3535 - Ensure that the text inputs are verified for their labels and initial values.
3636 - Ensure that the test checks the text output for correct concatenation of input values.
3737 - Check that the test verifies the module's reactivity by changing input values and checking output
38-
38+
3939 IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
4040 """ ,
4141 "app_03_slider" : """
4242 For this slider app, focus on components that exist in the app code:
4343 - Whether the test creates an instance of the InputSlider controller with id "slider1"
4444 - Ensure that the slider component is verified for its label, min, max, and value attributes.
4545 - Ensure that the test checks by moving the slider to different values and verify the slider values accordingly.
46-
46+
4747 IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
4848 """ ,
4949 "app_06_R_shiny" : """
@@ -73,7 +73,7 @@ def get_app_specific_instructions(app_name: str) -> str:
7373 - Ensure that the output text components are verified for their initial values and updated values based on user interactions.
7474 - Whether the test creates an instance of the OutputDataFrame controller with id "data_grid"
7575 - Ensure that the data grid component is verified for its initial state and updates correctly based on user interactions.
76-
76+
7777 IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code. The test should only test functionality that is actually present in the app.
7878 """ ,
7979 "app_02_express_basic" : """
@@ -83,7 +83,6 @@ def get_app_specific_instructions(app_name: str) -> str:
8383 - Ensure that the test checks the action button state changes and verifies the output text accordingly.
8484 - Ensure that the test creates an instance of the OutputText controller with id "click_counts"
8585 - Ensure that the output text component is verified for its initial value and updated values based on button clicks.
86- - Ensure that the test checks the click counts for each button and verifies the output text accordingly.
8786 - Ensure that the test creates instances of the InputActionButton controller with ids "btn2" and "btn3"
8887 - Ensure that the disabled button with icon is verified for its label and icon.
8988 - Ensure that the styled button is verified for its label and custom styles.
@@ -133,59 +132,68 @@ def extract_component_ids(app_code: str) -> dict:
133132 Returns:
134133 Dictionary with component types as keys and lists of IDs as values
135134 """
136- component_ids = {
137- "input" : [],
138- "output" : [],
139- }
135+ input_ids = set ()
136+ output_ids = set ()
140137
141- patterns = {
142- # Standard ui.input_* and ui.output_* with ID as first arg
143- "ui_input" : r"ui\.input_\w+\(\s*['\"]([^'\"]+)['\"]|ui\.input_\w+\(\s*id\s*=\s*['\"]([^'\"]+)['\"])" , # Both positional and named 'id' param
144- "ui_output" : r"ui\.output_\w+\(\s*['\"]([^'\"]+)['\"]|ui\.output_\w+\(\s*id\s*=\s*['\"]([^'\"]+)['\"])" , # Both positional and named 'id' param
145- # Shiny express syntax
146- "express_input" : r"input\.([\w_]+)\(\)" , # input.name() references
147- "express_output" : r"@render\.[\w_]+\s+def\s+([\w_]+)\(" , # @render.* def name(
148- # Module IDs with instantiation
149- "module_id" : r"\w+_\w+\(['\"]([^'\"]+)['\"])" , # module_name("id")
150- # Nav panels, tabs and similar
151- "ui_nav" : r"ui\.nav[\w_]*\(\s*['\"]([^'\"]+)['\"]|ui\.navset_\w+\(.*?id\s*=\s*['\"]([^'\"]+)['\"])" , # ui.nav* or ui.navset_* with id param
152- }
138+ # 1. Find input components (ui.input_*)
139+ try :
140+ input_matches = re .findall (
141+ r'ui\.input_\w+\(\s*(?:id\s*=\s*)?["\']([^"\']+)["\']' , app_code
142+ )
143+ input_ids .update (input_matches )
144+ except re .error :
145+ pass
146+
147+ # 2. Find output components (ui.output_*)
148+ try :
149+ output_matches = re .findall (
150+ r'ui\.output_\w+\(\s*(?:id\s*=\s*)?["\']([^"\']+)["\']' , app_code
151+ )
152+ output_ids .update (output_matches )
153+ except re .error :
154+ pass
155+
156+ # 3. Find input references (input.name())
157+ try :
158+ input_refs = re .findall (r"input\.([\w_]+)\(\)" , app_code )
159+ input_ids .update (input_refs )
160+ except re .error :
161+ pass
162+
163+ # 4. Find @render.* definitions
164+ try :
165+ render_defs = re .findall (r"@render\.\w+\s+def\s+([\w_]+)\s*\(" , app_code )
166+ output_ids .update (render_defs )
167+ except re .error :
168+ pass
169+
170+ # 5. Find @output wrapped definitions
171+ try :
172+ output_defs = re .findall (r"@output\s+def\s+([\w_]+)\s*\(" , app_code )
173+ output_ids .update (output_defs )
174+ except re .error :
175+ pass
176+
177+ # 6. Find module instantiations
178+ try :
179+ module_ids = re .findall (
180+ r'\w+_\w+_(?:ui|server)\(\s*["\']([^"\']+)["\']' , app_code
181+ )
182+ input_ids .update (module_ids )
183+ output_ids .update (module_ids )
184+ except re .error :
185+ pass
186+
187+ # 7. Find navset components
188+ try :
189+ nav_ids = re .findall (
190+ r'ui\.navset_\w+\(.*?id\s*=\s*["\']([^"\']+)["\']' , app_code
191+ )
192+ input_ids .update (nav_ids )
193+ except re .error :
194+ pass
153195
154- # Process each pattern type
155- for pattern_type , pattern in patterns .items ():
156- # Find all matches of the pattern
157- matches = re .findall (pattern , app_code )
158-
159- # Flatten tuple results if any and filter out empty matches
160- flattened_matches = []
161- for match in matches :
162- if isinstance (match , tuple ):
163- # Add all non-empty groups from the tuple
164- for m in match :
165- if m :
166- flattened_matches .append (m )
167- elif match : # Single string match
168- flattened_matches .append (match )
169-
170- # Add to appropriate category
171- if pattern_type .startswith ("ui_input" ) or pattern_type .startswith (
172- "express_input"
173- ):
174- component_ids ["input" ].extend (flattened_matches )
175- elif pattern_type .startswith ("ui_output" ) or pattern_type .startswith (
176- "express_output"
177- ):
178- component_ids ["output" ].extend (flattened_matches )
179- else : # Other types (nav, module, etc.)
180- # These could go in either category or a new one, but we'll add to both
181- component_ids ["input" ].extend (flattened_matches )
182- component_ids ["output" ].extend (flattened_matches )
183-
184- # Remove duplicates while preserving order
185- component_ids ["input" ] = list (dict .fromkeys (component_ids ["input" ]))
186- component_ids ["output" ] = list (dict .fromkeys (component_ids ["output" ]))
187-
188- return component_ids
196+ return {"input" : sorted (list (input_ids )), "output" : sorted (list (output_ids ))}
189197
190198
191199def create_inspect_ai_samples (test_data : dict ) -> list [Sample ]:
@@ -203,16 +211,14 @@ def create_inspect_ai_samples(test_data: dict) -> list[Sample]:
203211 for test_name , data in test_data .items ():
204212 app_specific_guidance = get_app_specific_instructions (data ["app_name" ])
205213
206- # Extract component IDs from app code to help with evaluation
207214 component_ids = extract_component_ids (data ["app_code" ])
208215 component_ids_str = "\n " .join (
209216 [f"{ k .title ()} IDs: { ', ' .join (v )} " for k , v in component_ids .items () if v ]
210217 )
211218
212- # The question should be clear about what we're evaluating
213219 question = f"""Evaluate the quality of this Shiny test code for app { data ['app_name' ]} .
214220
215- IMPORTANT: First carefully analyze the App Code below to understand what components and IDs actually exist in the app.
221+ IMPORTANT: First carefully analyze the App Code below to understand what components and IDs actually exist in the app.
216222Then evaluate the test code ONLY against components and IDs that actually exist in the app code.
217223
218224Actual Component IDs automatically detected in App:
@@ -261,7 +267,6 @@ def shiny_test_evaluation() -> Task:
261267 """
262268 Inspect AI task for evaluating generated Shiny tests.
263269 """
264- # Load test data from the JSON file
265270 script_dir = Path (__file__ ).parent # Current script directory
266271 metadata_file = script_dir / "test_metadata.json"
267272 with open (metadata_file , "r" ) as f :
@@ -289,8 +294,8 @@ def shiny_test_evaluation() -> Task:
289294 - ONLY evaluate criteria for components that actually exist in the app code
290295 - COMPLETELY IGNORE criteria about components that don't exist in the app
291296 - Grade based ONLY on how well the test code tests the components that actually exist
292-
293- MOST IMPORTANT:
297+
298+ MOST IMPORTANT:
294299 - If the app does not contain a component mentioned in the criteria, IGNORE that part of the criteria completely
295300 - If the app uses a different ID than what's in the criteria (e.g., "data_grid" instead of "data_table"), use the actual ID from the app
296301
0 commit comments