Skip to content

Commit bc440c5

Browse files
committed
Refactor component ID extraction and clean up evaluation script
Simplifies and improves the `extract_component_ids` function by using direct regex searches and sets for input/output IDs, replacing the previous pattern dictionary and tuple-flattening logic. Also removes redundant comments and whitespace, and clarifies the evaluation instructions to focus on the components and IDs that actually exist in the app.
1 parent 1ca24c1 commit bc440c5

File tree

1 file changed

+67 -62 lines changed

tests/inspect-ai/scripts/evaluation.py

Lines changed: 67 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ def get_app_specific_instructions(app_name: str) -> str:
2525
- Whether the test creates an instance of the InputSlider controller with id "my_plot_module-n_points"
2626
- Ensure that the slider component is verified for its label, min, max, and value attributes.
2727
- Ensure that the test checks by moving the slider to different values and verify the slider values accordingly
28-
28+
2929
IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
3030
""",
3131
"app_07_modules": """
@@ -35,15 +35,15 @@ def get_app_specific_instructions(app_name: str) -> str:
3535
- Ensure that the text inputs are verified for their labels and initial values.
3636
- Ensure that the test checks the text output for correct concatenation of input values.
3737
- Check that the test verifies the module's reactivity by changing input values and checking output
38-
38+
3939
IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
4040
""",
4141
"app_03_slider": """
4242
For this slider app, focus on components that exist in the app code:
4343
- Whether the test creates an instance of the InputSlider controller with id "slider1"
4444
- Ensure that the slider component is verified for its label, min, max, and value attributes.
4545
- Ensure that the test checks by moving the slider to different values and verify the slider values accordingly.
46-
46+
4747
IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
4848
""",
4949
"app_06_R_shiny": """
@@ -73,7 +73,7 @@ def get_app_specific_instructions(app_name: str) -> str:
7373
- Ensure that the output text components are verified for their initial values and updated values based on user interactions.
7474
- Whether the test creates an instance of the OutputDataFrame controller with id "data_grid"
7575
- Ensure that the data grid component is verified for its initial state and updates correctly based on user interactions.
76-
76+
7777
IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code. The test should only test functionality that is actually present in the app.
7878
""",
7979
"app_02_express_basic": """
@@ -83,7 +83,6 @@ def get_app_specific_instructions(app_name: str) -> str:
8383
- Ensure that the test checks the action button state changes and verifies the output text accordingly.
8484
- Ensure that the test creates an instance of the OutputText controller with id "click_counts"
8585
- Ensure that the output text component is verified for its initial value and updated values based on button clicks.
86-
- Ensure that the test checks the click counts for each button and verifies the output text accordingly.
8786
- Ensure that the test creates instances of the InputActionButton controller with ids "btn2" and "btn3"
8887
- Ensure that the disabled button with icon is verified for its label and icon.
8988
- Ensure that the styled button is verified for its label and custom styles.
@@ -133,59 +132,68 @@ def extract_component_ids(app_code: str) -> dict:
133132
Returns:
134133
Dictionary with component types as keys and lists of IDs as values
135134
"""
136-
component_ids = {
137-
"input": [],
138-
"output": [],
139-
}
135+
input_ids = set()
136+
output_ids = set()
140137

141-
patterns = {
142-
# Standard ui.input_* and ui.output_* with ID as first arg
143-
"ui_input": r"ui\.input_\w+\(\s*['\"]([^'\"]+)['\"]|ui\.input_\w+\(\s*id\s*=\s*['\"]([^'\"]+)['\"])", # Both positional and named 'id' param
144-
"ui_output": r"ui\.output_\w+\(\s*['\"]([^'\"]+)['\"]|ui\.output_\w+\(\s*id\s*=\s*['\"]([^'\"]+)['\"])", # Both positional and named 'id' param
145-
# Shiny express syntax
146-
"express_input": r"input\.([\w_]+)\(\)", # input.name() references
147-
"express_output": r"@render\.[\w_]+\s+def\s+([\w_]+)\(", # @render.* def name(
148-
# Module IDs with instantiation
149-
"module_id": r"\w+_\w+\(['\"]([^'\"]+)['\"])", # module_name("id")
150-
# Nav panels, tabs and similar
151-
"ui_nav": r"ui\.nav[\w_]*\(\s*['\"]([^'\"]+)['\"]|ui\.navset_\w+\(.*?id\s*=\s*['\"]([^'\"]+)['\"])", # ui.nav* or ui.navset_* with id param
152-
}
138+
# 1. Find input components (ui.input_*)
139+
try:
140+
input_matches = re.findall(
141+
r'ui\.input_\w+\(\s*(?:id\s*=\s*)?["\']([^"\']+)["\']', app_code
142+
)
143+
input_ids.update(input_matches)
144+
except re.error:
145+
pass
146+
147+
# 2. Find output components (ui.output_*)
148+
try:
149+
output_matches = re.findall(
150+
r'ui\.output_\w+\(\s*(?:id\s*=\s*)?["\']([^"\']+)["\']', app_code
151+
)
152+
output_ids.update(output_matches)
153+
except re.error:
154+
pass
155+
156+
# 3. Find input references (input.name())
157+
try:
158+
input_refs = re.findall(r"input\.([\w_]+)\(\)", app_code)
159+
input_ids.update(input_refs)
160+
except re.error:
161+
pass
162+
163+
# 4. Find @render.* definitions
164+
try:
165+
render_defs = re.findall(r"@render\.\w+\s+def\s+([\w_]+)\s*\(", app_code)
166+
output_ids.update(render_defs)
167+
except re.error:
168+
pass
169+
170+
# 5. Find @output wrapped definitions
171+
try:
172+
output_defs = re.findall(r"@output\s+def\s+([\w_]+)\s*\(", app_code)
173+
output_ids.update(output_defs)
174+
except re.error:
175+
pass
176+
177+
# 6. Find module instantiations
178+
try:
179+
module_ids = re.findall(
180+
r'\w+_\w+_(?:ui|server)\(\s*["\']([^"\']+)["\']', app_code
181+
)
182+
input_ids.update(module_ids)
183+
output_ids.update(module_ids)
184+
except re.error:
185+
pass
186+
187+
# 7. Find navset components
188+
try:
189+
nav_ids = re.findall(
190+
r'ui\.navset_\w+\(.*?id\s*=\s*["\']([^"\']+)["\']', app_code
191+
)
192+
input_ids.update(nav_ids)
193+
except re.error:
194+
pass
153195

154-
# Process each pattern type
155-
for pattern_type, pattern in patterns.items():
156-
# Find all matches of the pattern
157-
matches = re.findall(pattern, app_code)
158-
159-
# Flatten tuple results if any and filter out empty matches
160-
flattened_matches = []
161-
for match in matches:
162-
if isinstance(match, tuple):
163-
# Add all non-empty groups from the tuple
164-
for m in match:
165-
if m:
166-
flattened_matches.append(m)
167-
elif match: # Single string match
168-
flattened_matches.append(match)
169-
170-
# Add to appropriate category
171-
if pattern_type.startswith("ui_input") or pattern_type.startswith(
172-
"express_input"
173-
):
174-
component_ids["input"].extend(flattened_matches)
175-
elif pattern_type.startswith("ui_output") or pattern_type.startswith(
176-
"express_output"
177-
):
178-
component_ids["output"].extend(flattened_matches)
179-
else: # Other types (nav, module, etc.)
180-
# These could go in either category or a new one, but we'll add to both
181-
component_ids["input"].extend(flattened_matches)
182-
component_ids["output"].extend(flattened_matches)
183-
184-
# Remove duplicates while preserving order
185-
component_ids["input"] = list(dict.fromkeys(component_ids["input"]))
186-
component_ids["output"] = list(dict.fromkeys(component_ids["output"]))
187-
188-
return component_ids
196+
return {"input": sorted(list(input_ids)), "output": sorted(list(output_ids))}
189197

190198

191199
def create_inspect_ai_samples(test_data: dict) -> list[Sample]:
@@ -203,16 +211,14 @@ def create_inspect_ai_samples(test_data: dict) -> list[Sample]:
203211
for test_name, data in test_data.items():
204212
app_specific_guidance = get_app_specific_instructions(data["app_name"])
205213

206-
# Extract component IDs from app code to help with evaluation
207214
component_ids = extract_component_ids(data["app_code"])
208215
component_ids_str = "\n".join(
209216
[f"{k.title()} IDs: {', '.join(v)}" for k, v in component_ids.items() if v]
210217
)
211218

212-
# The question should be clear about what we're evaluating
213219
question = f"""Evaluate the quality of this Shiny test code for app {data['app_name']}.
214220
215-
IMPORTANT: First carefully analyze the App Code below to understand what components and IDs actually exist in the app.
221+
IMPORTANT: First carefully analyze the App Code below to understand what components and IDs actually exist in the app.
216222
Then evaluate the test code ONLY against components and IDs that actually exist in the app code.
217223
218224
Actual Component IDs automatically detected in App:
@@ -261,7 +267,6 @@ def shiny_test_evaluation() -> Task:
261267
"""
262268
Inspect AI task for evaluating generated Shiny tests.
263269
"""
264-
# Load test data from the JSON file
265270
script_dir = Path(__file__).parent # Current script directory
266271
metadata_file = script_dir / "test_metadata.json"
267272
with open(metadata_file, "r") as f:
@@ -289,8 +294,8 @@ def shiny_test_evaluation() -> Task:
289294
- ONLY evaluate criteria for components that actually exist in the app code
290295
- COMPLETELY IGNORE criteria about components that don't exist in the app
291296
- Grade based ONLY on how well the test code tests the components that actually exist
292-
293-
MOST IMPORTANT:
297+
298+
MOST IMPORTANT:
294299
- If the app does not contain a component mentioned in the criteria, IGNORE that part of the criteria completely
295300
- If the app uses a different ID than what's in the criteria (e.g., "data_grid" instead of "data_table"), use the actual ID from the app
296301

0 commit comments

Comments
 (0)