Refactor component ID extraction and clean up evaluation script

karangattu · karangattu · commit bc440c5876c7 · 2025-08-21T16:27:14.000-07:00
Simplifies and improves the extract_component_ids function by using direct regex searches and sets for input/output IDs, replacing the previous pattern dictionary and tuple flattening logic. Also removes redundant comments and whitespace, and clarifies evaluation instructions to focus on actual app components and IDs.
diff --git a/tests/inspect-ai/scripts/evaluation.py b/tests/inspect-ai/scripts/evaluation.py
@@ -25,7 +25,7 @@ def get_app_specific_instructions(app_name: str) -> str:
         - Whether the test creates an instance of the InputSlider controller with id "my_plot_module-n_points"
         - Ensure that the slider component is verified for its label, min, max, and value attributes.
         - Ensure that the test checks by moving the slider to different values and verify the slider values accordingly
-
+        
         IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
         """,
         "app_07_modules": """
@@ -35,15 +35,15 @@ def get_app_specific_instructions(app_name: str) -> str:
         - Ensure that the text inputs are verified for their labels and initial values.
         - Ensure that the test checks the text output for correct concatenation of input values.
         - Check that the test verifies the module's reactivity by changing input values and checking output
-
+        
         IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
         """,
         "app_03_slider": """
         For this slider app, focus on components that exist in the app code:
         - Whether the test creates an instance of the InputSlider controller with id "slider1"
         - Ensure that the slider component is verified for its label, min, max, and value attributes.
         - Ensure that the test checks by moving the slider to different values and verify the slider values accordingly.
-
+        
         IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code.
         """,
         "app_06_R_shiny": """
@@ -73,7 +73,7 @@ def get_app_specific_instructions(app_name: str) -> str:
         - Ensure that the output text components are verified for their initial values and updated values based on user interactions.
         - Whether the test creates an instance of the OutputDataFrame controller with id "data_grid"
         - Ensure that the data grid component is verified for its initial state and updates correctly based on user interactions.
-
+        
         IMPORTANT: Only evaluate based on components and IDs that actually exist in the app code. The test should only test functionality that is actually present in the app.
         """,
         "app_02_express_basic": """
@@ -83,7 +83,6 @@ def get_app_specific_instructions(app_name: str) -> str:
         - Ensure that the test checks the action button state changes and verifies the output text accordingly.
         - Ensure that the test creates an instance of the OutputText controller with id "click_counts"
         - Ensure that the output text component is verified for its initial value and updated values based on button clicks.
-        - Ensure that the test checks the click counts for each button and verifies the output text accordingly.
         - Ensure that the test creates instances of the InputActionButton controller with ids "btn2" and "btn3"
         - Ensure that the disabled button with icon is verified for its label and icon.
         - Ensure that the styled button is verified for its label and custom styles.
@@ -133,59 +132,68 @@ def extract_component_ids(app_code: str) -> dict:
     Returns:
         Dictionary with component types as keys and lists of IDs as values
     """
-    component_ids = {
-        "input": [],
-        "output": [],
-    }
+    input_ids = set()
+    output_ids = set()
 
-    patterns = {
-        # Standard ui.input_* and ui.output_* with ID as first arg
-        "ui_input": r"ui\.input_\w+\(\s*['\"]([^'\"]+)['\"]|ui\.input_\w+\(\s*id\s*=\s*['\"]([^'\"]+)['\"])",  # Both positional and named 'id' param
-        "ui_output": r"ui\.output_\w+\(\s*['\"]([^'\"]+)['\"]|ui\.output_\w+\(\s*id\s*=\s*['\"]([^'\"]+)['\"])",  # Both positional and named 'id' param
-        # Shiny express syntax
-        "express_input": r"input\.([\w_]+)\(\)",  # input.name() references
-        "express_output": r"@render\.[\w_]+\s+def\s+([\w_]+)\(",  # @render.* def name(
-        # Module IDs with instantiation
-        "module_id": r"\w+_\w+\(['\"]([^'\"]+)['\"])",  # module_name("id")
-        # Nav panels, tabs and similar
-        "ui_nav": r"ui\.nav[\w_]*\(\s*['\"]([^'\"]+)['\"]|ui\.navset_\w+\(.*?id\s*=\s*['\"]([^'\"]+)['\"])",  # ui.nav* or ui.navset_* with id param
-    }
+    # 1. Find input components (ui.input_*)
+    try:
+        input_matches = re.findall(
+            r'ui\.input_\w+\(\s*(?:id\s*=\s*)?["\']([^"\']+)["\']', app_code
+        )
+        input_ids.update(input_matches)
+    except re.error:
+        pass
+
+    # 2. Find output components (ui.output_*)
+    try:
+        output_matches = re.findall(
+            r'ui\.output_\w+\(\s*(?:id\s*=\s*)?["\']([^"\']+)["\']', app_code
+        )
+        output_ids.update(output_matches)
+    except re.error:
+        pass
+
+    # 3. Find input references (input.name())
+    try:
+        input_refs = re.findall(r"input\.([\w_]+)\(\)", app_code)
+        input_ids.update(input_refs)
+    except re.error:
+        pass
+
+    # 4. Find @render.* definitions
+    try:
+        render_defs = re.findall(r"@render\.\w+\s+def\s+([\w_]+)\s*\(", app_code)
+        output_ids.update(render_defs)
+    except re.error:
+        pass
+
+    # 5. Find @output wrapped definitions
+    try:
+        output_defs = re.findall(r"@output\s+def\s+([\w_]+)\s*\(", app_code)
+        output_ids.update(output_defs)
+    except re.error:
+        pass
+
+    # 6. Find module instantiations
+    try:
+        module_ids = re.findall(
+            r'\w+_\w+_(?:ui|server)\(\s*["\']([^"\']+)["\']', app_code
+        )
+        input_ids.update(module_ids)
+        output_ids.update(module_ids)
+    except re.error:
+        pass
+
+    # 7. Find navset components
+    try:
+        nav_ids = re.findall(
+            r'ui\.navset_\w+\(.*?id\s*=\s*["\']([^"\']+)["\']', app_code
+        )
+        input_ids.update(nav_ids)
+    except re.error:
+        pass
 
-    # Process each pattern type
-    for pattern_type, pattern in patterns.items():
-        # Find all matches of the pattern
-        matches = re.findall(pattern, app_code)
-
-        # Flatten tuple results if any and filter out empty matches
-        flattened_matches = []
-        for match in matches:
-            if isinstance(match, tuple):
-                # Add all non-empty groups from the tuple
-                for m in match:
-                    if m:
-                        flattened_matches.append(m)
-            elif match:  # Single string match
-                flattened_matches.append(match)
-
-        # Add to appropriate category
-        if pattern_type.startswith("ui_input") or pattern_type.startswith(
-            "express_input"
-        ):
-            component_ids["input"].extend(flattened_matches)
-        elif pattern_type.startswith("ui_output") or pattern_type.startswith(
-            "express_output"
-        ):
-            component_ids["output"].extend(flattened_matches)
-        else:  # Other types (nav, module, etc.)
-            # These could go in either category or a new one, but we'll add to both
-            component_ids["input"].extend(flattened_matches)
-            component_ids["output"].extend(flattened_matches)
-
-    # Remove duplicates while preserving order
-    component_ids["input"] = list(dict.fromkeys(component_ids["input"]))
-    component_ids["output"] = list(dict.fromkeys(component_ids["output"]))
-
-    return component_ids
+    return {"input": sorted(list(input_ids)), "output": sorted(list(output_ids))}
 
 
 def create_inspect_ai_samples(test_data: dict) -> list[Sample]:
@@ -203,16 +211,14 @@ def create_inspect_ai_samples(test_data: dict) -> list[Sample]:
     for test_name, data in test_data.items():
         app_specific_guidance = get_app_specific_instructions(data["app_name"])
 
-        # Extract component IDs from app code to help with evaluation
         component_ids = extract_component_ids(data["app_code"])
         component_ids_str = "\n".join(
             [f"{k.title()} IDs: {', '.join(v)}" for k, v in component_ids.items() if v]
         )
 
-        # The question should be clear about what we're evaluating
         question = f"""Evaluate the quality of this Shiny test code for app {data['app_name']}.
 
-IMPORTANT: First carefully analyze the App Code below to understand what components and IDs actually exist in the app.
+IMPORTANT: First carefully analyze the App Code below to understand what components and IDs actually exist in the app. 
 Then evaluate the test code ONLY against components and IDs that actually exist in the app code.
 
 Actual Component IDs automatically detected in App:
@@ -261,7 +267,6 @@ def shiny_test_evaluation() -> Task:
     """
     Inspect AI task for evaluating generated Shiny tests.
     """
-    # Load test data from the JSON file
     script_dir = Path(__file__).parent  # Current script directory
     metadata_file = script_dir / "test_metadata.json"
     with open(metadata_file, "r") as f:
@@ -289,8 +294,8 @@ def shiny_test_evaluation() -> Task:
         - ONLY evaluate criteria for components that actually exist in the app code
         - COMPLETELY IGNORE criteria about components that don't exist in the app
         - Grade based ONLY on how well the test code tests the components that actually exist
-
-        MOST IMPORTANT:
+        
+        MOST IMPORTANT: 
         - If the app does not contain a component mentioned in the criteria, IGNORE that part of the criteria completely
         - If the app uses a different ID than what's in the criteria (e.g., "data_grid" instead of "data_table"), use the actual ID from the app