
Commit fbd4e8e

refactor tests
1 parent dca8e91 commit fbd4e8e

File tree: 3 files changed (+145, -144 lines)


src/browsergym/workarena/__init__.py

Lines changed: 12 additions & 4 deletions
@@ -111,18 +111,19 @@ def get_task_category(task_name):
     return benchmark, TASK_CATEGORY_MAP.get(task_name, None)


-def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True):
+def get_all_tasks_agents(
+    filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curriculum=True, task_bucket=None
+):
     OFFSET = 42
     all_task_tuples = []
     filter = filter.split(".")
+    rng = np.random.RandomState(meta_seed)
     if len(filter) > 2:
         raise Exception("Unsupported filter used.")
     if len(filter) == 1:
         level = filter[0]
         if level not in ["l1", "l2", "l3"]:
             raise Exception("Unsupported category of tasks.")
-        else:
-            rng = np.random.RandomState(meta_seed)
         if level == "l1":
             for task in ATOMIC_TASKS:
                 for seed in rng.randint(0, 1000, n_seed_l1):
@@ -151,9 +152,16 @@ def get_all_tasks_agents(filter="l2", meta_seed=42, n_seed_l1=10, is_agent_curri
     for category, items in ALL_COMPOSITIONAL_TASKS_CATEGORIES.items():
         if filter_category and category != filter_category:
             continue
+        # If a task_bucket is specified, check if it exists in the current category
+        if task_bucket and task_bucket not in items["buckets"]:
+            continue
         for curr_seed in rng.randint(0, 1000, items["num_seeds"]):
             random_gen = np.random.RandomState(curr_seed)
-            for task_set, count in zip(items["buckets"], items["weights"]):
+            for i, task_set in enumerate(items["buckets"]):
+                # if a task_bucket is specified, only select tasks from that bucket
+                if task_bucket and task_set != task_bucket:
+                    continue
+                count = items["weights"][i]
                 tasks = random_gen.choice(task_set, count, replace=False)
                 for task in tasks:
                     all_task_tuples.append((task, int(curr_seed)))

tests/test_compositional.py

Lines changed: 19 additions & 140 deletions
@@ -2,7 +2,6 @@
22
Tests that are not specific to any particular kind of task.
33
44
"""
5-
65
import logging
76
import os
87

@@ -14,70 +13,25 @@
1413
from playwright.sync_api import Page, TimeoutError
1514
from tenacity import retry, stop_after_attempt, retry_if_exception_type
1615

17-
from browsergym.workarena import ALL_COMPOSITIONAL_TASKS, get_all_tasks_agents
18-
from browsergym.workarena.tasks.compositional.utils.curriculum import AGENT_CURRICULUM
19-
20-
21-
AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2")
22-
23-
AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS = [sampled_set[0] for sampled_set in AGENT_L2_SAMPLED_SET], [
24-
sampled_set[1] for sampled_set in AGENT_L2_SAMPLED_SET
25-
]
26-
27-
AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3")
28-
29-
AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS = [sampled_set[0] for sampled_set in AGENT_L3_SAMPLED_SET], [
30-
sampled_set[1] for sampled_set in AGENT_L3_SAMPLED_SET
31-
]
16+
from browsergym.workarena import get_all_tasks_agents
17+
from browsergym.workarena.tasks.compositional.base import CompositionalTask
3218

19+
# Combine all tasks into a single list for parameterization
20+
AGENT_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=True)
21+
AGENT_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=True)
3322
HUMAN_L2_SAMPLED_SET = get_all_tasks_agents(filter="l2", is_agent_curriculum=False)
34-
35-
HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L2_SAMPLED_SET], [
36-
sampled_set[1] for sampled_set in HUMAN_L2_SAMPLED_SET
37-
]
38-
3923
HUMAN_L3_SAMPLED_SET = get_all_tasks_agents(filter="l3", is_agent_curriculum=False)
4024

41-
HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS = [sampled_set[0] for sampled_set in HUMAN_L3_SAMPLED_SET], [
42-
sampled_set[1] for sampled_set in HUMAN_L3_SAMPLED_SET
43-
]
25+
all_tasks_to_test = (
26+
AGENT_L2_SAMPLED_SET + AGENT_L3_SAMPLED_SET + HUMAN_L2_SAMPLED_SET + HUMAN_L3_SAMPLED_SET
27+
)
4428

4529
test_category = os.environ.get("TEST_CATEGORY")
46-
4730
if test_category:
48-
tasks_to_test = []
49-
items = AGENT_CURRICULUM.get(test_category)
50-
if items:
51-
for bucket in items["buckets"]:
52-
tasks_to_test.extend(bucket)
31+
# If a category is specified, filter the tasks to test
32+
tasks_to_test = get_all_tasks_agents(filter=f"l3.{test_category}", is_agent_curriculum=True)
5333
else:
54-
tasks_to_test = ALL_COMPOSITIONAL_TASKS
55-
56-
57-
@retry(
58-
stop=stop_after_attempt(5),
59-
reraise=True,
60-
before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
61-
)
62-
@pytest.mark.parametrize("task_entrypoint", tasks_to_test)
63-
@pytest.mark.parametrize("random_seed", range(1))
64-
@pytest.mark.parametrize("level", range(2, 4))
65-
@pytest.mark.pricy
66-
def test_cheat_compositional(task_entrypoint, random_seed, level, page: Page):
67-
task = task_entrypoint(seed=random_seed, level=level)
68-
goal, info = task.setup(page=page)
69-
chat_messages = []
70-
for i in range(len(task)):
71-
page.wait_for_timeout(1000)
72-
task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
73-
page.wait_for_timeout(1000)
74-
reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
75-
if i < len(task) - 1:
76-
assert done is False and reward == 0.0
77-
78-
task.teardown()
79-
80-
assert done is True and reward == 1.0
34+
tasks_to_test = all_tasks_to_test
8135

8236

8337
@retry(
@@ -86,89 +40,14 @@ def test_cheat_compositional(task_entrypoint, random_seed, level, page: Page):
8640
reraise=True,
8741
before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
8842
)
89-
@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L2_SAMPLED_TASKS, AGENT_L2_SEEDS))
90-
@pytest.mark.slow
91-
@pytest.mark.skip(reason="Tests are too slow")
92-
def test_cheat_compositional_sampled_agent_set_l2(task_entrypoint, seed, page: Page):
93-
task = task_entrypoint(seed=seed)
94-
goal, info = task.setup(page=page)
95-
chat_messages = []
96-
for i in range(len(task)):
97-
page.wait_for_timeout(1000)
98-
task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
99-
page.wait_for_timeout(1000)
100-
reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
101-
if i < len(task) - 1:
102-
assert done is False and reward == 0.0
103-
104-
task.teardown()
105-
106-
assert done is True and reward == 1.0
107-
108-
109-
@retry(
110-
stop=stop_after_attempt(5),
111-
retry=retry_if_exception_type(TimeoutError),
112-
reraise=True,
113-
before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
114-
)
115-
@pytest.mark.parametrize("task_entrypoint, seed", zip(AGENT_L3_SAMPLED_TASKS, AGENT_L3_SEEDS))
116-
@pytest.mark.slow
117-
@pytest.mark.skip(reason="Tests are too slow")
118-
def test_cheat_compositional_sampled_agent_set_l3(task_entrypoint, seed, page: Page):
119-
task = task_entrypoint(seed=seed)
120-
goal, info = task.setup(page=page)
121-
chat_messages = []
122-
for i in range(len(task)):
123-
page.wait_for_timeout(1000)
124-
task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
125-
page.wait_for_timeout(1000)
126-
reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
127-
if i < len(task) - 1:
128-
assert done is False and reward == 0.0
129-
130-
task.teardown()
131-
132-
assert done is True and reward == 1.0
133-
134-
135-
@retry(
136-
stop=stop_after_attempt(5),
137-
retry=retry_if_exception_type(TimeoutError),
138-
reraise=True,
139-
before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
140-
)
141-
@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L2_SAMPLED_TASKS, HUMAN_L2_SEEDS))
142-
@pytest.mark.slow
143-
@pytest.mark.skip(reason="Tests are too slow")
144-
def test_cheat_compositional_sampled_human_set_l2(task_entrypoint, seed, page: Page):
145-
task = task_entrypoint(seed=seed)
146-
goal, info = task.setup(page=page)
147-
chat_messages = []
148-
for i in range(len(task)):
149-
page.wait_for_timeout(1000)
150-
task.cheat(page=page, chat_messages=chat_messages, subtask_idx=i)
151-
page.wait_for_timeout(1000)
152-
reward, done, message, info = task.validate(page=page, chat_messages=chat_messages)
153-
if i < len(task) - 1:
154-
assert done is False and reward == 0.0
155-
156-
task.teardown()
157-
158-
assert done is True and reward == 1.0
159-
160-
161-
@retry(
162-
stop=stop_after_attempt(5),
163-
retry=retry_if_exception_type(TimeoutError),
164-
reraise=True,
165-
before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
166-
)
167-
@pytest.mark.parametrize("task_entrypoint, seed", zip(HUMAN_L3_SAMPLED_TASKS, HUMAN_L3_SEEDS))
168-
@pytest.mark.slow
169-
@pytest.mark.skip(reason="Tests are too slow")
170-
def test_cheat_compositional_sampled_human_set_l3(task_entrypoint, seed, page: Page):
171-
task = task_entrypoint(seed=seed)
43+
@pytest.mark.parametrize("task_class, seed", tasks_to_test)
44+
@pytest.mark.pricy
45+
def test_cheat_compositional(task_class, seed, page: Page):
46+
"""
47+
Test that the cheat method works for all compositional tasks.
48+
This test is parameterized to run for all tasks in the agent and human curricula.
49+
"""
50+
task = task_class(seed=seed)
17251
goal, info = task.setup(page=page)
17352
chat_messages = []
17453
for i in range(len(task)):
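
For reference, a short sketch of how the refactored parameter list can be inspected outside pytest. The TEST_CATEGORY environment variable and the filter strings mirror the test module above; the count printed at the end is illustrative only.

import os
from browsergym.workarena import get_all_tasks_agents

# Mirror the test module's selection logic to see what would be parameterized.
test_category = os.environ.get("TEST_CATEGORY")
if test_category:
    tasks_to_test = get_all_tasks_agents(filter=f"l3.{test_category}", is_agent_curriculum=True)
else:
    tasks_to_test = (
        get_all_tasks_agents(filter="l2", is_agent_curriculum=True)
        + get_all_tasks_agents(filter="l3", is_agent_curriculum=True)
        + get_all_tasks_agents(filter="l2", is_agent_curriculum=False)
        + get_all_tasks_agents(filter="l3", is_agent_curriculum=False)
    )

# Each entry is a (task_class, seed) pair; pytest generates one test case per pair.
print(f"{len(tasks_to_test)} (task_class, seed) pairs collected")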

tests/test_workarena_utils.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+"""
+Tests for workarena utility functions.
+"""
+import pytest
+from browsergym.workarena import get_all_tasks_agents
+from browsergym.workarena.tasks.compositional import (
+    AGENT_CURRICULUM_L2,
+    AGENT_CURRICULUM_L3,
+    HUMAN_CURRICULUM_L2,
+    HUMAN_CURRICULUM_L3,
+    specialize_task_class_to_level,
+)
+from browsergym.workarena.tasks.compositional.base import CompositionalTask
+from browsergym.workarena.tasks.compositional.mark_duplicate_problems import (
+    BasicFilterProblemsAndMarkDuplicatesSmallTask,
+    PriorityFilterProblemsAndMarkDuplicatesSmallTask,
+)
+from browsergym.workarena.tasks.compositional.navigate_and_do_infeasible import (
+    InfeasibleNavigateAndCreateUserWithReasonTask,
+)
+
+
+def get_tasks_from_curriculum(curriculum):
+    """Helper function to extract all unique tasks from a curriculum."""
+    all_tasks = set()
+    for category, items in curriculum.items():
+        for bucket in items["buckets"]:
+            for task in bucket:
+                all_tasks.add(task)
+    return all_tasks
+
+
+def test_get_all_tasks_agents():
+    """Test that get_all_tasks_agents returns the correct tasks from the curricula."""
+    # Test L1 filter (atomic tasks)
+    tasks_with_seeds_l1 = get_all_tasks_agents(filter="l1")
+    assert len(tasks_with_seeds_l1) > 0
+    for task, seed in tasks_with_seeds_l1:
+        assert not issubclass(task, CompositionalTask)
+        assert isinstance(seed, int)
+
+    # Test L2 Human Curriculum
+    tasks_with_seeds_l2_human = get_all_tasks_agents(filter="l2", is_agent_curriculum=False)
+    expected_l2_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L2)
+    assert len(tasks_with_seeds_l2_human) > 0
+    for task, seed in tasks_with_seeds_l2_human:
+        assert task in expected_l2_human_tasks
+
+    # Test L3 Human Curriculum
+    tasks_with_seeds_l3_human = get_all_tasks_agents(filter="l3", is_agent_curriculum=False)
+    expected_l3_human_tasks = get_tasks_from_curriculum(HUMAN_CURRICULUM_L3)
+    assert len(tasks_with_seeds_l3_human) > 0
+    for task, seed in tasks_with_seeds_l3_human:
+        assert task in expected_l3_human_tasks
+
+    # Test category filtering
+    category = "planning_and_problem_solving"
+    tasks_with_seeds_cat = get_all_tasks_agents(
+        filter=f"l3.{category}", is_agent_curriculum=True
+    )
+    assert len(tasks_with_seeds_cat) > 0
+    # Expected tasks from the specified category's buckets
+    expected_cat_tasks = set()
+    for bucket in AGENT_CURRICULUM_L3[category]["buckets"]:
+        expected_cat_tasks.update(bucket)
+
+    returned_tasks = {task for task, seed in tasks_with_seeds_cat}
+    assert returned_tasks.issubset(expected_cat_tasks)
+
+    # Check that tasks from other categories are not present
+    for other_category, items in AGENT_CURRICULUM_L3.items():
+        if other_category != category:
+            for bucket in items["buckets"]:
+                for task in bucket:
+                    assert task not in returned_tasks
+
+    # Test task_bucket filtering
+    category = "planning_and_problem_solving"
+    # This bucket contains BasicFilterProblemsAndMarkDuplicatesSmallTask
+    bucket_to_test = AGENT_CURRICULUM_L3[category]["buckets"][0]
+
+    tasks_with_seeds_bucket = get_all_tasks_agents(
+        filter=f"l3.{category}", is_agent_curriculum=True, task_bucket=bucket_to_test
+    )
+    assert len(tasks_with_seeds_bucket) > 0
+
+    returned_tasks_from_bucket = {task for task, seed in tasks_with_seeds_bucket}
+
+    # 1. All returned tasks are from the specified bucket
+    assert returned_tasks_from_bucket.issubset(set(bucket_to_test))
+
+    # 2. A specific task from the bucket is present
+    expected_task_base = BasicFilterProblemsAndMarkDuplicatesSmallTask
+    # Find the specialized task in the bucket that corresponds to the base task
+    expected_task_specialized = next(
+        task
+        for task in bucket_to_test
+        if expected_task_base in task.__mro__
+    )
+    assert expected_task_specialized in returned_tasks_from_bucket
+
+    # A task from a different category is not present
+    unexpected_task = specialize_task_class_to_level(
+        InfeasibleNavigateAndCreateUserWithReasonTask, level=3
+    )
+    assert unexpected_task not in returned_tasks_from_bucket
+
+    # Test invalid filter
+    with pytest.raises(Exception):
+        get_all_tasks_agents(filter="invalid")
+
+    # Test invalid category filter
+    with pytest.raises(Exception):
+        get_all_tasks_agents(filter="l3.invalid_category")
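
For context, the curriculum dictionaries walked by get_tasks_from_curriculum and by get_all_tasks_agents appear to map each category to task buckets plus sampling parameters. The shape below is inferred from how the diffs index items["buckets"], items["weights"], and items["num_seeds"]; the task names and numeric values are placeholders, not real WorkArena classes or settings.

# Illustrative only: placeholder strings stand in for WorkArena task classes.
EXAMPLE_CURRICULUM = {
    "planning_and_problem_solving": {
        "buckets": [
            ["TaskClassA", "TaskClassB"],  # one bucket of related task classes
            ["TaskClassC"],
        ],
        "weights": [2, 1],  # tasks drawn per bucket for each sampled seed
        "num_seeds": 5,     # seeds drawn for this category
    },
}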
