Add unit tests for add_and_update_tasks function

xeon27 · xeon27 · commit 20e234924978 · 2025-03-31T22:12:05.000-04:00
diff --git a/tests/src/capabilities_t2/math/math_mathematics_modeling_real_world/capability.json b/tests/src/capabilities_t2/math/math_mathematics_modeling_real_world/capability.json
@@ -0,0 +1,23 @@
+{
+    "capability_name": "math_mathematics_modeling_real_world",
+    "capability_description": "The math_mathematics_modeling_real_world capability consists of 1500 real-world mathematical modeling problems. Each problem requires the application of mathematical concepts to formulate and solve real-life scenarios, emphasizing critical thinking and problem-solving skills.",
+    "capability_domain": "math",
+    "capability_instructions": "f\"\"\"Solve the following real-world modeling problem step by step. The last line of your response should be of the form \"ANSWER: $ANSWER\" (without quotes) where $ANSWER is the answer to the problem.\n\nProblem: {t[\"problem\"]}\n\nRemember to put your answer on its own line at the end in the form \"ANSWER:$ANSWER\" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.\"\"\"",
+    "capability_data": [
+        {
+            "id": "1",
+            "problem": "A company wants to determine the optimal number of units to produce in order to maximize profit. The profit function is given by P(x) = -2x^2 + 40x - 150, where x is the number of units produced. How many units should the company produce to maximize profit?",
+            "answer": "10"
+        },
+        {
+            "id": "2",
+            "problem": "A farmer has 240 meters of fencing to create a rectangular pen. What dimensions will maximize the area of the pen?",
+            "answer": "60 by 60"
+        },
+        {
+            "id": "3",
+            "problem": "A car rental company charges a flat fee of $50 plus $20 per day. If a customer has a budget of $200, how many days can they rent a car?",
+            "answer": "7"
+        }
+    ]
+}
diff --git a/tests/src/capabilities_t2/math/math_mathematics_modeling_real_world/capability.py b/tests/src/capabilities_t2/math/math_mathematics_modeling_real_world/capability.py
@@ -0,0 +1,25 @@
+class Capability:  # Test-fixture capability: sample tasks, a prompt builder and a scorer, all static methods.
+    @staticmethod
+    def repr_tasks() -> dict[str, dict]:  # Tasks keyed by id string; each value holds "problem" and "answer".
+        return {  # Same three problem/answer pairs as "capability_data" in the sibling capability.json.
+    "1": {
+        "problem": "A company wants to determine the optimal number of units to produce in order to maximize profit. The profit function is given by P(x) = -2x^2 + 40x - 150, where x is the number of units produced. How many units should the company produce to maximize profit?",
+        "answer": "10"
+    },
+    "2": {
+        "problem": "A farmer has 240 meters of fencing to create a rectangular pen. What dimensions will maximize the area of the pen?",
+        "answer": "60 by 60"
+    },
+    "3": {
+        "problem": "A car rental company charges a flat fee of $50 plus $20 per day. If a customer has a budget of $200, how many days can they rent a car?",
+        "answer": "7"
+    }
+}
+
+    @staticmethod
+    def get_instructions(t: dict) -> str:  # Builds the prompt for one task; only t["problem"] is read.
+        return f"""Solve the following real-world modeling problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.\n\nProblem: {t["problem"]}\n\nRemember to put your answer on its own line at the end in the form "ANSWER:$ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command."""
+
+    @staticmethod
+    def score(t: dict, submission: str) -> float | None:  # Exact string equality with t["answer"]; never returns None.
+        return 1.0 if submission==t["answer"] else 0.0
diff --git a/tests/src/test_capability_class.py b/tests/src/test_capability_class.py
@@ -190,7 +190,7 @@ def test_create_capability_from_dict():
         "class": '```python\nclass Capability:\n    @staticmethod\n    def repr_tasks() -> dict[str, dict]:\n        return {\n    "1": {\n        "problem": "Prove that the number of ways to choose 2 elements from a set of n elements is equal to the number of ways to choose n-2 elements from the same set.",\n        "answer": "\\\\binom{n}{2} = \\\\binom{n}{n-2}"\n    },\n    "2": {\n        "problem": "Show that for any positive integer n, the sum of the first n odd numbers equals n^2.",\n        "answer": "1 + 3 + 5 + ... + (2n-1) = n^2"\n    },\n    "3": {\n        "problem": "Demonstrate that \\sum_{k=0}^{n} \\binom{n}{k} = 2^n using a combinatorial argument.",\n        "answer": "\\\\sum_{k=0}^{n} \\binom{n}{k} = 2^n"\n    }\n}\n\n    @staticmethod\n    def get_instructions(t: dict) -> str:\n        return f"""Provide a combinatorial proof for the following problem. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is your proof or explanation.\\n\\nProblem: {t["problem"]}\\n\\nRemember to put your proof or explanation on its own line at the end in the form "ANSWER:$ANSWER" (without quotes) where $ANSWER is your proof or explanation."""\n\n    @staticmethod\n    def score(t: dict, submission: str) -> float | None:\n        return 1.0 if submission.lower().strip() == t["answer"].lower().strip() else 0.0\n```',
     }
     gen_capability_tests_dir = os.path.join(
-        test_dir, "capabilities", gen_capability_dict["domain"]
+        test_dir, "capabilities_t1", gen_capability_dict["domain"]
     )
     os.makedirs(gen_capability_tests_dir, exist_ok=True)
 
@@ -200,7 +200,7 @@ def test_create_capability_from_dict():
     assert capability.name == gen_capability_dict["name"]
     assert capability.description == gen_capability_dict["description"]
     assert capability.domain == gen_capability_dict["domain"]
-    shutil.rmtree(os.path.join(test_dir, "capabilities"))
+    shutil.rmtree(os.path.join(test_dir, "capabilities_t1"))
 
 
 def test_extract_and_parse_response():
@@ -235,3 +235,70 @@ def test_extract_and_parse_response():
     assert len(extracted_response_dict["parsed_response"]) == len(
         dummy_response_dict["response_json"]
     )
+
+
+def test_add_and_update_tasks_new_task():
+    """
+    Test that add_and_update_tasks appends a task with a previously unused id.
+
+    The fixture capability is copied so the update is applied to the copy,
+    one task with a new id is added to it, and the test checks that:
+    - The copy holds exactly one more task than the original fixture.
+    - The capability's representation string remains unchanged.
+    The copied directory tree is removed even when an assertion fails.
+    """
+    capability_path = "capabilities_t2/math/math_mathematics_modeling_real_world"
+    copy_path = os.path.join(test_dir, f"copy_{capability_path}")
+    # Root of the copied tree, i.e. <test_dir>/copy_capabilities_t2
+    copy_root = os.path.join(test_dir, f"copy_{capability_path.split('/')[0]}")
+    try:
+        # Work on a copy; copying inside the try keeps cleanup guaranteed
+        shutil.copytree(
+            os.path.join(test_dir, capability_path), copy_path, dirs_exist_ok=True
+        )
+        # Read the capability configuration from the copy
+        capability = Capability(copy_path)
+        # A task whose id ("4") is not among the fixture ids "1"-"3"
+        new_tasks = [{"id": "4", "problem": "Problem 4", "answer": "Answer 4"}]
+        capability.add_and_update_tasks(tasks=new_tasks)
+        # Check if the number of tasks is updated correctly and
+        # the update doesn't affect the representation string
+        original_capability = Capability(os.path.join(test_dir, capability_path))
+        assert (len(original_capability._data) + len(new_tasks)) == len(capability._data)
+        assert capability.capability_repr_class_str == original_capability.capability_repr_class_str
+    finally:
+        # Clean up; ignore_errors avoids masking an earlier failure (e.g. copytree)
+        shutil.rmtree(copy_root, ignore_errors=True)
+
+
+def test_add_and_update_tasks_repr_tasks():
+    """Test that add_and_update_tasks overwrites tasks whose ids already exist."""
+    capability_path = "capabilities_t2/math/math_mathematics_modeling_real_world"
+    copy_path = os.path.join(test_dir, f"copy_{capability_path}")
+    # Root of the copied tree, i.e. <test_dir>/copy_capabilities_t2
+    copy_root = os.path.join(test_dir, f"copy_{capability_path.split('/')[0]}")
+    try:
+        # Work on a copy; copying inside the try keeps cleanup guaranteed
+        shutil.copytree(
+            os.path.join(test_dir, capability_path), copy_path, dirs_exist_ok=True
+        )
+        # Read the capability configuration from the copy
+        capability = Capability(copy_path)
+        # Replacement tasks reusing the fixture ids "1"-"3", already in id order
+        repr_tasks = [
+            {"id": "1", "problem": "Problem 1", "answer": "Answer 1"},
+            {"id": "2", "problem": "Problem 2", "answer": "Answer 2"},
+            {"id": "3", "problem": "Problem 3", "answer": "Answer 3"},
+        ]
+        capability.add_and_update_tasks(tasks=repr_tasks)
+        # Check if the existing tasks are updated correctly
+        original_capability = Capability(os.path.join(test_dir, capability_path))
+        assert len(original_capability._data) == len(capability._data)
+        for i, expected in enumerate(repr_tasks):
+            assert capability._data[i]["problem"] == expected["problem"]
+            assert capability._data[i]["answer"] == expected["answer"]
+        # Rewriting the representative tasks must change the representation string
+        assert capability.capability_repr_class_str != original_capability.capability_repr_class_str
+    finally:
+        # Clean up; ignore_errors avoids masking an earlier failure (e.g. copytree)
+        shutil.rmtree(copy_root, ignore_errors=True)