Skip to content

Commit 31978fb

Browse files
committed
Add flag for solving sample tasks and fix start id
1 parent 6df1f35 commit 31978fb

File tree

3 files changed

+30
-11
lines changed

3 files changed

+30
-11
lines changed

src/capability.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def _load_capability_json(self) -> None:
170170
self.domain = _cfg["capability_domain"]
171171
self.instructions = _cfg["capability_instructions"]
172172
# TODO: Should this data be stored in the JSON or elsewhere?
173-
self._data = _cfg["capability_data"]
173+
self._data: List[Dict[str, Any]] = _cfg["capability_data"]
174174
# Check if the capability is a seed capability, use source_dataset as indicator
175175
self.is_seed = "source_dataset" in _cfg
176176

@@ -252,13 +252,14 @@ def add_and_update_tasks(self, tasks: List[Dict[str, Any]]) -> None:
252252
"Each task must contain 'id', 'problem', and 'answer' keys."
253253
)
254254

255-
existing_task_ids = [task["id"] for task in self._data]
255+
existing_tasks = self.get_tasks()
256+
existing_task_ids = [task["id"] for task in existing_tasks]
256257
new_task_ids = [task["id"] for task in tasks]
257258
# Keep the new task when its ID overlaps an existing one
258259
# TODO: Add `overwrite` flag to update existing tasks
259260
tasks_to_keep = [
260261
task
261-
for task in self._data
262+
for task in existing_tasks
262263
if task["id"]
263264
not in list(set.intersection(set(existing_task_ids), set(new_task_ids)))
264265
] + tasks
@@ -457,6 +458,16 @@ def solve_tasks(
457458
metadata[task["id"]] = _metadata["api_metadata"]
458459
return (solved_tasks, metadata)
459460

461+
def get_tasks(self) -> List[Dict[str, Any]]:
462+
"""
463+
Get the existing tasks for the capability.
464+
465+
Returns
466+
-------
467+
List[Dict[str, Any]]: A list of dictionaries containing the tasks.
468+
"""
469+
return self._data
470+
460471
def _create_inspect_file(self) -> None:
461472
"""
462473
Implement pipeline to evaluate the capability using the inspect framework.

src/generate_tasks.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def generate_tasks_using_llm(
7373
num_tasks: int,
7474
scientist_llm_gen_cfg_task_gen: Dict[str, Any],
7575
scientist_llm_gen_cfg_task_solve: Dict[str, Any],
76+
solve_sample_tasks: bool = False,
7677
**kwargs: Any,
7778
) -> None:
7879
"""
@@ -91,6 +92,7 @@ def generate_tasks_using_llm(
9192
for task generation using the scientist LLM.
9293
scientist_llm_gen_cfg_task_solve (Dict[str, Any]): The generation configuration
9394
for solving tasks using the scientist LLM.
95+
solve_sample_tasks (bool, optional): Whether to solve sample tasks.
9496
**kwargs (Any): Additional arguments for task generation.
9597
"""
9698
# TODO: Implement the function with the following components
@@ -122,9 +124,6 @@ def generate_tasks_using_llm(
122124
# Generate task problems
123125
# Extract sample tasks from representative tasks
124126
sample_tasks = capability.get_repr_tasks()
125-
for task in sample_tasks:
126-
# Remove the answer
127-
task.pop("answer", None)
128127

129128
# Generate new tasks using the scientist LLM
130129
sys_prompt, user_prompt = get_task_generation_prompt(
@@ -144,14 +143,20 @@ def generate_tasks_using_llm(
144143
print(f"Metadata: {task_gen_metadata}")
145144
parsed_response = extract_and_parse_response(response)
146145
new_tasks = parsed_response["parsed_response"]
147-
# Combine with sample tasks to get the full set of tasks
148-
start_id = len(sample_tasks) + 1
149-
all_tasks = sample_tasks + [
146+
147+
# Solve task and generate answers
148+
# Set starting ID for new tasks
149+
start_id = len(capability.get_tasks()) + 1
150+
all_tasks = [
150151
{"id": str(start_id + idx), "problem": new_tasks[idx]}
151152
for idx in range(len(new_tasks))
152153
]
153-
154-
# Solve task and generate answers
154+
# Add sample tasks if solving them
155+
if solve_sample_tasks:
156+
for task in sample_tasks:
157+
# Remove the answer
158+
task.pop("answer", None)
159+
all_tasks = sample_tasks + all_tasks
155160
solved_tasks, task_solver_metadata = capability.solve_tasks(
156161
tasks=all_tasks,
157162
llm=scientist_llm,

src/run.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ def main(cfg: DictConfig) -> None:
115115
num_tasks=cfg.capabilities_cfg.num_gen_tasks_per_capability,
116116
scientist_llm_gen_cfg_task_gen=cfg.scientist_llm.gen_cfg.task_gen,
117117
scientist_llm_gen_cfg_task_solve=cfg.scientist_llm.gen_cfg.task_solve,
118+
solve_sample_tasks=True,
118119
few_shot=cfg.capabilities_cfg.task_gen_few_shot,
119120
)
120121
# # Evaluate subject LLM on each capability
@@ -143,6 +144,8 @@ def main(cfg: DictConfig) -> None:
143144
# num_tasks=cfg.capabilities_cfg.num_gen_tasks_per_capability,
144145
# scientist_llm_gen_cfg_task_gen=cfg.scientist_llm.gen_cfg.task_gen,
145146
# scientist_llm_gen_cfg_task_solve=cfg.scientist_llm.gen_cfg.task_solve,
147+
# solve_sample_tasks=True,
148+
# few_shot=cfg.capabilities_cfg.task_gen_few_shot,
146149
# )
147150
# # Evaluate subject LLM on new capability
148151
# new_capability.evaluate([subject_llm])

0 commit comments

Comments
 (0)