From 1c91933e39038b6928e5ff92d4a18a36a9a40e33 Mon Sep 17 00:00:00 2001 From: Farnaz Kohankhaki Date: Fri, 12 Dec 2025 01:41:12 -0800 Subject: [PATCH 1/8] Remove run_capability_generation.py --- .../capability_score_visualization.yaml | 8 +- experimental/diverse_task_config.yaml | 37 - experimental/diverse_task_generator.py | 347 -------- experimental/extract_subtopics.py | 41 - experimental/generate_blueprints.py | 65 -- experimental/generate_tasks.py | 85 -- experimental/model_utils.py | 61 -- .../diverse_task_constants.py | 0 .../diverse_task_dataclasses.py | 0 .../diverse_task_prompts.py | 34 +- src/base_task_generation/extract_subtopics.py | 61 ++ .../find_combinations.py | 48 +- .../generate_blueprints.py | 89 ++ src/base_task_generation/generate_tasks.py | 117 +++ .../base_task_generation}/verify_tasks.py | 0 src/cfg/run_cfg.yaml | 41 +- src/generate_capabilities.py | 493 +++-------- src/generate_diverse_tasks.py | 68 ++ src/model.py | 6 +- src/run_capability_generation.py | 150 ---- src/run_generation_pipeline.py | 829 ++++++++++++++++++ src/utils/capability_management_utils.py | 44 +- src/utils/capability_utils.py | 21 +- src/utils/constants.py | 3 + src/utils/data_utils.py | 20 +- src/utils/embedding_utils.py | 48 +- src/utils/model_client_utils.py | 216 +---- src/utils/prompts.py | 95 +- 28 files changed, 1570 insertions(+), 1457 deletions(-) delete mode 100644 experimental/diverse_task_config.yaml delete mode 100644 experimental/diverse_task_generator.py delete mode 100644 experimental/extract_subtopics.py delete mode 100644 experimental/generate_blueprints.py delete mode 100644 experimental/generate_tasks.py delete mode 100644 experimental/model_utils.py rename {experimental => src/base_task_generation}/diverse_task_constants.py (100%) rename {experimental => src/base_task_generation}/diverse_task_dataclasses.py (100%) rename {experimental => src/base_task_generation}/diverse_task_prompts.py (94%) create mode 100644 src/base_task_generation/extract_subtopics.py rename {experimental => src/base_task_generation}/find_combinations.py (62%) create mode 100644 src/base_task_generation/generate_blueprints.py create mode 100644 src/base_task_generation/generate_tasks.py rename {experimental => src/base_task_generation}/verify_tasks.py (100%) create mode 100644 src/generate_diverse_tasks.py delete mode 100644 src/run_capability_generation.py create mode 100644 src/run_generation_pipeline.py diff --git a/example_scripts/example_cfg/capability_score_visualization.yaml b/example_scripts/example_cfg/capability_score_visualization.yaml index 0f1be7d..71d989f 100644 --- a/example_scripts/example_cfg/capability_score_visualization.yaml +++ b/example_scripts/example_cfg/capability_score_visualization.yaml @@ -13,11 +13,11 @@ capabilities_cfg: # Set to -1 to use all seed capabilities num_seed_capabilities: 1 # Number of initial capabilities to generate using the scientist LLM - num_gen_capabilities: 100 + num_capabilities: 100 # Buffer for capability generation - num_gen_capabilities_buffer: 0.2 - # Number of capability areas to generate - num_capability_areas: 10 + num_capabilities_buffer: 0.2 + # Number of areas to generate + num_areas: 10 # Number of initial capabilities to generate per run num_gen_capabilities_per_run: 5 # Number of tasks to generate for each capability diff --git a/experimental/diverse_task_config.yaml b/experimental/diverse_task_config.yaml deleted file mode 100644 index c261782..0000000 --- a/experimental/diverse_task_config.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# 
Configuration for Diverse Task Generator - -# Model settings -model: - name: gpt-4o # OpenAI model to use - temperature: 1.0 # Temperature for all steps - max_tokens: 8192 # Max tokens for all steps - max_retries: 3 # Number of retry attempts for API calls - retry_delay: 2.0 # Initial delay between retries in seconds (exponential backoff) - -# Task generation settings -generation: - tasks_per_blueprint: 3 # Number of tasks to generate per blueprint - min_subtopics: 3 # Suggested minimum number of sub-topics - max_subtopics: 8 # Suggested maximum number of sub-topics - -# Output settings -output: - base_dir: diverse_task_outputs - save_intermediate_steps: true # Save each step's output - pretty_print_json: true # Indent JSON files - -# Input settings -input: - capability_json_path: capability.json # Default capability JSON file path - -# Verification criteria -verification: - pass_threshold: 0.8 # Minimum pass rate to consider successful - strict_mode: false # If true, all alignment criteria must pass - -# Example capability for quick testing -example_capability: - name: "compound_interest_calculations" - description: "The ability to calculate compound interest for various scenarios, including different compounding frequencies (annually, semi-annually, quarterly, monthly), different time periods, and understanding how changes in principal, rate, or time affect the final amount." - domain: "personal_finance" - area: "investing_and_savings" diff --git a/experimental/diverse_task_generator.py b/experimental/diverse_task_generator.py deleted file mode 100644 index 06595d3..0000000 --- a/experimental/diverse_task_generator.py +++ /dev/null @@ -1,347 +0,0 @@ -"""Standalone script for generating diverse tasks for a single capability.""" - -import argparse -import json -import logging -import os -from dataclasses import asdict -from datetime import datetime -from functools import partial -from pathlib import Path -from typing import Any - -import yaml -from diverse_task_dataclasses import ( - Blueprint, - Capability, - Combination, - SubTopic, - Task, - VerificationResult, -) -from extract_subtopics import extract_subtopics -from find_combinations import find_valid_combinations -from generate_blueprints import generate_blueprints -from model_utils import call_model -from openai import OpenAI -from verify_tasks import verify_tasks - -from generate_tasks import generate_tasks - - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", -) -logger = logging.getLogger(__name__) - - -class DiverseTaskGenerator: - """Generate diverse tasks for a capability using multi-dimensional approach.""" - - def __init__( - self, - capability_dict: dict, - config: dict, - resume: bool = False, - resume_dir: str = None, - ) -> None: - """Initialize the diverse task generator.""" - # Extract example tasks from capability_data if present - example_tasks = ( - capability_dict.get("capability_data", [])[:3] - if "capability_data" in capability_dict - else [] - ) - - self.capability = Capability( - name=capability_dict["capability_name"], - description=capability_dict["capability_description"], - domain=capability_dict["capability_domain"], - area=capability_dict.get("capability_area"), - example_tasks=example_tasks, - ) - - # Store configuration - self.config = config - self.resume = resume - - # Use config values - self.model_name = self.config["model"]["name"] - self.temperature = self.config["model"]["temperature"] - self.max_tokens = 
self.config["model"]["max_tokens"] - self.max_retries = self.config["model"].get("max_retries", 3) - self.retry_delay = self.config["model"].get("retry_delay", 2.0) - self.output_dir = Path(self.config["output"]["base_dir"]) - - # Initialize OpenAI client - api_key = os.getenv("OPENAI_API_KEY") - if not api_key: - raise ValueError("OPENAI_API_KEY environment variable not set") - self.client = OpenAI(api_key=api_key) - - # Create or resume output directory - if resume and resume_dir: - self.run_output_dir = Path(resume_dir) - if not self.run_output_dir.exists(): - raise ValueError(f"Resume directory does not exist: {resume_dir}") - logger.info(f"Resuming from: {self.run_output_dir}") - else: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self.run_output_dir = ( - self.output_dir / f"{self.capability.name}_{timestamp}" - ) - self.run_output_dir.mkdir(parents=True, exist_ok=True) - - logger.info("=" * 80) - logger.info(f"Initialized DiverseTaskGenerator for: {self.capability.name}") - logger.info(f"Mode: {'RESUME' if resume else 'NEW RUN'}") - logger.info(f"Model: {self.model_name}") - logger.info(f"Temperature: {self.temperature}") - logger.info(f"Max tokens: {self.max_tokens}") - logger.info(f"Output directory: {self.run_output_dir}") - logger.info("=" * 80) - - # Create API caller with pre-configured parameters - self._call_api = partial( - call_model, - self.client, - model_name=self.model_name, - temperature=self.temperature, - max_tokens=self.max_tokens, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - ) - - def _save_json(self, filename: str, data_key: str, data: Any) -> Path: - """Save data to JSON file.""" - output_file = self.run_output_dir / filename - # Convert dataclass objects to dicts if needed - if data and hasattr( - data[0] if isinstance(data, list) else data, "__dataclass_fields__" - ): - data = ( - [asdict(item) for item in data] - if isinstance(data, list) - else asdict(data) - ) - - with open(output_file, "w") as f: - json.dump({data_key: data} if data_key else data, f, indent=2) - logger.info(f"Saved to: {output_file}") - return output_file - - def _load_json(self, filename: str, data_key: str = None): - """Load data from JSON file if it exists.""" - file_path = self.run_output_dir / filename - if not file_path.exists(): - return None - - with open(file_path, "r") as f: - data = json.load(f) - - if data_key: - return data.get(data_key) - return data - - def _load_subtopics(self): - """Load subtopics from checkpoint.""" - data = self._load_json("subtopics.json", "sub_topics") - if data: - return [SubTopic(**item) for item in data] - return None - - def _load_combinations(self): - """Load combinations from checkpoint.""" - data = self._load_json("combinations.json", "valid_combinations") - if data: - return [Combination(**item) for item in data] - return None - - def _load_blueprints(self): - """Load blueprints from checkpoint.""" - data = self._load_json("blueprints.json", "blueprints") - if data: - return [Blueprint(**item) for item in data] - return None - - def _load_tasks(self): - """Load tasks from checkpoint.""" - data = self._load_json("tasks.json", "tasks") - if data: - return [Task(**item) for item in data] - return None - - def extract_and_save_subtopics(self) -> list[SubTopic]: - """Extract sub-topics and save results.""" - subtopics = extract_subtopics(self.capability, self._call_api) - self._save_json("subtopics.json", "sub_topics", subtopics) - return subtopics - - def find_and_save_combinations( - self, subtopics: list[SubTopic] 
- ) -> list[Combination]: - """Find valid combinations and save results.""" - combinations = find_valid_combinations( - self.capability, subtopics, self._call_api, self.config - ) - self._save_json("combinations.json", "valid_combinations", combinations) - return combinations - - def generate_and_save_blueprints( - self, combinations: list[Combination] - ) -> list[Blueprint]: - """Generate blueprints and save results.""" - blueprints = generate_blueprints( - self.capability, combinations, self._call_api, self.config - ) - self._save_json("blueprints.json", "blueprints", blueprints) - return blueprints - - def generate_and_save_tasks(self, blueprints: list[Blueprint]) -> list[Task]: - """Generate tasks and save results.""" - tasks_per_blueprint = self.config["generation"]["tasks_per_blueprint"] - tasks = generate_tasks( - self.capability, blueprints, self._call_api, tasks_per_blueprint - ) - self._save_json("tasks.json", "tasks", tasks) - return tasks - - def verify_and_save_tasks( - self, tasks: list[Task], blueprints: list[Blueprint] - ) -> VerificationResult: - """Verify tasks and save results.""" - verification = verify_tasks(self.capability, tasks, blueprints, self._call_api) - self._save_json("verification.json", None, verification) - return verification - - def run_full_pipeline(self) -> dict: - """Run the complete diverse task generation pipeline.""" - logger.info("=" * 80) - logger.info("Starting Diverse Task Generation Pipeline") - logger.info(f"Capability: {self.capability.name}") - logger.info(f"Model: {self.model_name}") - logger.info("=" * 80) - - # Extract sub-topics - subtopics = self._load_subtopics() if self.resume else None - if subtopics: - logger.info(f"Loaded {len(subtopics)} subtopics from checkpoint") - else: - subtopics = self.extract_and_save_subtopics() - - # Find valid combinations - combinations = self._load_combinations() if self.resume else None - if combinations: - logger.info(f"Loaded {len(combinations)} combinations from checkpoint") - else: - combinations = self.find_and_save_combinations(subtopics) - - # Generate blueprints - blueprints = self._load_blueprints() if self.resume else None - if blueprints: - logger.info(f"Loaded {len(blueprints)} blueprints from checkpoint") - else: - blueprints = self.generate_and_save_blueprints(combinations) - - # Generate tasks - tasks = self._load_tasks() if self.resume else None - if tasks: - logger.info(f"Loaded {len(tasks)} tasks from checkpoint") - else: - tasks = self.generate_and_save_tasks(blueprints) - - # Verify tasks - verification = self.verify_and_save_tasks(tasks, blueprints) - - # Compile final results - results = { - "capability_name": self.capability.name, - "capability_description": self.capability.description, - "capability_domain": self.capability.domain, - "model_name": self.model_name, - "timestamp": datetime.now().isoformat(), - "subtopics": [asdict(st) for st in subtopics], - "combinations": [asdict(c) for c in combinations], - "blueprints": [asdict(bp) for bp in blueprints], - "tasks": [asdict(t) for t in tasks], - "verification": verification, - } - - # Save final results - self._save_json("final_results.json", None, results) - - logger.info("=" * 80) - logger.info("Pipeline Complete!") - logger.info(f"All results saved to: {self.run_output_dir}") - logger.info("=" * 80) - - return results - - -def load_capability_from_json(capability_json_path: str) -> dict: - """Load capability information from a JSON file.""" - with open(capability_json_path, "r") as f: - return json.load(f) - - -def main() 
-> None: - """Generate diverse tasks for a single capability.""" - parser = argparse.ArgumentParser( - description="Generate diverse tasks for a capability from JSON file" - ) - parser.add_argument( - "--capability-json-path", - type=str, - help="Path to capability JSON file (default: from config file)", - ) - parser.add_argument( - "--model-name", - type=str, - help="OpenAI model name (default: from config file)", - ) - parser.add_argument( - "--output-dir", - type=str, - help="Output directory (default: from config file)", - ) - parser.add_argument( - "--resume-dir", - type=str, - help="Resume from an existing run directory", - ) - - args = parser.parse_args() - - # Load config - config_file = Path(__file__).parent / "diverse_task_config.yaml" - with open(config_file, "r") as f: - config = yaml.safe_load(f) - - # Override config with command-line arguments - if args.model_name: - config["model"]["name"] = args.model_name - if args.output_dir: - config["output"]["base_dir"] = args.output_dir - if args.capability_json_path: - config["input"]["capability_json_path"] = args.capability_json_path - - logger.info(f"Loading capability from: {config['input']['capability_json_path']}") - capability_dict = load_capability_from_json(config["input"]["capability_json_path"]) - - # Initialize and run generator - # If resume_dir is provided, automatically enable resume mode - generator = DiverseTaskGenerator( - capability_dict=capability_dict, - resume=bool(args.resume_dir), - resume_dir=args.resume_dir, - config=config, - ) - generator.run_full_pipeline() - - logger.info("Done!") - - -if __name__ == "__main__": - main() diff --git a/experimental/extract_subtopics.py b/experimental/extract_subtopics.py deleted file mode 100644 index 5cbbe43..0000000 --- a/experimental/extract_subtopics.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Extract sub-topics for a capability.""" - -import json -import logging -from typing import Callable - -from diverse_task_dataclasses import Capability, SubTopic -from diverse_task_prompts import format_subtopic_prompt - - -logger = logging.getLogger(__name__) - - -def extract_subtopics(capability: Capability, call_llm: Callable) -> list[SubTopic]: - """Extract sub-topics for the given capability.""" - logger.info("Extracting sub-topics...") - - system_prompt, user_prompt = format_subtopic_prompt( - capability_name=capability.name, - capability_description=capability.description, - capability_domain=capability.domain, - capability_area=capability.area, - ) - - response = call_llm( - system_prompt=system_prompt, - user_prompt=user_prompt, - response_format={"type": "json_object"}, - ) - - result = json.loads(response) - subtopic_names = result.get("sub_topics", []) - - # Create SubTopic objects - subtopics = [SubTopic(name=name) for name in subtopic_names] - - logger.info(f"Extracted {len(subtopics)} sub-topics:") - for st in subtopics: - logger.info(f" - {st.name}") - - return subtopics diff --git a/experimental/generate_blueprints.py b/experimental/generate_blueprints.py deleted file mode 100644 index 5ea4169..0000000 --- a/experimental/generate_blueprints.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Generate task blueprints for each valid combination.""" - -import json -import logging -from typing import Callable - -from diverse_task_constants import BLOOMS_TAXONOMY, DIFFICULTY_LEVELS -from diverse_task_dataclasses import Blueprint, Capability, Combination -from diverse_task_prompts import format_blueprint_prompt - - -logger = logging.getLogger(__name__) - - -def generate_blueprints( - 
capability: Capability, - combinations: list[Combination], - call_llm: Callable, - config: dict, -) -> list[Blueprint]: - """Generate task blueprints for each valid combination.""" - logger.info("Generating task blueprints...") - - blueprints = [] - - for i, combo in enumerate(combinations): - logger.info( - f"Generating blueprint {i + 1}/{len(combinations)}: " - f"{combo.content} | {combo.difficulty} | {combo.reasoning}" - ) - - system_prompt, user_prompt = format_blueprint_prompt( - capability_name=capability.name, - capability_description=capability.description, - capability_domain=capability.domain, - capability_area=capability.area, - subtopic=combo.content, - difficulty=combo.difficulty, - difficulty_description=DIFFICULTY_LEVELS[combo.difficulty.lower()][ - "description" - ], - reasoning=combo.reasoning, - reasoning_description=BLOOMS_TAXONOMY[combo.reasoning]["description"], - ) - - response = call_llm( - system_prompt=system_prompt, - user_prompt=user_prompt, - response_format={"type": "json_object"}, - ) - - blueprint_data = json.loads(response) - blueprint = Blueprint( - combination_id=i, - subtopic=combo.content, - difficulty=combo.difficulty, - reasoning=combo.reasoning, - blueprint=blueprint_data["blueprint"], - rationale=combo.rationale, - ) - blueprints.append(blueprint) - - logger.info(f"Generated {len(blueprints)} blueprints") - - return blueprints diff --git a/experimental/generate_tasks.py b/experimental/generate_tasks.py deleted file mode 100644 index 4c89de8..0000000 --- a/experimental/generate_tasks.py +++ /dev/null @@ -1,85 +0,0 @@ -"""Generate multiple-choice questions for each blueprint.""" - -import json -import logging -from typing import Callable - -from diverse_task_dataclasses import Blueprint, Capability, Task -from diverse_task_prompts import format_task_prompt - - -logger = logging.getLogger(__name__) - - -def generate_tasks( - capability: Capability, - blueprints: list[Blueprint], - call_llm: Callable, - tasks_per_blueprint: int = 3, -) -> list[Task]: - """Generate multiple-choice questions for each blueprint.""" - logger.info("Generating tasks from blueprints...") - - all_tasks = [] - - for blueprint in blueprints: - logger.info( - f"Generating {tasks_per_blueprint} tasks for blueprint " - f"{blueprint.combination_id}: {blueprint.subtopic} | " - f"{blueprint.difficulty} | {blueprint.reasoning}" - ) - - # Generate multiple tasks for this blueprint - for j in range(tasks_per_blueprint): - task_id = f"task_{blueprint.combination_id}_{j}" - - try: - system_prompt, user_prompt = format_task_prompt( - capability_name=capability.name, - capability_description=capability.description, - capability_domain=capability.domain, - capability_area=capability.area, - blueprint_description=blueprint.blueprint, - ) - - response = call_llm( - system_prompt=system_prompt, - user_prompt=user_prompt, - response_format={"type": "json_object"}, - ) - - task_data = json.loads(response) - - # Create Task object - task = Task( - task_id=task_id, - blueprint_id=blueprint.combination_id, - subtopic=blueprint.subtopic, - difficulty=blueprint.difficulty, - reasoning=blueprint.reasoning, - question=task_data["question"], - choices=task_data["options"], - correct_answer=task_data["correct_answer"], - ) - all_tasks.append(task) - - except Exception as e: - logger.error(f" Failed to generate {task_id}: {e}") - # Create a task with error information - task = Task( - task_id=task_id, - blueprint_id=blueprint.combination_id, - subtopic=blueprint.subtopic, - difficulty=blueprint.difficulty, - 
reasoning=blueprint.reasoning, - question=f"ERROR: Failed to generate task - {str(e)}", - choices={"A": "N/A", "B": "N/A", "C": "N/A", "D": "N/A"}, - correct_answer="A", - ) - all_tasks.append(task) - - logger.info(f" Generated {tasks_per_blueprint} tasks") - - logger.info(f"Generated {len(all_tasks)} total tasks") - - return all_tasks diff --git a/experimental/model_utils.py b/experimental/model_utils.py deleted file mode 100644 index fdf0091..0000000 --- a/experimental/model_utils.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Utilities for LLM API calls.""" - -import logging -import time - -from openai import OpenAI - - -logger = logging.getLogger(__name__) - - -def call_model( - client: OpenAI, - system_prompt: str, - user_prompt: str, - model_name: str, - temperature: float, - max_tokens: int, - response_format: dict = None, - max_retries: int = 3, - retry_delay: float = 2.0, -) -> str: - """Call LLM API with given prompts and automatic retries.""" - messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ] - - kwargs = { - "model": model_name, - "messages": messages, - "temperature": temperature, - "max_tokens": max_tokens, - } - if response_format: - kwargs["response_format"] = response_format - - last_error = None - for attempt in range(max_retries): - try: - response = client.chat.completions.create(**kwargs) - content = response.choices[0].message.content - - if content is None: - raise ValueError("Model returned empty response") - - return content - - except Exception as e: - last_error = e - if attempt < max_retries - 1: - wait_time = retry_delay * (2**attempt) - logger.warning( - f"API call failed (attempt {attempt + 1}/{max_retries}): {e}" - ) - logger.info(f"Retrying in {wait_time:.1f} seconds...") - time.sleep(wait_time) - else: - logger.error(f"API call failed after {max_retries} attempts: {e}") - - raise last_error diff --git a/experimental/diverse_task_constants.py b/src/base_task_generation/diverse_task_constants.py similarity index 100% rename from experimental/diverse_task_constants.py rename to src/base_task_generation/diverse_task_constants.py diff --git a/experimental/diverse_task_dataclasses.py b/src/base_task_generation/diverse_task_dataclasses.py similarity index 100% rename from experimental/diverse_task_dataclasses.py rename to src/base_task_generation/diverse_task_dataclasses.py diff --git a/experimental/diverse_task_prompts.py b/src/base_task_generation/diverse_task_prompts.py similarity index 94% rename from experimental/diverse_task_prompts.py rename to src/base_task_generation/diverse_task_prompts.py index 3cb6c46..0d60a23 100644 --- a/experimental/diverse_task_prompts.py +++ b/src/base_task_generation/diverse_task_prompts.py @@ -16,7 +16,7 @@ The name, description, and domain/area of the capability will be provided. -Your goal is to decompose the capability into meaningful sub-topics that together provide full and balanced coverage of testing the given capability. +Your goal is to decompose the capability into meaningful sub-topics that together provide full and balanced coverage of testing the given capability. Generate between {min_subtopics} and {max_subtopics} sub-topics depending on the granularity and scope of the capability. Respond precisely in the following format, including the JSON start and end markers: @@ -25,7 +25,8 @@ "sub_topics": [ "", "", - "" + "", + ... 
] }}}} @@ -42,7 +43,7 @@ Capability Name: {capability_name} Capability Description: {capability_description} -Depending on the granularity of the capability, generate 2–10 sub-topics that comprehensively represent this capability. +Depending on the granularity of the capability, generate {min_subtopics}–{max_subtopics} sub-topics that comprehensively represent this capability. """ @@ -173,6 +174,11 @@ 1. What the learner is expected to do. 2. What kind of reasoning the task requires. 3. How difficulty manifests in the structure or context of the task. + +Respond in the following JSON format: +{{{{ + "blueprint": "" +}}}} """ @@ -333,13 +339,29 @@ def format_subtopic_prompt( - capability_name, capability_description, capability_domain, capability_area=None + capability_name, + capability_description, + capability_domain, + capability_area=None, + min_subtopics=3, + max_subtopics=8, ): - """Format subtopic extraction prompts.""" + """Format subtopic extraction prompts. + + Args: + capability_name: Name of the capability + capability_description: Description of the capability + capability_domain: Domain name + capability_area: Area name (optional) + min_subtopics: Minimum number of subtopics to generate + max_subtopics: Maximum number of subtopics to generate + """ area_text = capability_area if capability_area else "N/A" system_prompt = SUBTOPIC_SYSTEM_PROMPT.format( capability_domain=capability_domain, + min_subtopics=min_subtopics, + max_subtopics=max_subtopics, ) user_prompt = SUBTOPIC_USER_PROMPT_TEMPLATE.format( @@ -347,6 +369,8 @@ def format_subtopic_prompt( capability_description=capability_description, capability_domain=capability_domain, capability_area=area_text, + min_subtopics=min_subtopics, + max_subtopics=max_subtopics, ) return system_prompt, user_prompt diff --git a/src/base_task_generation/extract_subtopics.py b/src/base_task_generation/extract_subtopics.py new file mode 100644 index 0000000..294f7b4 --- /dev/null +++ b/src/base_task_generation/extract_subtopics.py @@ -0,0 +1,61 @@ +"""Extract sub-topics for a capability.""" + +import asyncio +import logging + +from autogen_core.models import ChatCompletionClient + +from src.base_task_generation.diverse_task_dataclasses import SubTopic +from src.base_task_generation.diverse_task_prompts import format_subtopic_prompt + + +logger = logging.getLogger(__name__) + + +def extract_subtopics( + capability, + client: ChatCompletionClient, + min_subtopics: int = 3, + max_subtopics: int = 8, +) -> list[SubTopic]: + """Extract sub-topics for the given capability. + + Args: + capability: Schema Capability object with area.domain.name, area.name, etc. 
+ client: ChatCompletionClient for API calls + min_subtopics: Minimum number of subtopics to generate + max_subtopics: Maximum number of subtopics to generate + """ + logger.info(f"Extracting sub-topics (range: {min_subtopics}-{max_subtopics}) ...") + + # Import here to avoid circular dependency + from src.utils.model_client_utils import ModelCallMode, async_call_model + + system_prompt, user_prompt = format_subtopic_prompt( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.area.domain.name, + capability_area=capability.area.name, + min_subtopics=min_subtopics, + max_subtopics=max_subtopics, + ) + + response = asyncio.run( + async_call_model( + client, + system_prompt=system_prompt, + user_prompt=user_prompt, + mode=ModelCallMode.JSON_PARSE, + ) + ) + + subtopic_names = response.get("sub_topics", []) + + # Create SubTopic objects + subtopics = [SubTopic(name=name) for name in subtopic_names] + + logger.info(f"Extracted {len(subtopics)} sub-topics:") + for st in subtopics: + logger.info(f" - {st.name}") + + return subtopics diff --git a/experimental/find_combinations.py b/src/base_task_generation/find_combinations.py similarity index 62% rename from experimental/find_combinations.py rename to src/base_task_generation/find_combinations.py index adf9d4f..c591e4d 100644 --- a/experimental/find_combinations.py +++ b/src/base_task_generation/find_combinations.py @@ -1,22 +1,36 @@ -"""Find valid combinations of (Content, Difficulty, Reasoning).""" +"""Find valid (Content, Difficulty, Reasoning) combinations.""" -import json +import asyncio import logging -from diverse_task_constants import BLOOMS_TAXONOMY, DIFFICULTY_LEVELS -from diverse_task_dataclasses import Capability, Combination, SubTopic -from diverse_task_prompts import format_combination_prompt +from autogen_core.models import ChatCompletionClient + +from src.base_task_generation.diverse_task_constants import ( + BLOOMS_TAXONOMY, + DIFFICULTY_LEVELS, +) +from src.base_task_generation.diverse_task_dataclasses import Combination, SubTopic +from src.base_task_generation.diverse_task_prompts import format_combination_prompt logger = logging.getLogger(__name__) def find_valid_combinations( - capability: Capability, subtopics: list[SubTopic], call_llm, config: dict + capability, subtopics: list[SubTopic], client: ChatCompletionClient ) -> list[Combination]: - """Find valid combinations of Content, Difficulty, and Reasoning.""" + """Find valid combinations of Content, Difficulty, and Reasoning. + + Args: + capability: Schema Capability object with area.domain.name, area.name, etc. 
+ subtopics: List of SubTopic objects + client: ChatCompletionClient for API calls + """ logger.info("Finding valid combinations...") + # Import here to avoid circular dependency + from src.utils.model_client_utils import ModelCallMode, async_call_model + # Get difficulty levels and reasoning types from constants difficulty_levels = list(DIFFICULTY_LEVELS.keys()) reasoning_types = list(BLOOMS_TAXONOMY.keys()) @@ -47,19 +61,21 @@ def find_valid_combinations( system_prompt, user_prompt = format_combination_prompt( capability_name=capability.name, capability_description=capability.description, - capability_domain=capability.domain, - capability_area=capability.area, + capability_domain=capability.area.domain.name, + capability_area=capability.area.name, content_list=content_list, ) - response = call_llm( - system_prompt=system_prompt, - user_prompt=user_prompt, - response_format={"type": "json_object"}, + response = asyncio.run( + async_call_model( + client, + system_prompt=system_prompt, + user_prompt=user_prompt, + mode=ModelCallMode.JSON_PARSE, + ) ) - result = json.loads(response) - combinations_data = result.get("valid_combinations", []) + combinations_data = response.get("valid_combinations", []) # Create Combination objects combinations = [ @@ -81,4 +97,4 @@ def find_valid_combinations( if len(combinations) > 5: logger.info(f" ... and {len(combinations) - 5} more") - return combinations + return combinations[0:1] diff --git a/src/base_task_generation/generate_blueprints.py b/src/base_task_generation/generate_blueprints.py new file mode 100644 index 0000000..89e3ebf --- /dev/null +++ b/src/base_task_generation/generate_blueprints.py @@ -0,0 +1,89 @@ +"""Generate task blueprints for each combination.""" + +import asyncio +import logging + +from autogen_core.models import ChatCompletionClient + +from src.base_task_generation.diverse_task_constants import ( + BLOOMS_TAXONOMY, + DIFFICULTY_LEVELS, +) +from src.base_task_generation.diverse_task_dataclasses import Blueprint, Combination +from src.base_task_generation.diverse_task_prompts import format_blueprint_prompt + + +logger = logging.getLogger(__name__) + + +def generate_blueprints( + capability, + combinations: list[Combination], + client: ChatCompletionClient, +) -> list[Blueprint]: + """Generate task blueprints for each valid combination. + + Args: + capability: Schema Capability object with area.domain.name, area.name, etc. 
+ combinations: List of Combination objects + client: ChatCompletionClient for API calls + """ + logger.info("Generating task blueprints...") + + # Import here to avoid circular dependency + from src.utils.model_client_utils import ModelCallMode, async_call_model + + blueprints = [] + + for i, combo in enumerate(combinations): + logger.info( + f"Generating blueprint {i + 1}/{len(combinations)}: " + f"{combo.content} | {combo.difficulty} | {combo.reasoning}" + ) + + system_prompt, user_prompt = format_blueprint_prompt( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.area.domain.name, + capability_area=capability.area.name, + subtopic=combo.content, + difficulty=combo.difficulty, + difficulty_description=DIFFICULTY_LEVELS[combo.difficulty.lower()][ + "description" + ], + reasoning=combo.reasoning, + reasoning_description=BLOOMS_TAXONOMY[combo.reasoning]["description"], + ) + + response = asyncio.run( + async_call_model( + client, + system_prompt=system_prompt, + user_prompt=user_prompt, + mode=ModelCallMode.JSON_PARSE, + ) + ) + + # Validate response has blueprint key + if "blueprint" not in response: + logger.error( + f"Response missing 'blueprint' key. Response keys: {response.keys()}" + ) + logger.error(f"Full response: {response}") + raise ValueError( + "Invalid blueprint response format: missing 'blueprint' key" + ) + + blueprint = Blueprint( + combination_id=i, + subtopic=combo.content, + difficulty=combo.difficulty, + reasoning=combo.reasoning, + blueprint=response["blueprint"], + rationale=combo.rationale, + ) + blueprints.append(blueprint) + + logger.info(f"Generated {len(blueprints)} blueprints") + + return blueprints diff --git a/src/base_task_generation/generate_tasks.py b/src/base_task_generation/generate_tasks.py new file mode 100644 index 0000000..35dffb2 --- /dev/null +++ b/src/base_task_generation/generate_tasks.py @@ -0,0 +1,117 @@ +"""Generate multiple-choice questions for each blueprint.""" + +import asyncio +import logging + +from autogen_core.models import ChatCompletionClient + +from src.base_task_generation.diverse_task_dataclasses import Blueprint +from src.base_task_generation.diverse_task_prompts import format_task_prompt + + +logger = logging.getLogger(__name__) + + +def generate_tasks( + capability, + blueprints: list[Blueprint], + client: ChatCompletionClient, + tasks_per_blueprint: int = 3, +): + """Generate multiple-choice questions for each blueprint. + + Args: + capability: Schema Capability object with area.domain.name, area.name, etc. 
+ blueprints: List of Blueprint objects + client: ChatCompletionClient for API calls + tasks_per_blueprint: Number of tasks to generate per blueprint + + Returns + ------- + List of schema TaskSolution objects + """ + logger.info("Generating tasks from blueprints...") + + # Import here to avoid circular dependency + from src.schemas.solution_schemas import TaskSolution + from src.schemas.task_schemas import Task + from src.utils.model_client_utils import ModelCallMode, async_call_model + + all_task_solutions = [] + + for blueprint in blueprints: + logger.info( + f"Generating {tasks_per_blueprint} tasks for blueprint " + f"{blueprint.combination_id}: {blueprint.subtopic} | " + f"{blueprint.difficulty} | {blueprint.reasoning}" + ) + + # Generate multiple tasks for this blueprint + for _j in range(tasks_per_blueprint): + task_id = f"task_{len(all_task_solutions):03d}" + + try: + system_prompt, user_prompt = format_task_prompt( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.area.domain.name, + capability_area=capability.area.name, + blueprint_description=blueprint.blueprint, + ) + + response = asyncio.run( + async_call_model( + client, + system_prompt=system_prompt, + user_prompt=user_prompt, + mode=ModelCallMode.JSON_PARSE, + ) + ) + + # Format the task question with multiple choice options + task_text = f"{response['question']}\n\n" + for choice_key, choice_text in response["options"].items(): + task_text += f"{choice_key}. {choice_text}\n" + + # Create metadata dict + generation_metadata = { + "method": "diverse_task_generation", + "blueprint_id": blueprint.combination_id, + "subtopic": blueprint.subtopic, + "difficulty": blueprint.difficulty, + "reasoning": blueprint.reasoning, + "correct_answer": response["correct_answer"], + "explanation": response.get("explanation", ""), + "alignment_notes": response.get("alignment_notes", ""), + } + + # Create schema Task object (without generation_metadata) + task = Task( + task_id=task_id, + task=task_text, + capability=capability, + ) + + # Create TaskSolution object with the task and metadata + task_solution = TaskSolution( + task_id=task_id, + task=task_text, + solution=response["correct_answer"], + reasoning=response.get("explanation", ""), + task_obj=task, + generation_metadata=generation_metadata, + ) + all_task_solutions.append(task_solution) + + except Exception as e: + logger.error(f" Failed to generate {task_id}: {e}") + # Skip failed tasks and continue + continue + + logger.info( + f" Generated {len([ts for ts in all_task_solutions if ts.generation_metadata.get('blueprint_id') == blueprint.combination_id])} task solutions for this blueprint" + ) + + logger.info(f"Generated {len(all_task_solutions)} total task solutions") + + return all_task_solutions diff --git a/experimental/verify_tasks.py b/src/base_task_generation/verify_tasks.py similarity index 100% rename from experimental/verify_tasks.py rename to src/base_task_generation/verify_tasks.py diff --git a/src/cfg/run_cfg.yaml b/src/cfg/run_cfg.yaml index ac1fcb1..b3957c2 100644 --- a/src/cfg/run_cfg.yaml +++ b/src/cfg/run_cfg.yaml @@ -58,17 +58,27 @@ subject_llm: prompt_cfg: sys_msg: Complete the given task to the best of your ability. 
+# Diverse task generation configuration +task_generation_cfg: + tasks_per_blueprint: 1 # Number of tasks to generate per blueprint + min_subtopics: 1 # Suggested minimum number of sub-topics + max_subtopics: 1 # Suggested maximum number of sub-topics + +# Task verification configuration +task_verification_cfg: + pass_threshold: 0.8 # Minimum pass rate to consider successful + strict_mode: false # If true, all alignment criteria must pass + capabilities_cfg: - capabilities_dir: /fs01/projects/aieng/public/ace/artifacts + capabilities_dir: ./ace-output/ results_dir: gs://ace-artifacts inspect_evals_dir: /fs01/projects/aieng/public/ace/inspect_evals/src/ace_evals - domain: math method: "hierarchical" num_seed_capabilities: 1 - num_gen_capabilities: 100 - num_gen_capabilities_buffer: 0.0 - num_capability_areas: 10 - num_gen_capabilities_per_run: 5 + num_capabilities: 2 + num_capabilities_buffer: 0.0 + num_areas: 3 + num_gen_capabilities_per_run: 1 num_gen_tasks_per_capability: 100 num_gen_tasks_buffer: 0.0 task_gen_few_shot: false @@ -102,7 +112,24 @@ dimensionality_reduction_cfg: exp_cfg: seed: 37 trial_run: false - exp_id: "" + exp_id: "farnaz_exp" + +# Stage control +stage: "all" # Which stage to run: 0, 1, 2, 3, 5, or "all" +areas_tag: null # Areas tag from Stage 1 (required for stage 2 standalone) +capabilities_tag: null # Capabilities tag from Stage 2 (required for stage 3 standalone) +tasks_tag: null # Tasks tag from Stage 3 (required for stage 5 standalone) +solution_tag: null # Solution tag from Stage 5 (optional for resume) + +# Debug settings +use_langchain: false # Set to false for easier debugging (disables LangChain features) + +# Global configuration + +global_cfg: + domain: personal finance + output_dir: non_agentic_output/ #Base output directory for all agentic outputs + pipeline_type: base defaults: - _self_ diff --git a/src/generate_capabilities.py b/src/generate_capabilities.py index 9dc954c..4696b87 100644 --- a/src/generate_capabilities.py +++ b/src/generate_capabilities.py @@ -1,324 +1,145 @@ """Generate capabilities using the scientist LLM.""" -import json +import asyncio import logging -import os -import shutil -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List import numpy as np -from langsmith import tracing_context -from tenacity import Retrying, stop_after_attempt +from autogen_core.models import ChatCompletionClient -from src.capability import Capability -from src.model import Model -from src.utils import constants, prompts -from src.utils.capability_management_utils import ( - _sample_seed_capabilities, - get_previous_capabilities, -) -from src.utils.capability_utils import extract_and_parse_response +from src.schemas.area_schemas import Area +from src.schemas.capability_schemas import Capability +from src.schemas.domain_schemas import Domain +from src.utils import prompts +from src.utils.model_client_utils import ModelCallMode, async_call_model logger = logging.getLogger(__name__) -def generate_capability_areas( - domain: str, +def generate_areas( + domain: Domain, num_areas: int, num_capabilities_per_area: int, - scientist_llm: Model, - user_prompt: str, - scientist_llm_gen_cfg: Dict[str, Any], - sys_prompt: Union[str, None] = None, - **kwargs: Any, -) -> Dict[str, Any]: + scientist_llm_client: ChatCompletionClient, +) -> List[Area]: """ - Generate capability areas for the specified domain. + Generate areas for the specified domain. Args ---- domain (str): The domain name. 
- num_areas (int): The number of capability areas to generate. + num_areas (int): The number of areas to generate. num_capabilities_per_area (int): The number of capabilities per area. - scientist_llm (Model): The scientist LLM model. - user_prompt (str): The user prompt for generating capability areas. - scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration - for the scientist LLM. - sys_prompt (str | None): The system prompt for the scientist LLM. - **kwargs (Any): Additional keyword arguments. + scientist_llm (ChatCompletionClient): The scientist LLM client. Returns ------- - Dict[str, Any]: A dictionary containing the generated capability areas + Dict[str, Any]: A dictionary containing the generated areas and metadata about the generation process. """ - logger.info(f"Generating {num_areas} capability areas ...") - # Generate output using the model with specified generation arguments - user_prompt = user_prompt.format( + logger.info(f"Generating {num_areas} areas ...") + user_prompt = prompts.AREAS_GENERATION_USER_PROMPT.format( num_areas=num_areas, num_capabilities_per_area=num_capabilities_per_area, - domain=domain, - response_json_format=prompts.CAPABILITY_AREAS_GENERATION_RESPONSE_JSON_FORMAT, + domain=domain.name, + response_json_format=prompts.AREAS_GENERATION_RESPONSE_JSON_FORMAT, ) - with tracing_context( - enabled=True, - tags=["generate_capability_areas"], - metadata={ - "ls_provider": scientist_llm.model_provider, - "ls_model_name": scientist_llm.get_model_name(with_provider=False), - "ls_model_type": "chat", - "exp_id": kwargs.get("run_id"), - "domain": domain, - "num_areas": num_areas, - **{f"ls_{k}": v for k, v in scientist_llm_gen_cfg.items()}, - }, - ): - response, metadata = scientist_llm.generate( - sys_prompt=sys_prompt if sys_prompt else "", + + # Use async_call_model with asyncio.run() for sync execution + response = asyncio.run( + async_call_model( + scientist_llm_client, + system_prompt="", user_prompt=user_prompt, - generation_config=scientist_llm_gen_cfg, + mode=ModelCallMode.JSON_PARSE, ) - - parsed_response = extract_and_parse_response(response, has_thought=False) - capability_areas = parsed_response["parsed_response"] - - logger.info( - f"Capability areas generation tokens summary:\n{json.dumps(metadata, indent=4)}" ) - if len(capability_areas) > num_areas: - logger.warning( - f"Generated {len(capability_areas)} capability areas, but only {num_areas} are needed. " - + f"Keeping the first {num_areas} areas." + areas = [] + for idx, area_name in enumerate(response.get("areas", [])): + area = Area( + name=area_name, + area_id=f"area_{idx:03d}", + domain=domain, + description="", ) - capability_areas = capability_areas[:num_areas] + areas.append(area) - logger.info(f"Generated capability areas:\n{capability_areas}") + logger.info(f"Generated {len(areas)} areas") - return { - "capability_areas": capability_areas, - "metadata": { - "model": scientist_llm.get_model_name(), - "api_metadata": metadata, - }, - } + return areas def generate_capabilities( - domain: str, + area: Area, num_capabilities: int, num_capabilities_per_run: int, - base_capability_dir: str, - scientist_llm: Model, - num_seed_capabilities: int, - scientist_llm_gen_cfg: Dict[str, Any], - method: str = "flat", - include_seed_capability_names: Optional[List[str]] = None, - exclude_seed_capability_names: Optional[List[str]] = None, - **kwargs: Any, + scientist_llm_client: ChatCompletionClient, ) -> List[Capability]: """ - Generate initial capabilities for the specified domain. 
+ Generate capabilities for a given area. Args ---- - domain (str): The domain name. + area (Area): The area object containing domain information. num_capabilities (int): The number of capabilities to generate. num_capabilities_per_run (int): The number of capabilities to generate per run. - base_capability_dir (str): The base directory to store - the generated capabilities for the specified domain. - scientist_llm (Model): The scientist LLM model. - num_seed_capabilities (int): The number of seed capabilities to use. + scientist_llm (ChatCompletionClient): The scientist LLM client. scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration for the scientist LLM. - method (str): The method to use for generating capabilities. - Choose from "flat" or "hierarchical". - include_seed_capability_names (List[str] | None): A list of seed capability - names to include in the generation process. - exclude_seed_capability_names (List[str] | None): A list of seed capability - names to exclude from the generation process. **kwargs (Any): Additional keyword arguments. Returns ------- List[Capability]: The generated capabilities. """ - gen_capabilities = [] - run_metadata = [] - - if method == "hierarchical": - assert "num_capability_areas" in kwargs, ( - "`num_capability_areas` should be specified for hierarchical generation." - ) - num_capability_areas = kwargs["num_capability_areas"] - assert num_capabilities >= num_capability_areas, ( - "Number of capabilities should be greater than or equal to the number of capability areas, " - + "so that each area can have at least one capability." - ) - # Uniformly distribute num_capabilities across num_capability_areas - num_capabilities_per_area = [ - num_capabilities // num_capability_areas - ] * num_capability_areas - for i in range(num_capabilities % num_capability_areas): - num_capabilities_per_area[i] += 1 - num_runs = [ - int(np.ceil(num / num_capabilities_per_run)) - for num in num_capabilities_per_area - ] - - # Generate capability areas for the specified domain - response = generate_capability_areas( - domain=domain, - num_areas=kwargs["num_capability_areas"], - num_capabilities_per_area=num_capabilities_per_area[0], - scientist_llm=scientist_llm, - user_prompt=prompts.HIERARCHICAL_CAPABILITY_AREAS_GENERATION_USER_PROMPT, - scientist_llm_gen_cfg=scientist_llm_gen_cfg, - **kwargs, - ) - capability_areas = response["capability_areas"] - # Select only the specified number of capability areas - # even if more are generated - capability_areas = capability_areas[:num_capability_areas] - else: - num_capabilities_per_area = [num_capabilities] - num_runs = [int(np.ceil(num_capabilities / num_capabilities_per_run))] - # No capability areas for flat generation, use the domain as the area - capability_areas = [domain] - - for idx, capability_area in enumerate(capability_areas): - if method == "hierarchical": - logger.info(f"Generating capabilities for area: {capability_area}") - # Fetch previously generated capabilities, if any - prev_capabilities = get_previous_capabilities( - capability_dir=base_capability_dir, capability_area=capability_area - ) - user_prompt = prompts.HIERARCHICAL_CAPABILITY_GENERATION_USER_PROMPT.format( - capability_area=capability_area, - ) - else: - prev_capabilities = get_previous_capabilities( - capability_dir=base_capability_dir - ) - user_prompt = prompts.CAPABILITY_GENERATION_USER_PROMPT - - # Add all seed capabilities to the list of prev_capabilities - seed_capability_dir = os.path.join( - constants.BASE_ARTIFACTS_DIR, 
"seed_capabilities", domain - ) - prev_capabilities.extend( - _sample_seed_capabilities( - seed_capability_dir=seed_capability_dir, - num_seed_capabilities=-1, - random_seed=int(kwargs.get("seed", constants.DEFAULT_RANDOM_SEED)), - ) - ) + capabilities = [] - num_capabilities_left = num_capabilities_per_area[idx] - for run_id in range(num_runs[idx]): - logger.info(f"Run ID: {run_id}") - # Generate capabilities using the scientist LLM - - response = generate_capabilities_using_llm( - domain=domain, - num_capabilities=min( - num_capabilities_per_run, - num_capabilities_left, - ), - scientist_llm=scientist_llm, - sys_prompt=prompts.CAPABILITY_GENERATION_SYSTEM_PROMPT, - user_prompt=user_prompt, - num_seed_capabilities=num_seed_capabilities, - seed_capability_dir=seed_capability_dir, - prev_capabilities=prev_capabilities, - scientist_llm_gen_cfg=scientist_llm_gen_cfg, - base_capability_dir=base_capability_dir, - include_seed_capability_names=include_seed_capability_names, - exclude_seed_capability_names=exclude_seed_capability_names, - capability_area=capability_area if method == "hierarchical" else None, - local_run_id=run_id, - **kwargs, - ) - gen_capabilities.extend(response["capabilities"]) - num_capabilities_left -= len(response["capabilities"]) - run_metadata.append(response["metadata"]) + # Calculate number of runs needed + num_runs = int(np.ceil(num_capabilities / num_capabilities_per_run)) - # Update the list of previously generated capabilities - prev_capabilities.extend(response["capabilities"]) + # Generate capabilities in batches + num_capabilities_left = num_capabilities + for run in range(num_runs): + logger.info(f"Capability generation for area: {area.name} at run {run}") - # Analyze tokens metadata for capability generation - total_input_tokens = sum([m["api_metadata"]["input_tokens"] for m in run_metadata]) - total_output_tokens = sum( - [m["api_metadata"]["output_tokens"] for m in run_metadata] - ) - tokens_summary = { - "total_input_tokens": total_input_tokens, - "total_output_tokens": total_output_tokens, - "total_tokens": total_input_tokens + total_output_tokens, - "input_tokens_per_run": int(total_input_tokens / sum(num_runs)), - "output_tokens_per_run": int(total_output_tokens / sum(num_runs)), - "total_tokens_per_run": int( - (total_input_tokens + total_output_tokens) / sum(num_runs) - ), - "input_tokens_per_capability": int(total_input_tokens / len(gen_capabilities)), - "output_tokens_per_capability": int( - total_output_tokens / len(gen_capabilities) - ), - "total_tokens_per_capability": int( - (total_input_tokens + total_output_tokens) / len(gen_capabilities) - ), - } - logger.info( - f"Capability generation tokens summary:\n{json.dumps(tokens_summary, indent=4)}" - ) + run_capabilities = generate_capabilities_using_llm( + area=area, + num_capabilities=min(num_capabilities_per_run, num_capabilities_left), + scientist_llm_client=scientist_llm_client, + prev_capabilities=capabilities, + ) + capabilities.extend(run_capabilities) + num_capabilities_left -= len(run_capabilities) - return gen_capabilities + return capabilities def generate_capabilities_using_llm( - domain: str, + area: Area, num_capabilities: int, - scientist_llm: Model, - sys_prompt: str, - user_prompt: str, - num_seed_capabilities: int, - seed_capability_dir: str, + scientist_llm_client: ChatCompletionClient, prev_capabilities: List[Capability], - scientist_llm_gen_cfg: Dict[str, Any], - base_capability_dir: str, - include_seed_capability_names: Optional[List[str]] = None, - 
exclude_seed_capability_names: Optional[List[str]] = None, - capability_area: Union[str, None] = None, - **kwargs: Any, ) -> Dict[str, Any]: """ Generate capabilities using the scientist LLM. - Prompt the scientist LLM with instructions and - seed capabilities for the specified domain - to generate initial capabilities. + Prompt the scientist LLM with instructions to generate initial capabilities. Args ---- - domain (str): The domain name. + domain_name (str): The domain name. + area_name (str): The area name. + area (Area): The area object. num_capabilities (int): The number of capabilities to generate. - scientist_llm (Model): The scientist LLM model name. + scientist_llm (ChatCompletionClient): The scientist LLM client. + scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration + for the scientist LLM. sys_prompt (str): The system prompt. user_prompt (str): The user prompt. - num_seed_capabilities (int): The number of seed capabilities to use. - seed_capability_dir (str): The directory containing the seed capabilities. prev_capabilities (List[Capability]): The list of previously generated capabilities. - scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration - for the scientist LLM. - base_capability_dir (str): The base directory to store - the generated capabilities for the specified domain. - include_seed_capability_names (List[str] | None): A list of seed capability - names to include in the generation process. - exclude_seed_capability_names (List[str] | None): A list of seed capability - names to exclude from the generation process. - capability_area (str | None): The capability area for the generation **kwargs (Any): Additional keyword arguments. Returns @@ -326,143 +147,53 @@ def generate_capabilities_using_llm( Dict[str, Any]: A dictionary containing the generated capabilities and metadata about the generation process. 
""" - # Sample seed capabilities for the generation process - seed_capabilities = _sample_seed_capabilities( - seed_capability_dir=seed_capability_dir, - num_seed_capabilities=num_seed_capabilities, - include_capability_names=include_seed_capability_names, - exclude_capability_names=exclude_seed_capability_names, - random_seed=int(kwargs.get("seed", constants.DEFAULT_RANDOM_SEED)), - ) - # Get capability JSON strings (without scores) - seed_capabilities_repr = [ - capability.to_json_str() for capability in seed_capabilities - ] - - # LLM input - user_prompt = user_prompt.format( - sample_capability_json="\n".join(seed_capabilities_repr), + sys_prompt = prompts.CAPABILITY_GENERATION_SYSTEM_PROMPT + user_prompt = prompts.HIERARCHICAL_CAPABILITY_GENERATION_USER_PROMPT.format( + area=area.name, + domain=area.domain.name, + num_capabilities=num_capabilities, prev_capabilities="\n".join([elm.name for elm in prev_capabilities]), - domain=domain, - num_gen_capabilities=num_capabilities, ) - # Generate output using the model with specified generation arguments - num_attempts = kwargs.get( - "retry_attempts", constants.DEFAULT_CAPABILITY_GENERATION_RETRY_ATTEMPTS - ) - try: - # Retry the generation process if an error occurs - # Common errors: - # - [ill-formatted python class] - # - SyntaxError: unterminated triple-quoted string literal - for attempt in Retrying( - stop=stop_after_attempt(num_attempts), - reraise=True, - ): - with attempt: - # Update the seed for each attempt - scientist_llm_gen_cfg["seed"] += 1 - with tracing_context( - enabled=True, - tags=["generate_capabilities_using_llm"], - metadata={ - "ls_provider": scientist_llm.model_provider, - "ls_model_name": scientist_llm.get_model_name( - with_provider=False - ), - "ls_model_type": "chat", - "exp_id": kwargs.get("run_id"), - "run_id": kwargs.get("local_run_id"), - "domain": domain, - "capability_area": capability_area, - "num_capabilities": num_capabilities, - "seed_capabilities": [elm.name for elm in seed_capabilities], - "prev_capabilities": [elm.name for elm in prev_capabilities], - **{f"ls_{k}": v for k, v in scientist_llm_gen_cfg.items()}, - }, - ): - response, metadata = scientist_llm.generate( - sys_prompt=sys_prompt, - user_prompt=user_prompt, - generation_config=scientist_llm_gen_cfg, - ) - - parsed_response = extract_and_parse_response(response) - gen_capabilities = parsed_response["parsed_response"] - # Convert JSON string to dict if needed - gen_capabilities_dict = [] - for capability in gen_capabilities: - if isinstance(capability, dict): - capability_dict = capability - elif isinstance(capability, str): - try: - capability_dict = json.loads(capability) - except json.JSONDecodeError as e: - logger.warning( - f"Error decoding JSON string: {capability}: {repr(e)}" - ) - continue - else: - logger.warning( - f"Invalid capability format: {capability}. Expected str or dict." - ) - continue - gen_capabilities_dict.append(capability_dict) - gen_capabilities_clean = [] - for capability in gen_capabilities_dict: - try: - if capability_area is not None: - # Add the capability area to the generated capabilities - capability["area"] = capability_area - capability_obj = Capability.from_dict( - capability_dict=capability, - base_dir=base_capability_dir, - score_dir_suffix=(kwargs.get("run_id")), - ) - except FileExistsError: - # 1. Same name as existing capability - # Do not delete the capability directory if it already exists - logger.warning( - f"Capability {capability['name']} already exists. Skipping it." 
- ) - # Skip this capability - continue - except Exception as e: - # 2. "problem" replaced with "riddle" or some other keyword - # leads to KeyError - # 3. Ill-formatted `capability.py` file due to missing quotes - logger.warning( - f"Error creating capability object {capability['name']}, hence skipping it: {e}" - ) - # Delete the capability directory if it exists - capability_dir = os.path.join( - base_capability_dir, capability["name"] - ) - if os.path.exists(capability_dir): - shutil.rmtree(capability_dir) - # Skip this capability - continue - else: - gen_capabilities_clean.append(capability_obj) - if len(gen_capabilities_clean) != len(gen_capabilities): - logger.warning( - f"Only {len(gen_capabilities_clean)} capabilities were created out of {len(gen_capabilities)} generated capabilities." - ) - except Exception as e: - logger.error(f"Error generating capabilities: {e}") - logger.error(f"Response:\n{response}") - raise e - - logger.info( - f"Generated {len(gen_capabilities_clean)} capabilities:\n{gen_capabilities_clean}" + # Use async_call_model with asyncio.run() for sync execution + # Retry logic is handled inside async_call_model + response = asyncio.run( + async_call_model( + scientist_llm_client, + system_prompt=sys_prompt, + user_prompt=user_prompt, + mode=ModelCallMode.JSON_PARSE, + ) ) - return { - "capabilities": gen_capabilities_clean, - "metadata": { - "model": scientist_llm.get_model_name(), - "thought": parsed_response["thought"], - "api_metadata": metadata, - }, - } + # Response is already a parsed dict from JSON_PARSE mode + gen_capabilities_dict = response.get("capabilities", []) + capabilities = [] + + for idx, capability_dict in enumerate(gen_capabilities_dict): + try: + # Create capability object without saving to disk + capability_id = f"cap_{idx:03d}" + capability = Capability( + name=capability_dict["name"], + capability_id=capability_id, + area=area, + description=capability_dict["description"], + ) + except Exception as e: + logger.warning( + f"Error creating capability object {capability_dict['name']}, hence skipping it: {e}" + ) + # Skip this capability + continue + else: + capabilities.append(capability) + + if len(capabilities) != len(gen_capabilities_dict): + logger.warning( + f"Only {len(capabilities)} capabilities were created out of {len(gen_capabilities_dict)} generated capabilities." + ) + + logger.info(f"Generated {len(capabilities)} capabilities.") + + return capabilities diff --git a/src/generate_diverse_tasks.py b/src/generate_diverse_tasks.py new file mode 100644 index 0000000..a3fd231 --- /dev/null +++ b/src/generate_diverse_tasks.py @@ -0,0 +1,68 @@ +"""Generate diverse tasks using base task generation multi-dimensional approach. + +This module adapts the base task generator for use in Stage 3 +of the standardized pipeline. 
+""" + +import logging +from typing import List + +from autogen_core.models import ChatCompletionClient + +# Import base task generation components +from src.base_task_generation.extract_subtopics import extract_subtopics +from src.base_task_generation.find_combinations import find_valid_combinations +from src.base_task_generation.generate_blueprints import generate_blueprints +from src.base_task_generation.generate_tasks import generate_tasks + +# Import schema objects +from src.schemas.capability_schemas import Capability +from src.schemas.solution_schemas import TaskSolution + + +logger = logging.getLogger(__name__) + + +def generate_diverse_tasks_for_capability( + capability: Capability, + tasks_per_blueprint: int, + client: ChatCompletionClient, + min_subtopics: int = 3, + max_subtopics: int = 8, +) -> List[TaskSolution]: + """Generate diverse tasks with solutions for a single capability using experimental method. + + Args: + capability: Schema Capability object + tasks_per_blueprint: Number of tasks to generate per blueprint + client: ChatCompletionClient from get_standard_model_client + min_subtopics: Minimum number of subtopics to generate + max_subtopics: Maximum number of subtopics to generate + + Returns + ------- + List of schema TaskSolution objects + """ + logger.info(f"Generating diverse tasks for capability: {capability.name}") + + # Step 1: Extract sub-topics + logger.info("Step 1: Extracting sub-topics") + subtopics = extract_subtopics(capability, client, min_subtopics, max_subtopics) + logger.info(f"Extracted {len(subtopics)} sub-topics") + + # Step 2: Find valid combinations + logger.info("Step 2: Finding valid combinations") + combinations = find_valid_combinations(capability, subtopics, client) + logger.info(f"Found {len(combinations)} valid combinations") + + # Step 3: Generate blueprints + logger.info("Step 3: Generating blueprints") + blueprints = generate_blueprints(capability, combinations, client) + logger.info(f"Generated {len(blueprints)} blueprints") + + # Step 4: Generate tasks (returns schema TaskSolution objects directly) + logger.info("Step 4: Generating tasks with solutions") + task_solutions = generate_tasks(capability, blueprints, client, tasks_per_blueprint) + logger.info(f"Generated {len(task_solutions)} task solutions") + + return task_solutions diff --git a/src/model.py b/src/model.py index 4a76fc0..3449407 100644 --- a/src/model.py +++ b/src/model.py @@ -10,7 +10,7 @@ from langchain_anthropic import ChatAnthropic from langchain_google_genai import ChatGoogleGenerativeAI from langchain_openai import ChatOpenAI -from langsmith import traceable +# from langsmith import traceable # COMMENTED OUT FOR DEBUGGING from pydantic import SecretStr from ratelimit import limits, sleep_and_retry @@ -64,7 +64,7 @@ def _set_llm( @sleep_and_retry # type: ignore @limits(**RATE_LIMIT) # type: ignore - @traceable + # @traceable # COMMENTED OUT FOR DEBUGGING def generate( self, sys_prompt: str, user_prompt: str, generation_config: Dict[str, Any] ) -> Tuple[str | None, Dict[str, int | Any]]: @@ -113,7 +113,7 @@ def generate( @sleep_and_retry # type: ignore @limits(**RATE_LIMIT) # type: ignore - @traceable + # @traceable # COMMENTED OUT FOR DEBUGGING async def async_generate( self, sys_prompt: str, user_prompt: str, generation_config: Dict[str, Any] ) -> Tuple[str | None, Dict[str, int | Any]]: diff --git a/src/run_capability_generation.py b/src/run_capability_generation.py deleted file mode 100644 index 3893432..0000000 --- a/src/run_capability_generation.py +++ 
/dev/null @@ -1,150 +0,0 @@ -"""Script to generate capabilities and tasks using the scientist LLM.""" - -import logging -import os - -import hydra -from omegaconf import DictConfig - -from src.generate_capabilities import ( - generate_capabilities, -) -from src.generate_tasks import ( - generate_tasks_using_llm, -) -from src.model import Model -from src.utils import constants -from src.utils.capability_management_utils import ( - filter_capabilities, - get_previous_capabilities, -) -from src.utils.data_utils import check_cfg, get_run_id -from src.utils.embedding_utils import ( - generate_and_set_capabilities_embeddings, -) - - -@hydra.main(version_base=None, config_path="cfg", config_name="run_cfg") -def main(cfg: DictConfig) -> None: - """ - Run capability generation with the specified configuration. - - This includes generating capabilities and generating, solving and verifying tasks. - - Args: - cfg (DictConfig): Configuration for the model. - """ - check_cfg(cfg, logger) - run_id = get_run_id(cfg) - logger.info(f"Run ID: {run_id}") - - # Initialize the scientist LLM model - scientist_llm = Model( - model_name=cfg.scientist_llm.name, - model_provider=cfg.scientist_llm.provider, - ) - scientist_llm_gen_cfg = cfg.scientist_llm.generation_cfg - - # Generate initial capabilities - # Set the base capability directory - base_capability_dir = os.path.join( - constants.BASE_ARTIFACTS_DIR, - f"capabilities_{run_id}", - cfg.capabilities_cfg.domain, - ) - target_num_capabilities = cfg.capabilities_cfg.num_gen_capabilities - if os.path.exists(base_capability_dir): - # Fetch previously generated capabilities - logger.info( - f"Base capability directory already exists: {base_capability_dir}. " - "Fetching previously generated capabilities." - ) - capabilities = get_previous_capabilities(capability_dir=base_capability_dir) - else: - os.makedirs(base_capability_dir, exist_ok=False) - logger.info("Starting capability generation ...") - num_capabilities = int( - target_num_capabilities - * (1 + cfg.capabilities_cfg.num_gen_capabilities_buffer) - ) - - capabilities = generate_capabilities( - domain=cfg.capabilities_cfg.domain, - num_capabilities=num_capabilities, - num_capabilities_per_run=cfg.capabilities_cfg.num_gen_capabilities_per_run, - base_capability_dir=base_capability_dir, - scientist_llm=scientist_llm, - num_seed_capabilities=cfg.capabilities_cfg.num_seed_capabilities, - scientist_llm_gen_cfg=dict(scientist_llm_gen_cfg.capability_generation), - method=cfg.capabilities_cfg.method, - num_capability_areas=cfg.capabilities_cfg.num_capability_areas, - exclude_seed_capability_names=["word_problems"], - run_id=run_id, - trial_run=cfg.exp_cfg.trial_run, - seed=cfg.exp_cfg.seed, - retry_attempts=cfg.capabilities_cfg.capabilities_gen_retry_attempts, - ) - capabilities = sorted(capabilities, key=lambda x: x.name) - logger.info(f"Capability names ({len(capabilities)}):\n{capabilities}") - if len(capabilities) < target_num_capabilities: - logger.warning( - f"Only {len(capabilities)} capabilities were created. " - f"Target number of capabilities not reached: {target_num_capabilities}. " - "It is recommended to increase the buffer." 
- ) - - task_gen_prompt_version = cfg.capabilities_cfg.task_gen_prompt_version - - if "mock_seed_capabilities" in run_id: - # Use all capabilities (skip filtering) for the mock_seed_capabilities runs - filtered_capabilities = capabilities - if "v2" in run_id: - # Use the task generation prompt version 2 for mock seed capabilities v2 - task_gen_prompt_version = "v2" - else: - # Embed capabilities using openai embedding model - generate_and_set_capabilities_embeddings( - capabilities=capabilities, - embedding_model_name=cfg.embedding_cfg.embedding_model, - embed_dimensions=cfg.embedding_cfg.embedding_size, - ) - # Filter capabilities based on their embeddings - filtered_capabilities = filter_capabilities( - capabilities, - embedding_model_name=cfg.embedding_cfg.embedding_model, - similarity_threshold=cfg.embedding_cfg.filtering_similarity_threshold, - ) - logger.info( - f"Capabilities retained after filtering ({len(filtered_capabilities)}/{len(capabilities)}): {filtered_capabilities}" - ) - - for capability in filtered_capabilities: - # Generate tasks for each capability - generate_tasks_using_llm( - capability=capability, - scientist_llm=scientist_llm, - num_tasks=cfg.capabilities_cfg.num_gen_tasks_per_capability, - num_tasks_buffer=cfg.capabilities_cfg.num_gen_tasks_buffer, - scientist_llm_gen_cfg_task_gen=dict(scientist_llm_gen_cfg.task_generation), - scientist_llm_gen_cfg_task_solve=dict(scientist_llm_gen_cfg.task_solve), - scientist_llm_gen_cfg_task_verify=dict(scientist_llm_gen_cfg.task_verify), - solve_sample_tasks=True, - few_shot=cfg.capabilities_cfg.task_gen_few_shot, - run_id=run_id, - tasks_gen_retry_attempts=cfg.capabilities_cfg.tasks_gen_retry_attempts, - concurrency_task_solver=cfg.capabilities_cfg.concurrency_task_solver, - concurrency_task_verifier=cfg.capabilities_cfg.concurrency_task_verifier, - seed=cfg.exp_cfg.seed, - task_gen_prompt_version=task_gen_prompt_version, - ) - if cfg.exp_cfg.trial_run: - logger.info( - f"Trial run enabled. Stopping after generating tasks for {capability.name}." - ) - break - - -if __name__ == "__main__": - logger = logging.getLogger(__name__) - - main() diff --git a/src/run_generation_pipeline.py b/src/run_generation_pipeline.py new file mode 100644 index 0000000..f2087cb --- /dev/null +++ b/src/run_generation_pipeline.py @@ -0,0 +1,829 @@ +"""Script to generate capabilities (Stage 2) and tasks (Stage 3). 
+
+This module keeps the existing behavior but makes the flow explicit:
+- Stage 0: setup (config validation, run id, model init)
+- Stage 1: generate capability areas for the domain
+- Stage 2: generate capabilities, embed + filter
+- Stage 3: generate tasks (with solutions) for retained capabilities
+- Stage 5: validate generated task solutions
+
+Usage:
+    # Run specific stage using Hydra override syntax
+    python -m src.run_generation_pipeline stage=0
+    python -m src.run_generation_pipeline stage=1
+    python -m src.run_generation_pipeline stage=2 areas_tag=_20251211_214002
+    python -m src.run_generation_pipeline stage=3 capabilities_tag=_20251211_220000
+
+    # Run all stages
+    python -m src.run_generation_pipeline stage=all
+    python -m src.run_generation_pipeline  # defaults to "all"
+"""
+
+import json
+import logging
+import math
+from datetime import datetime
+from pathlib import Path
+
+import hydra
+from omegaconf import DictConfig, OmegaConf
+
+from src.generate_capabilities import generate_areas, generate_capabilities
+from src.generate_diverse_tasks import generate_diverse_tasks_for_capability
+from src.schemas.domain_schemas import Domain
+from src.schemas.experiment_schemas import Experiment
+from src.schemas.io_utils import (
+    load_areas,
+    load_capabilities,
+    load_domain,
+    save_areas,
+    save_capabilities,
+    save_domain,
+    save_experiment,
+    save_solution,
+    save_validation,
+)
+from src.schemas.metadata_schemas import PipelineMetadata
+from src.schemas.validation_schemas import ValidationResult
+from src.utils import constants
+from src.utils.capability_management_utils import (
+    filter_schema_capabilities_by_embeddings,
+)
+from src.utils.data_utils import check_cfg
+from src.utils.embedding_utils import (
+    generate_schema_capabilities_embeddings,
+)
+from src.utils.model_client_utils import get_standard_model_client
+
+
+logger = logging.getLogger(__name__)
+
+
+def stage0_setup(
+    cfg: DictConfig,
+) -> None:
+    """Stage 0: basic setup (config check, run id, base dir, schema files)."""
+    check_cfg(cfg, logger)
+    exp_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+    domain_name = cfg.global_cfg.domain
+    pipeline_type = cfg.global_cfg.pipeline_type
+    logger.info(
+        "Stage 0: exp_id=%s | domain=%s | output_base_dir=%s | pipeline_type=%s",
+        exp_id,
+        domain_name,
+        output_base_dir,
+        pipeline_type,
+    )
+
+    domain_id = "domain_000"
+    domain_obj = Domain(
+        name=domain_name,
+        domain_id=domain_id,
+        description=None,
+    )
+
+    # Convert entire config to dictionary for experiment configuration
+    config_dict = OmegaConf.to_container(cfg, resolve=True)
+
+    experiment_obj = Experiment(
+        experiment_id=exp_id,
+        domain=domain_name,
+        domain_id=domain_id,
+        pipeline_type=pipeline_type,
+        configuration=config_dict,
+    )
+
+    metadata = PipelineMetadata(
+        experiment_id=exp_id,
+        output_base_dir=str(output_base_dir),
+        timestamp=_iso_timestamp(),
+        input_stage_tag=None,
+        output_stage_tag=None,
+        resume=False,
+    )
+    save_experiment(
+        experiment=experiment_obj,
+        metadata=metadata,
+        output_path=output_base_dir / exp_id / "experiment.json",
+    )
+    save_domain(
+        domain=domain_obj,
+        metadata=metadata,
+        output_path=output_base_dir / exp_id / "domain" / "domain.json",
+    )
+    logger.info(
+        "Stage 0: saved experiment and domain artifacts under %s", output_base_dir
+    )
+
+
+def _timestamp_tag() -> str:
+    """Return a timestamp tag in `_YYYYMMDD_HHMMSS` format."""
+    return f"_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
+
+
+def _iso_timestamp() -> str:
+    """Return an ISO 8601 formatted timestamp with UTC timezone."""
+    return datetime.utcnow().isoformat() + "Z"
+
+
+def stage1_generate_areas(cfg: DictConfig) -> str:
+    """
+    Stage 1: Generate capability areas using hierarchical method.
+
+    Uses LLM to generate multiple areas within the domain.
+
+    Args:
+        cfg: The configuration object
+
+    Returns
+    -------
+    The areas_tag for this generation
+    """
+    experiment_id = cfg.exp_cfg.exp_id
+    output_base_dir = Path(cfg.global_cfg.output_dir)
+
+    # Load domain from Stage 0 output
+    domain_path = output_base_dir / experiment_id / "domain" / "domain.json"
+    domain, _ = load_domain(domain_path)
+
+    # Initialize scientist LLM client directly with generation parameters
+    scientist_llm_gen_cfg = dict(cfg.scientist_llm.generation_cfg.capability_generation)
+    scientist_llm_client = get_standard_model_client(
+        cfg.scientist_llm.name,
+        seed=scientist_llm_gen_cfg.get("seed", cfg.exp_cfg.seed),
+        temperature=scientist_llm_gen_cfg.get(
+            "temperature", constants.DEFAULT_TEMPERATURE
+        ),
+        max_tokens=scientist_llm_gen_cfg.get(
+            "max_tokens", constants.DEFAULT_MAX_TOKENS
+        ),
+    )
+
+    num_areas = cfg.capabilities_cfg.num_areas
+    logger.info(f"Generating {num_areas} capability areas for domain: {domain.name}")
+
+    areas = generate_areas(
+        domain=domain,
+        num_areas=num_areas,
+        num_capabilities_per_area=cfg.capabilities_cfg.num_capabilities // num_areas,
+        scientist_llm_client=scientist_llm_client,
+    )
+
+    # Keep only the requested number of areas
+    if len(areas) > num_areas:
+        logger.warning(
+            f"Generated {len(areas)} areas, but only {num_areas} are needed. "
+            + f"Keeping the first {num_areas} areas."
+        )
+        areas = areas[:num_areas]
+
+    # Save areas
+    areas_tag = _timestamp_tag()
+    areas_path = output_base_dir / experiment_id / "areas" / areas_tag / "areas.json"
+    metadata = PipelineMetadata(
+        experiment_id=experiment_id,
+        output_base_dir=str(output_base_dir),
+        timestamp=_iso_timestamp(),
+        input_stage_tag=None,
+        output_stage_tag=areas_tag,
+        resume=False,
+    )
+    save_areas(areas, metadata, areas_path)
+    logger.info(f"Stage 1: saved {len(areas)} areas to {areas_path}")
+    return areas_tag
+
+
+def stage2_generate_and_filter_capabilities(
+    cfg: DictConfig,
+    areas_tag: str,
+    capabilities_tag: str = None,
+) -> str:
+    """Stage 2: generate capabilities, embed, and filter per schema intent.
+
+    Args:
+        cfg: The configuration object
+        areas_tag: The tag from Stage 1 to load areas from
+        capabilities_tag: Optional resume tag. If provided, resumes from existing tag.
+ + Returns + ------- + The capabilities_tag for this generation + """ + experiment_id = cfg.exp_cfg.exp_id + output_base_dir = Path(cfg.global_cfg.output_dir) + + # Load areas from Stage 1 output + areas_path = output_base_dir / experiment_id / "areas" / areas_tag / "areas.json" + areas, _ = load_areas(areas_path) + logger.info(f"Loaded {len(areas)} area(s) from Stage 1") + + # Initialize scientist LLM client directly with generation parameters + scientist_llm_gen_cfg = dict(cfg.scientist_llm.generation_cfg.capability_generation) + scientist_llm_client = get_standard_model_client( + cfg.scientist_llm.name, + seed=scientist_llm_gen_cfg.get("seed", cfg.exp_cfg.seed), + temperature=scientist_llm_gen_cfg.get( + "temperature", constants.DEFAULT_TEMPERATURE + ), + max_tokens=scientist_llm_gen_cfg.get( + "max_tokens", constants.DEFAULT_MAX_TOKENS + ), + ) + + # Determine capabilities tag (resume or new) + is_resume = capabilities_tag is not None + if is_resume: + logger.info(f"Resuming Stage 2 with capabilities_tag: {capabilities_tag}") + else: + capabilities_tag = _timestamp_tag() + logger.info(f"Starting new Stage 2 with capabilities_tag: {capabilities_tag}") + + # Calculate target capabilities per area + target_num_capabilities_per_area = math.ceil( + cfg.capabilities_cfg.num_capabilities / len(areas) + ) + num_capabilities_per_area = int( + target_num_capabilities_per_area + * (1 + cfg.capabilities_cfg.num_capabilities_buffer) + ) + + # Process each area + for area in areas: + # Check if capabilities already exist for this area (resume logic) + capabilities_path = ( + output_base_dir + / experiment_id + / "capabilities" + / capabilities_tag + / area.area_id + / "capabilities.json" + ) + + if is_resume and capabilities_path.exists(): + logger.info( + f"Skipping area {area.name} ({area.area_id}) - capabilities already exist at {capabilities_path}" + ) + continue + + logger.info(f"Generating capabilities for area: {area.name} ({area.area_id})") + + # Generate capabilities using existing function + capabilities = generate_capabilities( + area=area, + num_capabilities=num_capabilities_per_area, + num_capabilities_per_run=cfg.capabilities_cfg.num_gen_capabilities_per_run, + scientist_llm_client=scientist_llm_client, + ) + + # Sort capabilities + capabilities = sorted(capabilities, key=lambda x: x.name) + if len(capabilities) < target_num_capabilities_per_area: + logger.warning( + f"Only {len(capabilities)} capabilities were created. " + f"Target number not reached: {target_num_capabilities_per_area}. " + "It is recommended to increase the buffer." 
+            )
+
+        # Generate embeddings for schema capabilities
+        embeddings = generate_schema_capabilities_embeddings(
+            capabilities=capabilities,
+            embedding_model_name=cfg.embedding_cfg.embedding_model,
+            embed_dimensions=cfg.embedding_cfg.embedding_size,
+        )
+
+        # Filter capabilities based on embedding similarity
+        filtered_capabilities, retained_indices = (
+            filter_schema_capabilities_by_embeddings(
+                capabilities=capabilities,
+                embeddings=embeddings,
+                similarity_threshold=cfg.embedding_cfg.filtering_similarity_threshold,
+            )
+        )
+
+        logger.info(
+            f"Capabilities retained after filtering: {len(filtered_capabilities)}/{len(capabilities)}"
+        )
+
+        # Record the index each retained capability had in the pre-filter list
+        for idx, cap in enumerate(filtered_capabilities):
+            cap.generation_metadata = {
+                "embedding_model": cfg.embedding_cfg.embedding_model,
+                "similarity_threshold": cfg.embedding_cfg.filtering_similarity_threshold,
+                "original_index": retained_indices[idx],
+            }
+
+        # Save capabilities for this area
+        metadata = PipelineMetadata(
+            experiment_id=experiment_id,
+            output_base_dir=str(output_base_dir),
+            timestamp=_iso_timestamp(),
+            input_stage_tag=areas_tag,
+            output_stage_tag=capabilities_tag,
+            resume=is_resume,
+        )
+
+        save_capabilities(filtered_capabilities, metadata, capabilities_path)
+        logger.info(
+            f"Stage 2: saved {len(filtered_capabilities)} capabilities to {capabilities_path}"
+        )
+
+    return capabilities_tag
+
+
+def stage3_generate_tasks(
+    cfg: DictConfig,
+    capabilities_tag: str,
+    tasks_tag: str = None,
+) -> str:
+    """Stage 3: Generate diverse tasks with solutions for each capability.
+
+    Generates tasks using the diverse task generation method and creates
+    TaskSolution objects with the correct answer and explanation.
+
+    Args:
+        cfg: The configuration object
+        capabilities_tag: The tag from Stage 2 to load capabilities from
+        tasks_tag: Optional resume tag. If provided, resumes from existing tag.
+ + Returns + ------- + The tasks_tag for this generation + """ + experiment_id = cfg.exp_cfg.exp_id + output_base_dir = Path(cfg.global_cfg.output_dir) + + # Determine tasks tag (resume or new) + is_resume = tasks_tag is not None + if is_resume: + logger.info(f"Resuming Stage 3 with tasks_tag: {tasks_tag}") + else: + tasks_tag = _timestamp_tag() + logger.info(f"Starting new Stage 3 with tasks_tag: {tasks_tag}") + + # Initialize scientist LLM client using task_generation config + scientist_llm_gen_cfg = dict(cfg.scientist_llm.generation_cfg.task_generation) + scientist_llm_client = get_standard_model_client( + cfg.scientist_llm.name, + seed=scientist_llm_gen_cfg.get("seed", cfg.exp_cfg.seed), + temperature=scientist_llm_gen_cfg.get( + "temperature", constants.DEFAULT_TEMPERATURE + ), + max_tokens=scientist_llm_gen_cfg.get( + "max_tokens", constants.DEFAULT_MAX_TOKENS + ), + ) + + # Get task generation parameters from config + tasks_per_blueprint = cfg.task_generation_cfg.get("tasks_per_blueprint", 3) + min_subtopics = cfg.task_generation_cfg.get("min_subtopics", 3) + max_subtopics = cfg.task_generation_cfg.get("max_subtopics", 8) + + # Find all area directories under capabilities// + capabilities_base_dir = ( + output_base_dir / experiment_id / "capabilities" / capabilities_tag + ) + area_dirs = [d for d in capabilities_base_dir.iterdir() if d.is_dir()] + + logger.info(f"Found {len(area_dirs)} area directories") + + # Process each area + for area_dir in area_dirs: + area_id = area_dir.name + logger.info(f"Processing area: {area_id}") + + # Load capabilities for this area + capabilities_path = area_dir / "capabilities.json" + capabilities, _ = load_capabilities(capabilities_path) + logger.info(f"Loaded {len(capabilities)} capabilities from {area_id}") + + # Process each capability + for capability in capabilities: + capability_id = capability.capability_id + + # Check if task solutions already exist for this capability (resume logic) + solutions_dir = ( + output_base_dir + / experiment_id + / "task_solutions" + / tasks_tag + / area_id + / capability_id + ) + + if ( + is_resume + and solutions_dir.exists() + and any(solutions_dir.glob("*.json")) + ): + logger.info( + f"Skipping {area_id}/{capability_id} - task solutions already exist at {solutions_dir}" + ) + continue + + logger.info( + f"Generating tasks for capability: {capability.name} ({area_id}/{capability_id})" + ) + + try: + # Generate diverse tasks with solutions + task_solutions = generate_diverse_tasks_for_capability( + capability=capability, + tasks_per_blueprint=tasks_per_blueprint, + client=scientist_llm_client, + min_subtopics=min_subtopics, + max_subtopics=max_subtopics, + ) + + logger.info( + f"Generated {len(task_solutions)} task solutions for {capability.name}" + ) + + # Save each task solution + metadata = PipelineMetadata( + experiment_id=experiment_id, + output_base_dir=str(output_base_dir), + timestamp=_iso_timestamp(), + input_stage_tag=capabilities_tag, + output_stage_tag=tasks_tag, + resume=is_resume, + ) + + # Save task solutions in task_solutions directory + for task_solution in task_solutions: + solution_path = ( + output_base_dir + / experiment_id + / "task_solutions" + / tasks_tag + / area_id + / capability_id + / f"{task_solution.task_id}.json" + ) + save_solution(task_solution, metadata, solution_path) + + logger.info( + f"Stage 3: saved {len(task_solutions)} task solutions to " + f"task_solutions/{tasks_tag}/{area_id}/{capability_id}/" + ) + + except Exception as e: + logger.error( + f"Error generating tasks 
for {area_id}/{capability_id}: {e}", + exc_info=True, + ) + # Continue with next capability instead of failing completely + continue + + return tasks_tag + + +def stage5_validate_tasks( + cfg: DictConfig, + solution_tag: str, + validation_tag: str = None, +) -> str: + """Stage 5: Validate generated task solutions. + + Args: + cfg: The configuration object + solution_tag: The tag from Stage 3 to load task solutions from + validation_tag: Optional resume tag. If provided, resumes from existing tag. + + Returns + ------- + The validation_tag for this validation + """ + experiment_id = cfg.exp_cfg.exp_id + output_base_dir = Path(cfg.global_cfg.output_dir) + + # Determine validation tag (resume or new) + is_resume = validation_tag is not None + if is_resume: + logger.info(f"Resuming Stage 5 with validation_tag: {validation_tag}") + else: + validation_tag = _timestamp_tag() + logger.info(f"Starting new Stage 5 with validation_tag: {validation_tag}") + + # Initialize validator LLM client + validator_llm_gen_cfg = dict( + cfg.get("validator_llm", {}) + .get("generation_cfg", {}) + .get("task_validation", {}) + ) + validator_llm_name = cfg.get("validator_llm", {}).get( + "name", cfg.scientist_llm.name + ) + validator_llm_client = get_standard_model_client( + validator_llm_name, + seed=validator_llm_gen_cfg.get("seed", cfg.exp_cfg.seed), + temperature=validator_llm_gen_cfg.get( + "temperature", constants.DEFAULT_TEMPERATURE + ), + max_tokens=validator_llm_gen_cfg.get( + "max_tokens", constants.DEFAULT_MAX_TOKENS + ), + ) + + # Get validation parameters from config + pass_threshold = cfg.get("task_verification_cfg", {}).get("pass_threshold", 0.8) + strict_mode = cfg.get("task_verification_cfg", {}).get("strict_mode", False) + + # Find all task_solutions directories + task_solutions_base_dir = ( + output_base_dir / experiment_id / "task_solutions" / solution_tag + ) + + if not task_solutions_base_dir.exists(): + logger.error(f"Task solutions directory not found: {task_solutions_base_dir}") + return validation_tag + + # Find all area directories + area_dirs = [d for d in task_solutions_base_dir.iterdir() if d.is_dir()] + logger.info(f"Found {len(area_dirs)} area directories") + + # Process each area + for area_dir in area_dirs: + area_id = area_dir.name + logger.info(f"Processing area: {area_id}") + + # Find all capability directories + capability_dirs = [d for d in area_dir.iterdir() if d.is_dir()] + + for capability_dir in capability_dirs: + capability_id = capability_dir.name + + # Find all task solution files + task_solution_files = list(capability_dir.glob("*.json")) + + if not task_solution_files: + logger.warning(f"No task solutions found in {area_id}/{capability_id}") + continue + + logger.info( + f"Validating {len(task_solution_files)} task solutions for {area_id}/{capability_id}" + ) + + for task_solution_file in task_solution_files: + task_id = task_solution_file.stem + + # Check if validation already exists (resume logic) + validation_path = ( + output_base_dir + / experiment_id + / "validations" + / validation_tag + / area_id + / capability_id + / f"{task_id}.json" + ) + + if is_resume and validation_path.exists(): + logger.info( + f"Skipping {area_id}/{capability_id}/{task_id} - validation already exists" + ) + continue + + try: + # Load task solution + with open(task_solution_file, "r") as f: + task_solution_data = json.load(f) + + # Extract necessary information + task_text = task_solution_data.get("task", "") + solution = task_solution_data.get("solution", "") + reasoning = 
task_solution_data.get("reasoning", "") + generation_metadata = task_solution_data.get( + "generation_metadata", {} + ) + + # For validation, we need to check if the task is well-formed + # Simple validation: check if task has content and solution exists + verification = ( + len(task_text.strip()) > 0 and len(solution.strip()) > 0 + ) + feedback = ( + "Task validation passed" + if verification + else "Task validation failed: missing content" + ) + + # Create Task object for ValidationResult + from src.schemas.area_schemas import Area + from src.schemas.capability_schemas import Capability + from src.schemas.domain_schemas import Domain + from src.schemas.task_schemas import Task + + domain = Domain( + name=task_solution_data.get("domain", ""), + domain_id=task_solution_data.get("domain_id", ""), + description="", + ) + area = Area( + name=task_solution_data.get("area", ""), + area_id=task_solution_data.get("area_id", ""), + domain=domain, + description=task_solution_data.get("area_description", ""), + ) + capability = Capability( + name=task_solution_data.get("capability", ""), + capability_id=task_solution_data.get("capability_id", ""), + area=area, + description=task_solution_data.get( + "capability_description", "" + ), + ) + task = Task( + task_id=task_id, + task=task_text, + capability=capability, + ) + + # Create ValidationResult + validation_result = ValidationResult( + task_id=task_id, + task=task_text, + verification=verification, + feedback=feedback, + task_obj=task, + generation_metadata={ + "method": "simple_validation", + "pass_threshold": pass_threshold, + "strict_mode": strict_mode, + **generation_metadata, + }, + ) + + # Save validation + metadata = PipelineMetadata( + experiment_id=experiment_id, + output_base_dir=str(output_base_dir), + timestamp=_iso_timestamp(), + input_stage_tag=solution_tag, + output_stage_tag=validation_tag, + resume=is_resume, + ) + + save_validation(validation_result, metadata, validation_path) + logger.info( + f"Validated {task_id}: {'✓ PASS' if verification else '✗ FAIL'}" + ) + + except Exception as e: + logger.error( + f"Error validating {area_id}/{capability_id}/{task_id}: {e}", + exc_info=True, + ) + continue + + logger.info(f"Stage 5 completed. Validation tag: {validation_tag}") + return validation_tag + + +@hydra.main(version_base=None, config_path="cfg", config_name="run_cfg") +def main(cfg: DictConfig) -> None: + """Run specific pipeline stages based on configuration. 
+
+    Stage 0: Experiment and domain setup
+    Stage 1: Area generation
+    Stage 2: Capability generation and filtering
+    Stage 3: Task generation with solutions
+    Stage 5: Task validation
+    "all": Run all stages sequentially
+    """
+    # Suppress httpx and autogen_core INFO logs
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+    logging.getLogger("autogen_core.events").setLevel(logging.WARNING)
+
+    # Get stage from config (can be overridden via command line)
+    stage = cfg.get("stage", "all")
+
+    # Convert string to int if numeric
+    if isinstance(stage, str) and stage.isdigit():
+        stage = int(stage)
+
+    logger.info(f"Running stage: {stage}")
+
+    # Track tags across stages
+    areas_tag = cfg.get("areas_tag", None)
+    capabilities_tag = cfg.get("capabilities_tag", None)
+    solution_tag = cfg.get("solution_tag", None)
+    validation_tag = None
+
+    if stage == 0 or stage == "all":
+        logger.info("=" * 60)
+        logger.info("STAGE 0: Experiment and Domain Setup")
+        logger.info("=" * 60)
+        stage0_setup(cfg)
+        if stage == 0:
+            return
+
+    if stage == 1 or stage == "all":
+        logger.info("=" * 60)
+        logger.info("STAGE 1: Area Generation (Hierarchical)")
+        logger.info("=" * 60)
+        areas_tag = stage1_generate_areas(cfg)
+        logger.info("Stage 1 areas tag: %s", areas_tag)
+        if stage == 1:
+            return
+
+    if stage == 2 or stage == "all":
+        logger.info("=" * 60)
+        logger.info("STAGE 2: Capability Generation and Filtering")
+        logger.info("=" * 60)
+
+        # When running stage 2 standalone, areas_tag must be provided
+        if stage == 2 and not areas_tag:
+            logger.error("areas_tag is required when running stage 2 standalone")
+            logger.error(
+                "Usage: python -m src.run_generation_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS"
+            )
+            logger.error(
+                "Optional: capabilities_tag=_YYYYMMDD_HHMMSS to resume from existing run"
+            )
+            return
+
+        # Check if resuming
+        resume_capabilities_tag = (
+            cfg.get("capabilities_tag", None) if stage == 2 else None
+        )
+        if resume_capabilities_tag:
+            logger.info(
+                f"Resume mode: Will skip areas that already have capabilities in tag {resume_capabilities_tag}"
+            )
+
+        capabilities_tag = stage2_generate_and_filter_capabilities(
+            cfg=cfg,
+            areas_tag=areas_tag,
+            capabilities_tag=resume_capabilities_tag,
+        )
+        logger.info("Stage 2 capabilities tag: %s", capabilities_tag)
+        if stage == 2:
+            return
+
+    if stage == 3 or stage == "all":
+        logger.info("=" * 60)
+        logger.info("STAGE 3: Diverse Task Generation")
+        logger.info("=" * 60)
+
+        # When running stage 3 standalone, capabilities_tag must be provided
+        if stage == 3 and not capabilities_tag:
+            logger.error("capabilities_tag is required when running stage 3 standalone")
+            logger.error(
+                "Usage: python -m src.run_generation_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS"
+            )
+            logger.error(
+                "Optional: tasks_tag=_YYYYMMDD_HHMMSS to resume from existing run"
+            )
+            return
+
+        # Check if resuming
+        resume_tasks_tag = cfg.get("tasks_tag", None) if stage == 3 else None
+        if resume_tasks_tag:
+            logger.info(
+                f"Resume mode: Will skip capabilities that already have tasks in tag {resume_tasks_tag}"
+            )
+
+        solution_tag = stage3_generate_tasks(
+            cfg=cfg,
+            capabilities_tag=capabilities_tag,
+            tasks_tag=resume_tasks_tag,
+        )
+        logger.info("Stage 3 solution tag: %s", solution_tag)
+        if stage == 3:
+            return
+
+    if stage == 5 or stage == "all":
+        logger.info("=" * 60)
+        logger.info("STAGE 5: Task Validation")
+        logger.info("=" * 60)
+
+        # When running stage 5 standalone, solution_tag must be provided
+        if stage == 5 and not solution_tag:
+            logger.error("solution_tag is required when running stage 5 standalone")
+            logger.error(
+                "Usage: python -m src.run_generation_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS"
+            )
+            logger.error(
+                "Optional: validation_tag=_YYYYMMDD_HHMMSS to resume from existing run"
+            )
+            return
+
+        # Check if resuming
+        resume_validation_tag = cfg.get("validation_tag", None) if stage == 5 else None
+        if resume_validation_tag:
+            logger.info(
+                f"Resume mode: Will skip tasks that already have validations in tag {resume_validation_tag}"
+            )
+
+        validation_tag = stage5_validate_tasks(
+            cfg=cfg,
+            solution_tag=solution_tag,
+            validation_tag=resume_validation_tag,
+        )
+        logger.info("Stage 5 validation tag: %s", validation_tag)
+        if stage == 5:
+            return
+
+
+if __name__ == "__main__":
+    logger = logging.getLogger(__name__)
+    main()
diff --git a/src/utils/capability_management_utils.py b/src/utils/capability_management_utils.py
index 6416c4a..d693cab 100644
--- a/src/utils/capability_management_utils.py
+++ b/src/utils/capability_management_utils.py
@@ -4,7 +4,9 @@
 import logging
 import os
 import random
-from typing import Any, List, Union
+from typing import Any, List, Tuple, Union
+
+import torch
 
 from src.capability import Capability
 from src.generate_embeddings import filter_embeddings
@@ -186,3 +188,43 @@ def filter_capabilities(
         f"Filtered out {len(filtered_out_capabilities)} capabilities:\n{filtered_out_capabilities}"
     )
     return [capabilities[i] for i in remaining_indices]
+
+
+def filter_schema_capabilities_by_embeddings(
+    capabilities: List[Any],  # List of schema Capability objects
+    embeddings: List[torch.Tensor],
+    similarity_threshold: float,
+) -> Tuple[List[Any], List[int]]:
+    """Filter schema capabilities based on embedding similarity.
+
+    This function filters capabilities without mutating them, returning both
+    the filtered list and the indices of retained capabilities.
+
+    Args
+    ----
+    capabilities (List[Any]): The list of schema Capability objects.
+    embeddings (List[torch.Tensor]): The embeddings corresponding to capabilities.
+    similarity_threshold (float): The threshold for cosine similarity
+        above which capabilities are considered duplicates.
+ + Returns + ------- + Tuple[List[Any], List[int]]: + - List of filtered capabilities + - List of indices of retained capabilities + """ + if len(capabilities) != len(embeddings): + raise ValueError( + f"Number of capabilities ({len(capabilities)}) must match " + f"number of embeddings ({len(embeddings)})" + ) + + remaining_indices = filter_embeddings(embeddings, similarity_threshold) + + logger.info( + f"Filtered out {len(capabilities) - len(remaining_indices)} " + f"capabilities out of {len(capabilities)}" + ) + + filtered_capabilities = [capabilities[i] for i in remaining_indices] + return filtered_capabilities, remaining_indices diff --git a/src/utils/capability_utils.py b/src/utils/capability_utils.py index dbe8927..5cb0289 100644 --- a/src/utils/capability_utils.py +++ b/src/utils/capability_utils.py @@ -13,8 +13,8 @@ import numpy as np from inspect_ai import eval as inspect_eval from inspect_ai.scorer import CORRECT -from langsmith import traceable, tracing_context +# from langsmith import traceable, tracing_context # COMMENTED OUT FOR DEBUGGING from src.model import Model from src.utils import constants from src.utils.data_utils import read_json_file @@ -247,9 +247,9 @@ def run_inspect_evals(path: str, model: Model, log_dir: str, **kwargs: Any) -> N } ls_metadata.update({f"ls_{k}": v for k, v in kwargs.items()}) - @traceable( - run_type="llm", - ) + # @traceable( # COMMENTED OUT FOR DEBUGGING + # run_type="llm", + # ) def _run_inspect_evals() -> Dict[str, Any]: """ Run the inspect evals command for a given capability and model. @@ -299,12 +299,13 @@ def _run_inspect_evals() -> Dict[str, Any]: else: inspect_model_name = model_name - with tracing_context( - enabled=True, - tags=["run_inspect_evals"], - metadata=ls_metadata, - ): - output = _run_inspect_evals() + # COMMENTED OUT LANGSMITH TRACING FOR DEBUGGING + # with tracing_context( + # enabled=True, + # tags=["run_inspect_evals"], + # metadata=ls_metadata, + # ): + output = _run_inspect_evals() if model.model_provider == "local": # Reset OPENAI_BASE_URL to actual openai URL diff --git a/src/utils/constants.py b/src/utils/constants.py index 9d2c6ec..6fe2a91 100644 --- a/src/utils/constants.py +++ b/src/utils/constants.py @@ -56,6 +56,9 @@ class VecInfStatus(Enum): DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1" +DEFAULT_TEMPERATURE = 0.7 +DEFAULT_MAX_TOKENS = 16384 + DEFAULT_CAPABILITY_GENERATION_RETRY_ATTEMPTS = 3 DEFAULT_TASK_GENERATION_RETRY_ATTEMPTS = 3 diff --git a/src/utils/data_utils.py b/src/utils/data_utils.py index 463363c..30dbc7b 100644 --- a/src/utils/data_utils.py +++ b/src/utils/data_utils.py @@ -243,11 +243,21 @@ def check_cfg(cfg: DictConfig, logger: logging.Logger) -> None: cfg (DictConfig): The provided configuration. logger (logging.Logger): The logger instance to log messages. """ - assert cfg.capabilities_cfg.num_gen_capabilities > 0 + assert getattr(cfg, "exp_cfg", None) is not None, "exp_cfg must be set." + assert getattr(cfg.exp_cfg, "exp_id", ""), "exp_id must be set in exp_cfg." + assert getattr(cfg, "global_cfg", None) is not None, "global_cfg must be set." + assert getattr(cfg.global_cfg, "output_dir", ""), ( + "global_cfg.output_dir must be set." + ) + assert getattr(cfg.global_cfg, "domain", ""), "global_cfg.domain must be set." + assert getattr(cfg.global_cfg, "pipeline_type", None) is not None, ( + "global_cfg.pipeline_type must be set." 
+ ) + assert cfg.capabilities_cfg.num_capabilities > 0 assert cfg.capabilities_cfg.num_gen_capabilities_per_run > 0 num_capabilities = int( - cfg.capabilities_cfg.num_gen_capabilities - * (1 + cfg.capabilities_cfg.num_gen_capabilities_buffer) + cfg.capabilities_cfg.num_capabilities + * (1 + cfg.capabilities_cfg.num_capabilities_buffer) ) assert num_capabilities >= cfg.capabilities_cfg.num_gen_capabilities_per_run, ( "The total number of capabilities to generate must be greater than or equal to the number of capabilities to generate per run." @@ -280,8 +290,8 @@ def get_run_id(cfg: DictConfig) -> str: if cfg.exp_cfg.exp_id: run_id = str(cfg.exp_cfg.exp_id) else: - run_id = f"{cfg.scientist_llm.name}_C{cfg.capabilities_cfg.num_gen_capabilities}_R{cfg.capabilities_cfg.num_gen_capabilities_per_run}" + run_id = f"{cfg.scientist_llm.name}_C{cfg.capabilities_cfg.num_capabilities}_R{cfg.capabilities_cfg.num_gen_capabilities_per_run}" if cfg.capabilities_cfg.method == "hierarchical": - run_id += f"_A{cfg.capabilities_cfg.num_capability_areas}" + run_id += f"_A{cfg.capabilities_cfg.num_areas}" run_id += f"_T{cfg.capabilities_cfg.num_gen_tasks_per_capability}" return run_id diff --git a/src/utils/embedding_utils.py b/src/utils/embedding_utils.py index bd8406c..2c14841 100644 --- a/src/utils/embedding_utils.py +++ b/src/utils/embedding_utils.py @@ -1,7 +1,9 @@ """Utility functions for capability embeddings and dimensionality reduction.""" import logging -from typing import List +from typing import Any, List + +import torch from src.capability import Capability from src.dimensionality_reduction import DimensionalityReductionMethod @@ -163,3 +165,47 @@ def generate_and_set_capabilities_embeddings( capability.set_embedding( embedding_name=embedding_model_name, embedding_tensor=embeddings[i] ) + + +def generate_schema_capabilities_embeddings( + capabilities: List[Any], # List of schema Capability objects + embedding_model_name: str, + embed_dimensions: int, +) -> List[torch.Tensor]: + """Generate embeddings for schema-based capabilities. + + This function generates embeddings for capabilities that use the schema + dataclass (from src.schemas.capability_schemas) instead of the old + Capability class. It returns the embeddings as a list rather than + mutating the capability objects. + + Args + ---- + capabilities (List[Any]): The list of schema Capability objects. + embedding_model_name (str): The name of the embedding model to use. + embed_dimensions (int): The number of dimensions for the embeddings. + + Returns + ------- + List[torch.Tensor]: List of embedding tensors, one per capability. + """ + # Convert the embedding model name to `EmbeddingModelName` to ensure + # that the provided model name is valid and supported. + embedding_generator = EmbeddingGenerator( + model_name=EmbeddingModelName(embedding_model_name), + embed_dimensions=embed_dimensions, + ) + + # Generate embeddings for the capabilities, all at the same time. + # Embeddings are generated from: area name, capability name, and description. 
+ texts = [] + for capability in capabilities: + # Create representation string from area, name, and description + rep_string = ( + f"{capability.area.name}, {capability.name}, {capability.description}" + ) + logger.debug(f"Representation string: {rep_string}") + texts.append(rep_string) + + embeddings = embedding_generator.generate_embeddings(texts) + return embeddings diff --git a/src/utils/model_client_utils.py b/src/utils/model_client_utils.py index c0944ba..63cf296 100644 --- a/src/utils/model_client_utils.py +++ b/src/utils/model_client_utils.py @@ -16,13 +16,6 @@ ) from autogen_ext.models.anthropic import AnthropicChatCompletionClient from autogen_ext.models.openai import OpenAIChatCompletionClient -from tenacity import ( - before_sleep_log, - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) MAX_TOKENS = 1024 * 30 @@ -32,85 +25,6 @@ GEMINI_STUDIO_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/" -class RetryableModelClient: - """Wrap a client and retry `create` on transient API errors.""" - - def __init__(self, client: Any, max_retries: int = 3): - self.client = client - self.max_retries = max_retries - - @retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type( - ( - openai.RateLimitError, - openai.APITimeoutError, - openai.InternalServerError, - anthropic.RateLimitError, - anthropic.APITimeoutError, - anthropic.InternalServerError, - ) - ), - before_sleep=before_sleep_log(logger, logging.WARNING), - reraise=True, - ) - async def create(self, *args: Any, **kwargs: Any) -> Any: - """Create with retry logic for transient errors.""" - return await self.client.create(*args, **kwargs) - - def __getattr__(self, name: str) -> Any: - """Delegate all other attributes to the wrapped client.""" - return getattr(self.client, name) - - -def get_model_client(model_name: str, seed: Optional[int] = None, **kwargs: Any) -> Any: - """Legacy factory: return a retry-wrapped client for `model_name`.""" - n = model_name.lower() - - if n.startswith(("gpt-", "o1-", "o3-", "gpt-5")): - kwargs.setdefault("max_completion_tokens", MAX_TOKENS) - openai_client = OpenAIChatCompletionClient( - model=model_name, seed=seed, **kwargs - ) - return RetryableModelClient(openai_client) - - if "claude" in n: - kwargs.setdefault("max_tokens", MAX_TOKENS) - kwargs.setdefault("timeout", None) - anthropic_client = AnthropicChatCompletionClient(model=model_name, **kwargs) - return RetryableModelClient(anthropic_client) - - if "gemini" in n: - api_key = kwargs.pop("api_key", os.getenv("GOOGLE_API_KEY")) - if not api_key: - raise ValueError("Set GOOGLE_API_KEY for Gemini (AI Studio).") - - model_info = kwargs.pop( - "model_info", - ModelInfo( - vision=True, - function_calling=True, - json_output=True, - structured_output=True, - family="unknown", - ), - ) - - kwargs.setdefault("max_completion_tokens", MAX_TOKENS) - - client = OpenAIChatCompletionClient( - model=model_name, - base_url=GEMINI_STUDIO_BASE, - api_key=api_key, - model_info=model_info, - **kwargs, - ) - return RetryableModelClient(client) - - raise ValueError(f"Unsupported model '{model_name}'.") - - def get_standard_model_client( model_name: str, *, @@ -121,7 +35,21 @@ def get_standard_model_client( n = model_name.lower() # OpenAI GPT / o-series models - if n.startswith(("gpt-", "o1-", "o3-", "gpt-5")): + if n.startswith(("gpt-", "o1-", "o3-", "gpt-5", "o4-")): + # Convert max_tokens to max_completion_tokens for OpenAI + if "max_tokens" in kwargs: + 
kwargs["max_completion_tokens"] = kwargs.pop("max_tokens") + + # o-series models (o1, o3-mini, o4-mini) don't support custom temperature + # Remove temperature if it's set for these models + if any(key in n for key in ("o1-", "o3-", "o4-")): + if "temperature" in kwargs: + logger.debug( + "Removing 'temperature' parameter for model '%s' - not supported", + model_name, + ) + kwargs.pop("temperature") + return OpenAIChatCompletionClient(model=model_name, seed=seed, **kwargs) # Anthropic Claude models @@ -172,47 +100,26 @@ class ModelCallMode: async def async_call_model( model_client: ChatCompletionClient, *, - model_name: Optional[str] = None, system_prompt: Optional[str] = None, user_prompt: Optional[str] = None, messages: Optional[Sequence[Any]] = None, mode: str = ModelCallMode.TEXT, - temperature: Optional[float] = None, - max_tokens: Optional[int] = None, - top_p: Optional[float] = None, - seed: Optional[int] = None, max_attempts: int = 3, extra_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: - """Perform a standard async model call with provider-aware args and output modes. + """Perform a standard async model call with output modes and retry logic. - Builds messages from prompts if `messages` is None. - - Maps `temperature`, `max_tokens`, `top_p`, `seed` to the right provider kwargs. - `mode`: - TEXT: return `str` content. - JSON_PARSE: parse JSON and return `dict`. - STRUCTURED: return the raw provider response. - - Retries only for empty content / JSON parse failures; other errors raise - `ModelCallError` immediately. - """ - # Try to infer model name if not provided explicitly. - resolved_model_name: Optional[str] = model_name - if resolved_model_name is None: - underlying = getattr(model_client, "client", model_client) - resolved_model_name = getattr(underlying, "model", None) - - # Identify provider family from the model name. - provider: Optional[str] = None - lowered_name = ( - resolved_model_name.lower() if isinstance(resolved_model_name, str) else "" - ) - if lowered_name.startswith(("gpt-", "o1-", "o3-", "gpt-5")): - provider = "openai" - elif "claude" in lowered_name: - provider = "anthropic" - elif "gemini" in lowered_name: - provider = "gemini" + - Retries on transient API errors (rate limits, timeouts, server errors), + empty content, and JSON parse failures. + Note: temperature, max_tokens, seed should be passed to get_standard_model_client() + when creating the client, not here. + """ if messages is None: if user_prompt is None and system_prompt is None: raise ValueError( @@ -231,45 +138,20 @@ async def async_call_model( raise ValueError("max_attempts must be at least 1") last_error: Exception | None = None - drop_temperature_for_model = False + + # Define retryable exceptions + retryable_exceptions = ( + openai.RateLimitError, + openai.APITimeoutError, + openai.InternalServerError, + anthropic.RateLimitError, + anthropic.APITimeoutError, + anthropic.InternalServerError, + ) for attempt in range(1, max_attempts + 1): request_kwargs: Dict[str, Any] = {} - if temperature is not None and not drop_temperature_for_model: - if provider == "openai" and lowered_name: - # "o1" models: special handling, often ignore temperature. - # "o3-mini", "o3", "o4-mini": temperature is not always supported. 
- if any( - key in lowered_name for key in ("o1", "o3-mini", "o3", "o4-mini") - ): - logger.debug( - "Not sending 'temperature' for model '%s' due to known " - "limitations.", - resolved_model_name, - ) - else: - request_kwargs["temperature"] = temperature - elif provider in {"anthropic", "gemini", None}: - # Anthropic Claude and Gemini generally support temperature; - # for unknown providers we optimistically pass it through. - request_kwargs["temperature"] = temperature - - # Map unified `max_tokens` to provider-specific kwarg. - if max_tokens is not None: - if provider in {"openai", "gemini"}: - request_kwargs["max_completion_tokens"] = max_tokens - elif provider == "anthropic": - request_kwargs["max_tokens"] = max_tokens - else: - request_kwargs["max_tokens"] = max_tokens - - # `top_p` only for OpenAI-style providers. - if top_p is not None and provider in {"openai", "gemini", None}: - request_kwargs["top_p"] = top_p - if seed is not None: - request_kwargs["seed"] = seed - # Output / structured config if mode in (ModelCallMode.JSON_PARSE, ModelCallMode.STRUCTURED): # Many clients support json_output / structured_output flags. @@ -287,31 +169,27 @@ async def async_call_model( messages=list(messages), # type: ignore[arg-type] **request_kwargs, ) - except TypeError as exc: - # Some models (e.g., certain reasoning or o-series models) do not - # support temperature or other generation parameters. If the error - # message clearly points to 'temperature', drop it and retry once. - if ( - "temperature" in str(exc) - and "temperature" in request_kwargs - and not drop_temperature_for_model - ): + except retryable_exceptions as exc: + # Retry on transient API errors + last_error = exc + if attempt < max_attempts: + wait_time = min(2**attempt, 10) # Exponential backoff, max 10s logger.warning( - "Model rejected 'temperature' parameter; retrying without it. " - "Error was: %s", + "Transient API error on attempt %d/%d: %s. Retrying in %ds...", + attempt, + max_attempts, exc, + wait_time, ) - drop_temperature_for_model = True - last_error = exc + import asyncio + + await asyncio.sleep(wait_time) continue - last_error = exc - logger.error("Model call failed with TypeError: %s", exc) + logger.error("Max retries reached for transient API error: %s", exc) break - except Exception as exc: # pragma: no cover - network/SDK errors - # Let lower-level client / infrastructure handle any network or - # transient retries. At this layer we convert to ModelCallError - # without additional retry loops to avoid duplicating behaviour. - logger.error("Model call failed with unexpected error: %s", exc) + except Exception as exc: + # Non-retryable error - fail immediately + logger.error("Model call failed with non-retryable error: %s", exc) last_error = exc break diff --git a/src/utils/prompts.py b/src/utils/prompts.py index 7afd404..0dc0ff7 100644 --- a/src/utils/prompts.py +++ b/src/utils/prompts.py @@ -5,64 +5,23 @@ Valid capabilities will be added to a capability archive. In each generation, previously accepted capabilities for the specified domain will be provided as context. -Each capability should be designed according to the METR Standard, which requires the following Python format: -```python -class Capability: - @staticmethod - def repr_tasks() -> dict[str, dict]: - # Returns a mapping from task ids to arbitrary data that is specific to each capability. - # Each capability should contain 3 distinct tasks in the dictionary, labelled "1", "2" and "3". 
- # Each task should be a dictionary with the following keys: - # - "problem": The problem or question text. - # - "answer": The expected answer to the problem. - # ... implementation ... - - @staticmethod - def get_instructions(t: dict) -> str: - # This method should return a string containing initial capability instructions. - # ... implementation ... - - @staticmethod - def score(t: dict, submission: str) -> float | None: - # ‘score‘ is called after the subject model submits a solution. - # Return 0 on failure and 1 on success. - # Throwing an exception will result in a score of 0. - # ... implementation ... -``` -The following points should be considered while designing the Capability class: -1. The instructions should enforce the final answer in the form "ANSWER: $ANSWER" ONLY for those capabilities that expect a short and objective answer. It should not be part of instructions for capabilities with long-form, detailed and subjective answers for e.g. explanations, proofs, essays, etc. -2. If the instructions enforces the final answer in the form "ANSWER: $ANSWER", then the score function should first call a helper function to parse the submission string and extract the answer: -```python -def parse_submission(submission: str) -> str: - # Parse the submission string to extract the answer based on the "ANSWER" keyword. - # Return an empty string if no match is found. -``` -3. The score function should use a helper function that uses a large language model (LLM) as a judge to score the submission: -```python -def evaluate_with_llm_judge( - submission: str, - target: str, -) -> bool: - # Evaluate the submission using an LLM judge. -``` -4. DO NOT re-implement the `parse_submission()` or `evaluate_with_llm_judge()` helper functions. +Respond precisely in the following JSON format: -Respond precisely in the following format, including the JSON start and end markers: - -THOUGHT: -RESPONSE JSON: { - "capability_0": , - "capability_1": , - ... + "thought": , + "capabilities": [ + { + "name": , + "description": + }, + ... + ] } -In , briefly think and reason about what kind of capability you want to propose. -In , provide a JSON response of the new capability with the following fields: -- "name": A concise, descriptive label (lowercase, no spaces, e.g., "personalized_budget_planning"). -- "description": A clear explanation of what the capability entails (e.g., "Ability to generate a realistic monthly budget tailored to an individual's income, fixed and variable expenses, and financial goals. Requires understanding spending categories, prioritization, and basic cash flow allocation."). -- "domain": The domain to which the capability belongs to (e.g., personal finance, math, etc.). -- "class": The fully implemented Python code for the Capability class. This should be easily human-readable. +In "thought", briefly think and reason about what kind of capabilities you want to propose. +In "capabilities", provide an array of new capability objects with the following fields: +- "name": A concise, descriptive label (lowercase, underscores for spaces, e.g., "personalized_budget_planning"). +- "description": A clear and detailed explanation of what the capability entails, including the skills and knowledge required (e.g., "Ability to generate a realistic monthly budget tailored to an individual's income, fixed and variable expenses, and financial goals. Requires understanding spending categories, prioritization, and basic cash flow allocation."). 
 Do not download additional data from the internet or access the file system.
@@ -82,39 +41,37 @@ def evaluate_with_llm_judge(
 Existing capability names:
 {prev_capabilities}
 
-Generate {num_gen_capabilities} new capabilities within the {domain} domain that are **semantically and functionally distinct** from the existing capabilities.
+Generate {num_capabilities} new capabilities within the {domain} domain that are **semantically and functionally distinct** from the existing capabilities.
 """
 
 HIERARCHICAL_CAPABILITY_GENERATION_USER_PROMPT = """
-A sample capability JSON is provided below. The names of all existing capabilities are also provided.
-
-Sample capability:
-{{sample_capability_json}}
+The names of all existing capabilities are provided below.
 
 Existing capability names:
 {{prev_capabilities}}
 
-Generate {{num_gen_capabilities}} new capabilities for the "{capability_area}" area within the {{domain}} domain that do not overlap with the existing capabilities.
+Generate {{num_capabilities}} new capabilities for the "{area}" area within the {{domain}} domain that do not overlap with the existing capabilities.
 """
 
-HIERARCHICAL_CAPABILITY_AREAS_GENERATION_USER_PROMPT = """
+AREAS_GENERATION_USER_PROMPT = """
 You are an expert in designing capabilities to assess the abilities of foundation models.
 For the domain of {domain}, identify {num_areas} high-level, broad, diverse, and non-overlapping areas for capability generation.
 Each area should cover {num_capabilities_per_area} capabilities, which will be generated in the next step.
 Aim for each area to cover a broad subdomain or skill cluster within the domain.
 
-Respond precisely in the following format:
+Respond in the following JSON format:
 
-RESPONSE JSON:
 {response_json_format}
 """
 
-CAPABILITY_AREAS_GENERATION_RESPONSE_JSON_FORMAT = """
-{
-    "area_0": <area_0>,
-    "area_1": <area_1>,
-    ...
-}""".strip("\n")
+AREAS_GENERATION_RESPONSE_JSON_FORMAT = """
+{{
+    "areas": [
+        <area_0>,
+        <area_1>,
+        ...
+    ]
+}}"""
 
 SCORE_BASED_NEW_CAPABILITY_DISCOVERY_USER_PROMPT = """
 A sample capability JSON is provided below. Additionally, the names of all existing capabilities and their respective scores for the subject LLM are provided.

From 41fa531b6c16a9bad7c3375eb663daacbd182188 Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki
Date: Fri, 12 Dec 2025 02:35:08 -0800
Subject: [PATCH 2/8] used diverse task verification implementation.

---
 README.md                                      |  62 ++++-
 .../generate_diverse_tasks.py                  |   4 +-
 src/base_task_generation/validate_tasks.py     | 129 +++++++++
 src/base_task_generation/verify_tasks.py       | 146 -----------
 src/run_generation_pipeline.py                 | 247 ++++++++----------
 5 files changed, 295 insertions(+), 293 deletions(-)
 rename src/{ => base_task_generation}/generate_diverse_tasks.py (97%)
 create mode 100644 src/base_task_generation/validate_tasks.py
 delete mode 100644 src/base_task_generation/verify_tasks.py

diff --git a/README.md b/README.md
index ac4423b..5493c04 100644
--- a/README.md
+++ b/README.md
@@ -50,12 +50,53 @@ gcloud auth application-default login
 
 2. Modify `src/cfg/run_cfg.yaml`, if required.
 
-### Capability Generation using the scientist LLM
+### Generation Pipeline
 
-Generates capability names and descriptions in the first step. In the second step, for each capability, it generates tasks, solves them, and verifies the solutions.
+The generation pipeline consists of multiple stages that can be run sequentially or individually: + +- **Stage 0**: Experiment and domain setup +- **Stage 1**: Area generation +- **Stage 2**: Capability generation and filtering +- **Stage 3**: Task generation with solutions +- **Stage 5**: Task validation + +#### Run All Stages + +```bash +python -m src.run_generation_pipeline stage=all +``` + +#### Run Individual Stages + +```bash +# Stage 0: Setup +python -m src.run_generation_pipeline stage=0 + +# Stage 1: Generate areas +python -m src.run_generation_pipeline stage=1 + +# Stage 2: Generate capabilities (requires areas_tag from Stage 1) +python -m src.run_generation_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS + +# Stage 3 and 4: Generate tasks and solutions (requires capabilities_tag from Stage 2) + +python -m src.run_generation_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS + +# Stage 5: Validate tasks (requires solution_tag from Stage 3) +python -m src.run_generation_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS +``` + +#### Resume from Existing Runs ```bash -python -m src.run_capability_generation +# Resume Stage 2 from existing capabilities_tag +python -m src.run_generation_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS capabilities_tag=_YYYYMMDD_HHMMSS + +# Resume Stage 3 from existing tasks_tag +python -m src.run_generation_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS tasks_tag=_YYYYMMDD_HHMMSS + +# Resume Stage 5 from existing validation_tag +python -m src.run_generation_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS validation_tag=_YYYYMMDD_HHMMSS ``` ### Evaluation of subject LLM on generated capabilities @@ -222,3 +263,18 @@ Configure `wikipedia/cfg/static_vs_generated.yaml`: cd wikipedia python static_vs_generated.py ``` + + +## Development Guidelines + +When implementing new features or modifying existing pipeline stages: + +1. **Follow Schema Guidelines**: All data objects must use the schema classes defined in `src/schemas/`: + - Use `Domain`, `Area`, `Capability`, `Task`, `TaskSolution`, `ValidationResult` objects + - Load/save using schema IO functions from `src/schemas/io_utils.py` (e.g., `load_solution()`, `save_validation()`) + - See `src/schemas/PIPELINE_SCHEMAS.md` for detailed schema documentation + +2. **Use Model Call Utilities**: All LLM interactions must use the standardized model client utilities: + - Import from `src.utils.model_client_utils` + - Use `get_standard_model_client()` to initialize clients + - Use `async_call_model()` with appropriate `ModelCallMode` (e.g., `JSON_PARSE`, `TEXT`) diff --git a/src/generate_diverse_tasks.py b/src/base_task_generation/generate_diverse_tasks.py similarity index 97% rename from src/generate_diverse_tasks.py rename to src/base_task_generation/generate_diverse_tasks.py index a3fd231..041019e 100644 --- a/src/generate_diverse_tasks.py +++ b/src/base_task_generation/generate_diverse_tasks.py @@ -30,7 +30,9 @@ def generate_diverse_tasks_for_capability( min_subtopics: int = 3, max_subtopics: int = 8, ) -> List[TaskSolution]: - """Generate diverse tasks with solutions for a single capability using experimental method. + """Generate diverse tasks with solutions for a single capability. + + Uses multi-dimensional task generation approach. 
 
     Args:
         capability: Schema Capability object
diff --git a/src/base_task_generation/validate_tasks.py b/src/base_task_generation/validate_tasks.py
new file mode 100644
index 0000000..ed0c2aa
--- /dev/null
+++ b/src/base_task_generation/validate_tasks.py
@@ -0,0 +1,129 @@
+"""Validate that generated tasks align with intended dimensions."""
+
+import asyncio
+import logging
+
+from autogen_core.models import ChatCompletionClient
+
+from src.base_task_generation.diverse_task_prompts import format_verification_prompt
+from src.schemas.solution_schemas import TaskSolution
+from src.schemas.task_schemas import Task
+from src.schemas.validation_schemas import ValidationResult
+from src.utils.model_client_utils import ModelCallMode, async_call_model
+
+
+logger = logging.getLogger(__name__)
+
+
+def validate_tasks(
+    task_solutions: list[TaskSolution],
+    client: ChatCompletionClient,
+) -> list[ValidationResult]:
+    """Validate that generated tasks align with intended dimensions.
+
+    Args:
+        task_solutions: List of schema TaskSolution objects (non-empty)
+        client: ChatCompletionClient for API calls
+
+    Returns
+    -------
+    List of ValidationResult objects for validated tasks
+    """
+    logger.info("Validating task alignment...")
+
+    validation_results = []
+
+    for i, task_solution in enumerate(task_solutions):
+        logger.info(
+            f"Validating task {i + 1}/{len(task_solutions)}: {task_solution.task_id}"
+        )
+        capability = task_solution.task_obj.capability
+
+        try:
+            # Get blueprint info from generation_metadata
+            blueprint_info = task_solution.generation_metadata or {}
+            blueprint_text = blueprint_info.get("blueprint", "N/A")
+
+            # Parse the task to extract question and choices
+            task_lines = task_solution.task.strip().split("\n")
+            question = task_lines[0] if task_lines else ""
+
+            # Extract choices (A, B, C, D)
+            choices = {}
+            for task_line in task_lines[1:]:
+                line = task_line.strip()
+                if line and len(line) > 2 and line[1] == ".":
+                    choice_letter = line[0]
+                    choice_text = line[3:].strip()
+                    choices[choice_letter] = choice_text
+
+            system_prompt, user_prompt = format_verification_prompt(
+                capability_domain=capability.area.domain.name,
+                capability_area=capability.area.name,
+                capability_name=capability.name,
+                capability_description=capability.description,
+                task_blueprint=blueprint_text,
+                question=question,
+                option_a=choices.get("A", ""),
+                option_b=choices.get("B", ""),
+                option_c=choices.get("C", ""),
+                option_d=choices.get("D", ""),
+                correct_answer=task_solution.solution,
+            )
+
+            response = asyncio.run(
+                async_call_model(
+                    client,
+                    system_prompt=system_prompt,
+                    user_prompt=user_prompt,
+                    mode=ModelCallMode.JSON_PARSE,
+                )
+            )
+
+            # Map verification response to result format
+            overall_aligned = response.get("overall_verdict", "Fail") == "Pass"
+
+            # Create Task object
+            task = Task(
+                task_id=task_solution.task_id,
+                task=task_solution.task,
+                capability=capability,
+            )
+
+            # Create ValidationResult schema object
+            validation_result = ValidationResult(
+                task_id=task_solution.task_id,
+                task=task_solution.task,
+                verification=overall_aligned,
+                feedback=response.get("explanation", ""),
+                task_obj=task,
+                generation_metadata={
+                    "method": "validate_tasks",
+                    "subtopic_aligned": response.get("blueprint_alignment", "No")
+                    == "Yes",
+                    "difficulty_aligned": response.get(
+                        "difficulty_reasoning_match", "No"
+                    )
+                    == "Yes",
+                    "reasoning_aligned": response.get("capability_alignment", "No")
+                    == "Yes",
+                    "choices_appropriate": response.get("single_correct_answer", "No")
+                    == "Yes",
"suggested_improvements": response.get( + "suggested_improvements", "" + ), + **task_solution.generation_metadata, + }, + ) + validation_results.append(validation_result) + + status = "✓ PASS" if overall_aligned else "✗ FAIL" + logger.info(f" {status}") + + except Exception as e: + logger.error(f" Failed to validate {task_solution.task_id}: {e}") + logger.info(" ✗ ERROR - Skipping this task") + # Skip tasks that fail verification + continue + + return validation_results diff --git a/src/base_task_generation/verify_tasks.py b/src/base_task_generation/verify_tasks.py deleted file mode 100644 index 504b823..0000000 --- a/src/base_task_generation/verify_tasks.py +++ /dev/null @@ -1,146 +0,0 @@ -"""Verify that generated tasks align with intended dimensions.""" - -import json -import logging -from typing import Callable - -from diverse_task_dataclasses import Blueprint, Capability, Task, VerificationResult -from diverse_task_prompts import format_verification_prompt - - -logger = logging.getLogger(__name__) - - -def verify_tasks( - capability: Capability, - tasks: list[Task], - blueprints: list[Blueprint], - call_llm: Callable, -) -> VerificationResult: - """Verify that generated tasks align with intended dimensions.""" - logger.info("Verifying task alignment...") - - # Create blueprint lookup - blueprint_dict = {bp.combination_id: bp for bp in blueprints} - - verification_results = [] - - for i, task in enumerate(tasks): - logger.info(f"Verifying task {i + 1}/{len(tasks)}: {task.task_id}") - - try: - # Skip verification for tasks that failed generation - if task.question.startswith("ERROR:"): - logger.warning(" Skipping verification (task generation failed)") - verification = VerificationResult( - task_id=task.task_id, - subtopic_aligned=False, - difficulty_aligned=False, - reasoning_aligned=False, - choices_appropriate=False, - overall_aligned=False, - feedback="Task generation failed - verification skipped", - ) - verification_results.append(verification) - logger.info(" ✗ SKIPPED") - continue - - # Get blueprint for this task - blueprint = blueprint_dict.get(task.blueprint_id) - blueprint_text = blueprint.blueprint if blueprint else "N/A" - - system_prompt, user_prompt = format_verification_prompt( - capability_domain=capability.domain, - capability_area=capability.area, - capability_name=capability.name, - capability_description=capability.description, - task_blueprint=blueprint_text, - question=task.question, - option_a=task.choices.get("A", ""), - option_b=task.choices.get("B", ""), - option_c=task.choices.get("C", ""), - option_d=task.choices.get("D", ""), - correct_answer=task.correct_answer, - ) - - response = call_llm( - system_prompt=system_prompt, - user_prompt=user_prompt, - response_format={"type": "json_object"}, - ) - - verification_data = json.loads(response) - - # Map new verification format to old format - overall_aligned = verification_data.get("overall_verdict", "Fail") == "Pass" - - verification = VerificationResult( - task_id=task.task_id, - subtopic_aligned=verification_data.get("blueprint_alignment", "No") - == "Yes", - difficulty_aligned=verification_data.get( - "difficulty_reasoning_match", "No" - ) - == "Yes", - reasoning_aligned=verification_data.get("capability_alignment", "No") - == "Yes", - choices_appropriate=verification_data.get("single_correct_answer", "No") - == "Yes", - overall_aligned=overall_aligned, - feedback=verification_data.get("explanation", ""), - ) - verification_results.append(verification) - - status = "✓ PASS" if verification.overall_aligned 
else "✗ FAIL" - logger.info(f" {status}") - - except Exception as e: - logger.error(f" Failed to verify {task.task_id}: {e}") - # Create a verification result with error information - verification = VerificationResult( - task_id=task.task_id, - subtopic_aligned=False, - difficulty_aligned=False, - reasoning_aligned=False, - choices_appropriate=False, - overall_aligned=False, - feedback=f"Verification failed: {str(e)}", - ) - verification_results.append(verification) - logger.info(" ✗ ERROR") - - # Calculate statistics - total = len(verification_results) - passed = sum(1 for v in verification_results if v.overall_aligned) - failed = total - passed - - # Convert to dict for JSON serialization - verification_details_dict = [ - { - "task_id": v.task_id, - "subtopic_aligned": v.subtopic_aligned, - "difficulty_aligned": v.difficulty_aligned, - "reasoning_aligned": v.reasoning_aligned, - "choices_appropriate": v.choices_appropriate, - "overall_aligned": v.overall_aligned, - "feedback": v.feedback, - "suggested_improvements": v.suggested_improvements, - } - for v in verification_results - ] - - summary = { - "total_tasks": total, - "passed": passed, - "failed": failed, - "pass_rate": passed / total if total > 0 else 0, - "verification_details": verification_details_dict, - } - - logger.info("\nVerification Summary:") - logger.info(f" Total tasks: {total}") - logger.info(f" Passed: {passed}") - logger.info(f" Failed: {failed}") - logger.info(f" Pass rate: {summary['pass_rate']:.1%}") - - return summary diff --git a/src/run_generation_pipeline.py b/src/run_generation_pipeline.py index f2087cb..5e54ab1 100644 --- a/src/run_generation_pipeline.py +++ b/src/run_generation_pipeline.py @@ -18,7 +18,6 @@ python -m src.run_capability_generation # defaults to "all" """ -import json import logging import math from datetime import datetime @@ -27,14 +26,18 @@ import hydra from omegaconf import DictConfig, OmegaConf +from src.base_task_generation.generate_diverse_tasks import ( + generate_diverse_tasks_for_capability, +) +from src.base_task_generation.validate_tasks import validate_tasks from src.generate_capabilities import generate_areas, generate_capabilities -from src.generate_diverse_tasks import generate_diverse_tasks_for_capability from src.schemas.domain_schemas import Domain from src.schemas.experiment_schemas import Experiment from src.schemas.io_utils import ( load_areas, load_capabilities, load_domain, + load_solution, save_areas, save_capabilities, save_domain, @@ -43,7 +46,6 @@ save_validation, ) from src.schemas.metadata_schemas import PipelineMetadata -from src.schemas.validation_schemas import ValidationResult from src.utils import constants from src.utils.capability_management_utils import ( filter_schema_capabilities_by_embeddings, @@ -523,10 +525,6 @@ def stage5_validate_tasks( ), ) - # Get validation parameters from config - pass_threshold = cfg.get("task_verification_cfg", {}).get("pass_threshold", 0.8) - strict_mode = cfg.get("task_verification_cfg", {}).get("strict_mode", False) - # Find all task_solutions directories task_solutions_base_dir = ( output_base_dir / experiment_id / "task_solutions" / solution_tag @@ -551,6 +549,24 @@ def stage5_validate_tasks( for capability_dir in capability_dirs: capability_id = capability_dir.name + # Check if validation already exists for this capability (resume logic) + validation_cap_dir = ( + output_base_dir + / experiment_id + / "validations" + / validation_tag + / area_id + / capability_id + ) + + if is_resume and validation_cap_dir.exists(): + 
existing_validations = list(validation_cap_dir.glob("*.json")) + if existing_validations: + logger.info( + f"Skipping {area_id}/{capability_id} - {len(existing_validations)} validations already exist" + ) + continue + # Find all task solution files task_solution_files = list(capability_dir.glob("*.json")) @@ -562,97 +578,31 @@ def stage5_validate_tasks( f"Validating {len(task_solution_files)} task solutions for {area_id}/{capability_id}" ) - for task_solution_file in task_solution_files: - task_id = task_solution_file.stem - - # Check if validation already exists (resume logic) - validation_path = ( - output_base_dir - / experiment_id - / "validations" - / validation_tag - / area_id - / capability_id - / f"{task_id}.json" - ) - - if is_resume and validation_path.exists(): - logger.info( - f"Skipping {area_id}/{capability_id}/{task_id} - validation already exists" - ) - continue + # Load all task solutions for this capability + task_solutions = [] - try: - # Load task solution - with open(task_solution_file, "r") as f: - task_solution_data = json.load(f) - - # Extract necessary information - task_text = task_solution_data.get("task", "") - solution = task_solution_data.get("solution", "") - reasoning = task_solution_data.get("reasoning", "") - generation_metadata = task_solution_data.get( - "generation_metadata", {} - ) - - # For validation, we need to check if the task is well-formed - # Simple validation: check if task has content and solution exists - verification = ( - len(task_text.strip()) > 0 and len(solution.strip()) > 0 - ) - feedback = ( - "Task validation passed" - if verification - else "Task validation failed: missing content" - ) + for task_solution_file in task_solution_files: + task_solution, _ = load_solution(task_solution_file) + task_solutions.append(task_solution) - # Create Task object for ValidationResult - from src.schemas.area_schemas import Area - from src.schemas.capability_schemas import Capability - from src.schemas.domain_schemas import Domain - from src.schemas.task_schemas import Task + if not task_solutions: + logger.warning( + f"No valid task solutions loaded for {area_id}/{capability_id}" + ) + continue - domain = Domain( - name=task_solution_data.get("domain", ""), - domain_id=task_solution_data.get("domain_id", ""), - description="", - ) - area = Area( - name=task_solution_data.get("area", ""), - area_id=task_solution_data.get("area_id", ""), - domain=domain, - description=task_solution_data.get("area_description", ""), - ) - capability = Capability( - name=task_solution_data.get("capability", ""), - capability_id=task_solution_data.get("capability_id", ""), - area=area, - description=task_solution_data.get( - "capability_description", "" - ), - ) - task = Task( - task_id=task_id, - task=task_text, - capability=capability, - ) + # Validate all tasks for this capability + try: + validation_results = validate_tasks( + task_solutions=task_solutions, + client=validator_llm_client, + ) - # Create ValidationResult - validation_result = ValidationResult( - task_id=task_id, - task=task_text, - verification=verification, - feedback=feedback, - task_obj=task, - generation_metadata={ - "method": "simple_validation", - "pass_threshold": pass_threshold, - "strict_mode": strict_mode, - **generation_metadata, - }, - ) + # Save individual validation results + for validation_result in validation_results: + task_id = validation_result.task_id + validation_path = validation_cap_dir / f"{task_id}.json" - # Save validation metadata = PipelineMetadata( 
experiment_id=experiment_id, output_base_dir=str(output_base_dir), @@ -663,21 +613,64 @@ def stage5_validate_tasks( ) save_validation(validation_result, metadata, validation_path) - logger.info( - f"Validated {task_id}: {'✓ PASS' if verification else '✗ FAIL'}" - ) - except Exception as e: - logger.error( - f"Error validating {area_id}/{capability_id}/{task_id}: {e}", - exc_info=True, - ) - continue + logger.info( + f"Validated {area_id}/{capability_id}: " + f"{len(validation_results)} task(s) validated" + ) + + except Exception as e: + logger.error( + f"Error validating tasks for {area_id}/{capability_id}: {e}", + exc_info=True, + ) + continue logger.info(f"Stage 5 completed. Validation tag: {validation_tag}") return validation_tag +def _validate_stage_inputs( + stage: int | str, + areas_tag: str | None, + capabilities_tag: str | None, + solution_tag: str | None, +) -> bool: + """Validate required inputs for standalone stage execution. + + Returns True if validation passes, False otherwise. + """ + if stage == 2 and not areas_tag: + logger.error("areas_tag is required when running stage 2 standalone") + logger.error( + "Usage: python -m src.run_capability_generation stage=2 areas_tag=_YYYYMMDD_HHMMSS" + ) + logger.error( + "Optional: capabilities_tag=_YYYYMMDD_HHMMSS to resume from existing run" + ) + return False + + if stage == 3 and not capabilities_tag: + logger.error("capabilities_tag is required when running stage 3 standalone") + logger.error( + "Usage: python -m src.run_capability_generation stage=3 capabilities_tag=_YYYYMMDD_HHMMSS" + ) + logger.error("Optional: tasks_tag=_YYYYMMDD_HHMMSS to resume from existing run") + return False + + if stage == 5 and not solution_tag: + logger.error("solution_tag is required when running stage 5 standalone") + logger.error( + "Usage: python -m src.run_capability_generation stage=5 solution_tag=_YYYYMMDD_HHMMSS" + ) + logger.error( + "Optional: validation_tag=_YYYYMMDD_HHMMSS to resume from existing run" + ) + return False + + return True + + @hydra.main(version_base=None, config_path="cfg", config_name="run_cfg") def main(cfg: DictConfig) -> None: """Run specific pipeline stages based on configuration. 
@@ -706,9 +699,12 @@ def main(cfg: DictConfig) -> None: areas_tag = cfg.get("areas_tag", None) capabilities_tag = cfg.get("capabilities_tag", None) solution_tag = cfg.get("solution_tag", None) - validation_tag = None - if stage == 0 or stage == "all": + # Validate required inputs for standalone stages + if not _validate_stage_inputs(stage, areas_tag, capabilities_tag, solution_tag): + return + + if stage in {0, "all"}: logger.info("=" * 60) logger.info("STAGE 0: Experiment and Domain Setup") logger.info("=" * 60) @@ -716,7 +712,7 @@ def main(cfg: DictConfig) -> None: if stage == 0: return - if stage == 1 or stage == "all": + if stage in {1, "all"}: logger.info("=" * 60) logger.info("STAGE 1: Area Generation (Hierarchical)") logger.info("=" * 60) @@ -725,22 +721,11 @@ def main(cfg: DictConfig) -> None: if stage == 1: return - if stage == 2 or stage == "all": + if stage in {2, "all"}: logger.info("=" * 60) logger.info("STAGE 2: Capability Generation and Filtering") logger.info("=" * 60) - # When running stage 2 standalone, areas_tag must be provided - if stage == 2 and not areas_tag: - logger.error("areas_tag is required when running stage 2 standalone") - logger.error( - "Usage: python -m src.run_capability_generation stage=2 areas_tag=_YYYYMMDD_HHMMSS" - ) - logger.error( - "Optional: capabilities_tag=_YYYYMMDD_HHMMSS to resume from existing run" - ) - return - # Check if resuming resume_capabilities_tag = ( cfg.get("capabilities_tag", None) if stage == 2 else None @@ -759,22 +744,11 @@ def main(cfg: DictConfig) -> None: if stage == 2: return - if stage == 3 or stage == "all": + if stage in {3, "all"}: logger.info("=" * 60) logger.info("STAGE 3: Diverse Task Generation") logger.info("=" * 60) - # When running stage 3 standalone, capabilities_tag must be provided - if stage == 3 and not capabilities_tag: - logger.error("capabilities_tag is required when running stage 3 standalone") - logger.error( - "Usage: python -m src.run_capability_generation stage=3 capabilities_tag=_YYYYMMDD_HHMMSS" - ) - logger.error( - "Optional: tasks_tag=_YYYYMMDD_HHMMSS to resume from existing run" - ) - return - # Check if resuming resume_tasks_tag = cfg.get("tasks_tag", None) if stage == 3 else None if resume_tasks_tag: @@ -791,22 +765,11 @@ def main(cfg: DictConfig) -> None: if stage == 3: return - if stage == 5 or stage == "all": + if stage in {5, "all"}: logger.info("=" * 60) logger.info("STAGE 5: Task Validation") logger.info("=" * 60) - # When running stage 5 standalone, solution_tag must be provided - if stage == 5 and not solution_tag: - logger.error("solution_tag is required when running stage 5 standalone") - logger.error( - "Usage: python -m src.run_capability_generation stage=5 solution_tag=_YYYYMMDD_HHMMSS" - ) - logger.error( - "Optional: validation_tag=_YYYYMMDD_HHMMSS to resume from existing run" - ) - return - # Check if resuming resume_validation_tag = cfg.get("validation_tag", None) if stage == 5 else None if resume_validation_tag: @@ -820,8 +783,6 @@ def main(cfg: DictConfig) -> None: validation_tag=resume_validation_tag, ) logger.info("Stage 5 validation tag: %s", validation_tag) - if stage == 5: - return if __name__ == "__main__": From 6e9d84d6ecda79effdb5cb39ba885e51fed659d9 Mon Sep 17 00:00:00 2001 From: Farnaz Kohankhaki Date: Mon, 15 Dec 2025 09:01:15 -0800 Subject: [PATCH 3/8] fixed docstrings and comments. 
--- src/base_task_generation/extract_subtopics.py | 11 +- src/base_task_generation/find_combinations.py | 14 +-- .../generate_blueprints.py | 11 +- .../generate_diverse_tasks.py | 21 +--- src/base_task_generation/generate_tasks.py | 18 +-- src/base_task_generation/validate_tasks.py | 11 +- src/generate_capabilities.py | 79 +++++--------- src/run_generation_pipeline.py | 103 ++++++++---------- src/utils/timestamp_utils.py | 13 +++ 9 files changed, 113 insertions(+), 168 deletions(-) create mode 100644 src/utils/timestamp_utils.py diff --git a/src/base_task_generation/extract_subtopics.py b/src/base_task_generation/extract_subtopics.py index 294f7b4..8e68374 100644 --- a/src/base_task_generation/extract_subtopics.py +++ b/src/base_task_generation/extract_subtopics.py @@ -7,6 +7,7 @@ from src.base_task_generation.diverse_task_dataclasses import SubTopic from src.base_task_generation.diverse_task_prompts import format_subtopic_prompt +from src.utils.model_client_utils import ModelCallMode, async_call_model logger = logging.getLogger(__name__) @@ -21,16 +22,17 @@ def extract_subtopics( """Extract sub-topics for the given capability. Args: - capability: Schema Capability object with area.domain.name, area.name, etc. + capability: Capability object client: ChatCompletionClient for API calls min_subtopics: Minimum number of subtopics to generate max_subtopics: Maximum number of subtopics to generate + + Returns + ------- + List of SubTopic objects """ logger.info(f"Extracting sub-topics (range: {min_subtopics}-{max_subtopics}) ...") - # Import here to avoid circular dependency - from src.utils.model_client_utils import ModelCallMode, async_call_model - system_prompt, user_prompt = format_subtopic_prompt( capability_name=capability.name, capability_description=capability.description, @@ -51,7 +53,6 @@ def extract_subtopics( subtopic_names = response.get("sub_topics", []) - # Create SubTopic objects subtopics = [SubTopic(name=name) for name in subtopic_names] logger.info(f"Extracted {len(subtopics)} sub-topics:") diff --git a/src/base_task_generation/find_combinations.py b/src/base_task_generation/find_combinations.py index c591e4d..b0febff 100644 --- a/src/base_task_generation/find_combinations.py +++ b/src/base_task_generation/find_combinations.py @@ -11,6 +11,7 @@ ) from src.base_task_generation.diverse_task_dataclasses import Combination, SubTopic from src.base_task_generation.diverse_task_prompts import format_combination_prompt +from src.utils.model_client_utils import ModelCallMode, async_call_model logger = logging.getLogger(__name__) @@ -22,15 +23,16 @@ def find_valid_combinations( """Find valid combinations of Content, Difficulty, and Reasoning. Args: - capability: Schema Capability object with area.domain.name, area.name, etc. + capability: Capability object subtopics: List of SubTopic objects client: ChatCompletionClient for API calls + + Returns + ------- + List of Combination objects """ logger.info("Finding valid combinations...") - # Import here to avoid circular dependency - from src.utils.model_client_utils import ModelCallMode, async_call_model - # Get difficulty levels and reasoning types from constants difficulty_levels = list(DIFFICULTY_LEVELS.keys()) reasoning_types = list(BLOOMS_TAXONOMY.keys()) @@ -50,7 +52,6 @@ def find_valid_combinations( logger.info(f"Generated {len(all_combinations)} total combinations to validate") - # Format combinations as a numbered list for the LLM content_list = "\n".join( [ f"{i + 1}. 
Content: {c['content']}, Difficulty: {c['difficulty']}, Reasoning: {c['reasoning']}" @@ -77,7 +78,6 @@ def find_valid_combinations( combinations_data = response.get("valid_combinations", []) - # Create Combination objects combinations = [ Combination( content=combo["content"], @@ -90,7 +90,7 @@ def find_valid_combinations( logger.info( f"Found {len(combinations)} valid combinations out of {len(all_combinations)} total:" ) - for i, combo in enumerate(combinations[:5]): # Show first 5 + for i, combo in enumerate(combinations[:5]): logger.info( f" {i + 1}. {combo.content} | {combo.difficulty} | {combo.reasoning}" ) diff --git a/src/base_task_generation/generate_blueprints.py b/src/base_task_generation/generate_blueprints.py index 89e3ebf..2358bbd 100644 --- a/src/base_task_generation/generate_blueprints.py +++ b/src/base_task_generation/generate_blueprints.py @@ -11,6 +11,7 @@ ) from src.base_task_generation.diverse_task_dataclasses import Blueprint, Combination from src.base_task_generation.diverse_task_prompts import format_blueprint_prompt +from src.utils.model_client_utils import ModelCallMode, async_call_model logger = logging.getLogger(__name__) @@ -24,15 +25,16 @@ def generate_blueprints( """Generate task blueprints for each valid combination. Args: - capability: Schema Capability object with area.domain.name, area.name, etc. + capability: Capability object combinations: List of Combination objects client: ChatCompletionClient for API calls + + Returns + ------- + List of Blueprint objects """ logger.info("Generating task blueprints...") - # Import here to avoid circular dependency - from src.utils.model_client_utils import ModelCallMode, async_call_model - blueprints = [] for i, combo in enumerate(combinations): @@ -64,7 +66,6 @@ def generate_blueprints( ) ) - # Validate response has blueprint key if "blueprint" not in response: logger.error( f"Response missing 'blueprint' key. Response keys: {response.keys()}" diff --git a/src/base_task_generation/generate_diverse_tasks.py b/src/base_task_generation/generate_diverse_tasks.py index 041019e..ed5477b 100644 --- a/src/base_task_generation/generate_diverse_tasks.py +++ b/src/base_task_generation/generate_diverse_tasks.py @@ -1,21 +1,14 @@ -"""Generate diverse tasks using base task generation multi-dimensional approach. - -This module adapts the base task generator for use in Stage 3 -of the standardized pipeline. -""" +"""Generate diverse tasks using multi-dimensional approach.""" import logging from typing import List from autogen_core.models import ChatCompletionClient -# Import base task generation components from src.base_task_generation.extract_subtopics import extract_subtopics from src.base_task_generation.find_combinations import find_valid_combinations from src.base_task_generation.generate_blueprints import generate_blueprints from src.base_task_generation.generate_tasks import generate_tasks - -# Import schema objects from src.schemas.capability_schemas import Capability from src.schemas.solution_schemas import TaskSolution @@ -32,37 +25,31 @@ def generate_diverse_tasks_for_capability( ) -> List[TaskSolution]: """Generate diverse tasks with solutions for a single capability. - Uses multi-dimensional task generation approach. 
- Args: - capability: Schema Capability object + capability: Capability object tasks_per_blueprint: Number of tasks to generate per blueprint - client: ChatCompletionClient from get_standard_model_client + client: ChatCompletionClient for API calls min_subtopics: Minimum number of subtopics to generate max_subtopics: Maximum number of subtopics to generate Returns ------- - List of schema TaskSolution objects + List of TaskSolution objects """ logger.info(f"Generating diverse tasks for capability: {capability.name}") - # Step 1: Extract sub-topics logger.info("Step 1: Extracting sub-topics") subtopics = extract_subtopics(capability, client, min_subtopics, max_subtopics) logger.info(f"Extracted {len(subtopics)} sub-topics") - # Step 2: Find valid combinations logger.info("Step 2: Finding valid combinations") combinations = find_valid_combinations(capability, subtopics, client) logger.info(f"Found {len(combinations)} valid combinations") - # Step 3: Generate blueprints logger.info("Step 3: Generating blueprints") blueprints = generate_blueprints(capability, combinations, client) logger.info(f"Generated {len(blueprints)} blueprints") - # Step 4: Generate tasks (returns schema TaskSolution objects directly) logger.info("Step 4: Generating tasks with solutions") task_solutions = generate_tasks(capability, blueprints, client, tasks_per_blueprint) logger.info(f"Generated {len(task_solutions)} task solutions") diff --git a/src/base_task_generation/generate_tasks.py b/src/base_task_generation/generate_tasks.py index 35dffb2..bdf6ff4 100644 --- a/src/base_task_generation/generate_tasks.py +++ b/src/base_task_generation/generate_tasks.py @@ -7,6 +7,9 @@ from src.base_task_generation.diverse_task_dataclasses import Blueprint from src.base_task_generation.diverse_task_prompts import format_task_prompt +from src.schemas.solution_schemas import TaskSolution +from src.schemas.task_schemas import Task +from src.utils.model_client_utils import ModelCallMode, async_call_model logger = logging.getLogger(__name__) @@ -21,22 +24,17 @@ def generate_tasks( """Generate multiple-choice questions for each blueprint. Args: - capability: Schema Capability object with area.domain.name, area.name, etc. + capability: Capability object blueprints: List of Blueprint objects client: ChatCompletionClient for API calls tasks_per_blueprint: Number of tasks to generate per blueprint Returns ------- - List of schema TaskSolution objects + List of TaskSolution objects """ logger.info("Generating tasks from blueprints...") - # Import here to avoid circular dependency - from src.schemas.solution_schemas import TaskSolution - from src.schemas.task_schemas import Task - from src.utils.model_client_utils import ModelCallMode, async_call_model - all_task_solutions = [] for blueprint in blueprints: @@ -46,7 +44,6 @@ def generate_tasks( f"{blueprint.difficulty} | {blueprint.reasoning}" ) - # Generate multiple tasks for this blueprint for _j in range(tasks_per_blueprint): task_id = f"task_{len(all_task_solutions):03d}" @@ -68,12 +65,10 @@ def generate_tasks( ) ) - # Format the task question with multiple choice options task_text = f"{response['question']}\n\n" for choice_key, choice_text in response["options"].items(): task_text += f"{choice_key}. 
{choice_text}\n"
 
-            # Create metadata dict
             generation_metadata = {
                 "method": "diverse_task_generation",
                 "blueprint_id": blueprint.combination_id,
@@ -85,14 +80,12 @@ def generate_tasks(
                 "alignment_notes": response.get("alignment_notes", ""),
             }
 
-            # Create schema Task object (without generation_metadata)
             task = Task(
                 task_id=task_id,
                 task=task_text,
                 capability=capability,
             )
 
-            # Create TaskSolution object with the task and metadata
             task_solution = TaskSolution(
                 task_id=task_id,
                 task=task_text,
@@ -105,7 +98,6 @@ def generate_tasks(
 
         except Exception as e:
             logger.error(f"  Failed to generate {task_id}: {e}")
-            # Skip failed tasks and continue
             continue
 
     logger.info(
diff --git a/src/base_task_generation/validate_tasks.py b/src/base_task_generation/validate_tasks.py
index ed0c2aa..76ab93f 100644
--- a/src/base_task_generation/validate_tasks.py
+++ b/src/base_task_generation/validate_tasks.py
@@ -22,12 +22,12 @@ def validate_tasks(
     """Validate that generated tasks align with intended dimensions.
 
     Args:
-        task_solutions: List of schema TaskSolution objects (non-empty)
+        task_solutions: List of TaskSolution objects
         client: ChatCompletionClient for API calls
 
     Returns
     -------
-    List of ValidationResult objects for validated tasks
+    List of ValidationResult objects
     """
     logger.info("Validating task alignment...")
 
     validation_results = []
@@ -40,15 +40,12 @@ def validate_tasks(
         capability = task_solution.task_obj.capability
 
         try:
-            # Get blueprint info from generation_metadata
             blueprint_info = task_solution.generation_metadata or {}
             blueprint_text = blueprint_info.get("blueprint", "N/A")
 
-            # Parse the task to extract question and choices
             task_lines = task_solution.task.strip().split("\n")
             question = task_lines[0] if task_lines else ""
 
-            # Extract choices (A, B, C, D)
             choices = {}
             for task_line in task_lines[1:]:
                 line = task_line.strip()
                 if line and len(line) > 2 and line[1] == ".":
@@ -80,17 +77,14 @@ def validate_tasks(
                 )
             )
 
-            # Map verification response to result format
             overall_aligned = response.get("overall_verdict", "Fail") == "Pass"
 
-            # Create Task object
             task = Task(
                 task_id=task_solution.task_id,
                 task=task_solution.task,
                 capability=capability,
             )
 
-            # Create ValidationResult schema object
             validation_result = ValidationResult(
                 task_id=task_solution.task_id,
                 task=task_solution.task,
@@ -123,7 +117,6 @@ def validate_tasks(
 
         except Exception as e:
             logger.error(f"  Failed to validate {task_solution.task_id}: {e}")
             logger.info("  ✗ ERROR - Skipping this task")
-            # Skip tasks that fail verification
             continue
 
     return validation_results
diff --git a/src/generate_capabilities.py b/src/generate_capabilities.py
index 4696b87..d80be83 100644
--- a/src/generate_capabilities.py
+++ b/src/generate_capabilities.py
@@ -2,7 +2,7 @@
 
 import asyncio
 import logging
-from typing import Any, Dict, List
+from typing import List
 
 import numpy as np
 from autogen_core.models import ChatCompletionClient
@@ -23,20 +23,17 @@ def generate_areas(
     num_capabilities_per_area: int,
     scientist_llm_client: ChatCompletionClient,
 ) -> List[Area]:
-    """
-    Generate areas for the specified domain.
+    """Generate areas for the specified domain.
 
-    Args
-    ----
-    domain (str): The domain name.
-    num_areas (int): The number of areas to generate.
-    num_capabilities_per_area (int): The number of capabilities per area.
-    scientist_llm (ChatCompletionClient): The scientist LLM client.
+ Args: + domain: Domain object + num_areas: Number of areas to generate + num_capabilities_per_area: Number of capabilities per area + scientist_llm_client: LLM client for generation Returns ------- - Dict[str, Any]: A dictionary containing the generated areas - and metadata about the generation process. + List of generated Area objects """ logger.info(f"Generating {num_areas} areas ...") user_prompt = prompts.AREAS_GENERATION_USER_PROMPT.format( @@ -46,7 +43,6 @@ def generate_areas( response_json_format=prompts.AREAS_GENERATION_RESPONSE_JSON_FORMAT, ) - # Use async_call_model with asyncio.run() for sync execution response = asyncio.run( async_call_model( scientist_llm_client, @@ -77,22 +73,17 @@ def generate_capabilities( num_capabilities_per_run: int, scientist_llm_client: ChatCompletionClient, ) -> List[Capability]: - """ - Generate capabilities for a given area. - - Args - ---- - area (Area): The area object containing domain information. - num_capabilities (int): The number of capabilities to generate. - num_capabilities_per_run (int): The number of capabilities to generate per run. - scientist_llm (ChatCompletionClient): The scientist LLM client. - scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration - for the scientist LLM. - **kwargs (Any): Additional keyword arguments. + """Generate capabilities for a given area. + + Args: + area: Area object + num_capabilities: Total number of capabilities to generate + num_capabilities_per_run: Number of capabilities per LLM call + scientist_llm_client: LLM client for generation Returns ------- - List[Capability]: The generated capabilities. + List of generated Capability objects """ capabilities = [] @@ -121,31 +112,18 @@ def generate_capabilities_using_llm( num_capabilities: int, scientist_llm_client: ChatCompletionClient, prev_capabilities: List[Capability], -) -> Dict[str, Any]: - """ - Generate capabilities using the scientist LLM. - - Prompt the scientist LLM with instructions to generate initial capabilities. - - Args - ---- - domain_name (str): The domain name. - area_name (str): The area name. - area (Area): The area object. - num_capabilities (int): The number of capabilities to generate. - scientist_llm (ChatCompletionClient): The scientist LLM client. - scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration - for the scientist LLM. - sys_prompt (str): The system prompt. - user_prompt (str): The user prompt. - prev_capabilities (List[Capability]): The list of previously - generated capabilities. - **kwargs (Any): Additional keyword arguments. +) -> List[Capability]: + """Generate capabilities using LLM. + + Args: + area: Area object + num_capabilities: Number of capabilities to generate + scientist_llm_client: LLM client for generation + prev_capabilities: Previously generated capabilities Returns ------- - Dict[str, Any]: A dictionary containing the generated capabilities - and metadata about the generation process. 
+ List of generated Capability objects """ sys_prompt = prompts.CAPABILITY_GENERATION_SYSTEM_PROMPT user_prompt = prompts.HIERARCHICAL_CAPABILITY_GENERATION_USER_PROMPT.format( @@ -155,8 +133,6 @@ def generate_capabilities_using_llm( prev_capabilities="\n".join([elm.name for elm in prev_capabilities]), ) - # Use async_call_model with asyncio.run() for sync execution - # Retry logic is handled inside async_call_model response = asyncio.run( async_call_model( scientist_llm_client, @@ -166,13 +142,11 @@ def generate_capabilities_using_llm( ) ) - # Response is already a parsed dict from JSON_PARSE mode gen_capabilities_dict = response.get("capabilities", []) capabilities = [] for idx, capability_dict in enumerate(gen_capabilities_dict): try: - # Create capability object without saving to disk capability_id = f"cap_{idx:03d}" capability = Capability( name=capability_dict["name"], @@ -182,9 +156,8 @@ def generate_capabilities_using_llm( ) except Exception as e: logger.warning( - f"Error creating capability object {capability_dict['name']}, hence skipping it: {e}" + f"Error creating capability {capability_dict['name']}, skipping: {e}" ) - # Skip this capability continue else: capabilities.append(capability) diff --git a/src/run_generation_pipeline.py b/src/run_generation_pipeline.py index 5e54ab1..3d7a680 100644 --- a/src/run_generation_pipeline.py +++ b/src/run_generation_pipeline.py @@ -1,26 +1,26 @@ -"""Script to generate capabilities (Stage 2) and tasks (Stage 3). +"""Multi-stage pipeline for capability and task generation. -This module keeps the existing behavior but makes the flow explicit: -- Stage 0: setup (config validation, run id, model init) -- Stage 1: create a minimal single area artifact (schema alignment) -- Stage 2: generate capabilities, embed + filter -- Stage 3: generate tasks for retained capabilities +This module orchestrates the complete generation pipeline: +- Stage 0: Experiment and domain setup +- Stage 1: Area generation +- Stage 2: Capability generation and filtering +- Stage 3: Task generation with solutions +- Stage 5: Task validation Usage: - # Run specific stage using Hydra override syntax - python -m src.run_capability_generation stage=0 - python -m src.run_capability_generation stage=1 - python -m src.run_capability_generation stage=2 areas_tag=_20251211_214002 - python -m src.run_capability_generation stage=3 capabilities_tag=_20251211_220000 - # Run all stages - python -m src.run_capability_generation stage=all - python -m src.run_capability_generation # defaults to "all" + python -m src.run_generation_pipeline stage=all + + # Run specific stage + python -m src.run_generation_pipeline stage=0 + python -m src.run_generation_pipeline stage=1 + python -m src.run_generation_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS + python -m src.run_generation_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS + python -m src.run_generation_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS """ import logging import math -from datetime import datetime from pathlib import Path import hydra @@ -55,6 +55,7 @@ generate_schema_capabilities_embeddings, ) from src.utils.model_client_utils import get_standard_model_client +from src.utils.timestamp_utils import iso_timestamp, timestamp_tag logger = logging.getLogger(__name__) @@ -63,7 +64,7 @@ def stage0_setup( cfg: DictConfig, ) -> None: - """Stage 0: basic setup (config check, run id, base dir, schema files).""" + """Stage 0: Experiment and domain setup.""" check_cfg(cfg, logger) exp_id = cfg.exp_cfg.exp_id output_base_dir = 
Path(cfg.global_cfg.output_dir) @@ -98,7 +99,7 @@ def stage0_setup( metadata = PipelineMetadata( experiment_id=exp_id, output_base_dir=str(output_base_dir), - timestamp=_iso_timestamp(), + timestamp=iso_timestamp(), input_stage_tag=None, output_stage_tag=None, resume=False, @@ -118,24 +119,11 @@ def stage0_setup( ) -def _timestamp_tag() -> str: - """Return a timestamp tag in `_YYYYMMDD_HHMMSS` format.""" - return f"_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}" - - -def _iso_timestamp() -> str: - """Return an ISO 8601 formatted timestamp with UTC timezone.""" - return datetime.utcnow().isoformat() + "Z" - - def stage1_generate_areas(cfg: DictConfig) -> str: - """ - Stage 1: Generate capability areas using hierarchical method. - - Uses LLM to generate multiple areas within the domain. + """Stage 1: Generate capability areas. Args: - cfg: The configuration object + cfg: Configuration object Returns ------- @@ -180,12 +168,12 @@ def stage1_generate_areas(cfg: DictConfig) -> str: areas = areas[:num_areas] # Save areas - areas_tag = _timestamp_tag() + areas_tag = timestamp_tag() areas_path = output_base_dir / experiment_id / "areas" / areas_tag / "areas.json" metadata = PipelineMetadata( experiment_id=experiment_id, output_base_dir=str(output_base_dir), - timestamp=_iso_timestamp(), + timestamp=iso_timestamp(), input_stage_tag=None, output_stage_tag=areas_tag, resume=False, @@ -200,12 +188,12 @@ def stage2_generate_and_filter_capabilities( areas_tag: str, capabilities_tag: str = None, ) -> str: - """Stage 2: generate capabilities, embed, and filter per schema intent. + """Stage 2: Generate capabilities, embed, and filter. Args: - cfg: The configuration object - areas_tag: The tag from Stage 1 to load areas from - capabilities_tag: Optional resume tag. If provided, resumes from existing tag. + cfg: Configuration object + areas_tag: Tag from Stage 1 to load areas from + capabilities_tag: Optional resume tag Returns ------- @@ -237,7 +225,7 @@ def stage2_generate_and_filter_capabilities( if is_resume: logger.info(f"Resuming Stage 2 with capabilities_tag: {capabilities_tag}") else: - capabilities_tag = _timestamp_tag() + capabilities_tag = timestamp_tag() logger.info(f"Starting new Stage 2 with capabilities_tag: {capabilities_tag}") # Calculate target capabilities per area @@ -269,7 +257,7 @@ def stage2_generate_and_filter_capabilities( logger.info(f"Generating capabilities for area: {area.name} ({area.area_id})") - # Generate capabilities using existing function + # Generate capabilities capabilities = generate_capabilities( area=area, num_capabilities=num_capabilities_per_area, @@ -317,7 +305,7 @@ def stage2_generate_and_filter_capabilities( metadata = PipelineMetadata( experiment_id=experiment_id, output_base_dir=str(output_base_dir), - timestamp=_iso_timestamp(), + timestamp=iso_timestamp(), input_stage_tag=areas_tag, output_stage_tag=capabilities_tag, resume=is_resume, @@ -336,15 +324,12 @@ def stage3_generate_tasks( capabilities_tag: str, tasks_tag: str = None, ) -> str: - """Stage 3: Generate diverse tasks with solutions for each capability. - - Generates tasks using the diverse task generation method and creates - TaskSolution objects with the correct answer and explanation. + """Stage 3: Generate tasks with solutions for each capability. Args: - cfg: The configuration object - capabilities_tag: The tag from Stage 2 to load capabilities from - tasks_tag: Optional resume tag. If provided, resumes from existing tag. 
+ cfg: Configuration object + capabilities_tag: Tag from Stage 2 to load capabilities from + tasks_tag: Optional resume tag Returns ------- @@ -358,7 +343,7 @@ def stage3_generate_tasks( if is_resume: logger.info(f"Resuming Stage 3 with tasks_tag: {tasks_tag}") else: - tasks_tag = _timestamp_tag() + tasks_tag = timestamp_tag() logger.info(f"Starting new Stage 3 with tasks_tag: {tasks_tag}") # Initialize scientist LLM client using task_generation config @@ -443,13 +428,13 @@ def stage3_generate_tasks( metadata = PipelineMetadata( experiment_id=experiment_id, output_base_dir=str(output_base_dir), - timestamp=_iso_timestamp(), + timestamp=iso_timestamp(), input_stage_tag=capabilities_tag, output_stage_tag=tasks_tag, resume=is_resume, ) - # Save task solutions in task_solutions directory + # Save task solutions for task_solution in task_solutions: solution_path = ( output_base_dir @@ -486,9 +471,9 @@ def stage5_validate_tasks( """Stage 5: Validate generated task solutions. Args: - cfg: The configuration object - solution_tag: The tag from Stage 3 to load task solutions from - validation_tag: Optional resume tag. If provided, resumes from existing tag. + cfg: Configuration object + solution_tag: Tag from Stage 3 to load task solutions from + validation_tag: Optional resume tag Returns ------- @@ -502,7 +487,7 @@ def stage5_validate_tasks( if is_resume: logger.info(f"Resuming Stage 5 with validation_tag: {validation_tag}") else: - validation_tag = _timestamp_tag() + validation_tag = timestamp_tag() logger.info(f"Starting new Stage 5 with validation_tag: {validation_tag}") # Initialize validator LLM client @@ -606,7 +591,7 @@ def stage5_validate_tasks( metadata = PipelineMetadata( experiment_id=experiment_id, output_base_dir=str(output_base_dir), - timestamp=_iso_timestamp(), + timestamp=iso_timestamp(), input_stage_tag=solution_tag, output_stage_tag=validation_tag, resume=is_resume, @@ -643,7 +628,7 @@ def _validate_stage_inputs( if stage == 2 and not areas_tag: logger.error("areas_tag is required when running stage 2 standalone") logger.error( - "Usage: python -m src.run_capability_generation stage=2 areas_tag=_YYYYMMDD_HHMMSS" + "Usage: python -m src.run_generation_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS" ) logger.error( "Optional: capabilities_tag=_YYYYMMDD_HHMMSS to resume from existing run" @@ -653,7 +638,7 @@ def _validate_stage_inputs( if stage == 3 and not capabilities_tag: logger.error("capabilities_tag is required when running stage 3 standalone") logger.error( - "Usage: python -m src.run_capability_generation stage=3 capabilities_tag=_YYYYMMDD_HHMMSS" + "Usage: python -m src.run_generation_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS" ) logger.error("Optional: tasks_tag=_YYYYMMDD_HHMMSS to resume from existing run") return False @@ -661,7 +646,7 @@ def _validate_stage_inputs( if stage == 5 and not solution_tag: logger.error("solution_tag is required when running stage 5 standalone") logger.error( - "Usage: python -m src.run_capability_generation stage=5 solution_tag=_YYYYMMDD_HHMMSS" + "Usage: python -m src.run_generation_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS" ) logger.error( "Optional: validation_tag=_YYYYMMDD_HHMMSS to resume from existing run" diff --git a/src/utils/timestamp_utils.py b/src/utils/timestamp_utils.py new file mode 100644 index 0000000..21a1e8e --- /dev/null +++ b/src/utils/timestamp_utils.py @@ -0,0 +1,13 @@ +"""Timestamp utility functions for the pipeline.""" + +from datetime import datetime + + +def timestamp_tag() -> str: + """Return a timestamp 
tag in `_YYYYMMDD_HHMMSS` format.""" + return f"_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}" + + +def iso_timestamp() -> str: + """Return an ISO 8601 formatted timestamp with UTC timezone.""" + return datetime.utcnow().isoformat() + "Z" From 550941018104b45471ee4769936b8758bc1346d2 Mon Sep 17 00:00:00 2001 From: Farnaz Kohankhaki Date: Tue, 16 Dec 2025 18:00:52 -0800 Subject: [PATCH 4/8] separated out task solution generation from task generation. --- README.md | 34 +- src/base_stages/__init__.py | 45 + .../extract_subtopics.py | 4 +- .../find_combinations.py | 8 +- src/base_stages/generate_areas.py | 68 ++ .../generate_blueprints.py | 6 +- .../generate_capabilities.py | 83 +- .../generate_diverse_tasks.py | 28 +- src/base_stages/generate_tasks.py | 141 ++++ .../prompts.py} | 323 +++++++- src/base_stages/solve_tasks.py | 116 +++ src/base_stages/stage0_setup.py | 84 ++ src/base_stages/stage1_areas.py | 84 ++ src/base_stages/stage2_capabilities.py | 172 ++++ src/base_stages/stage3_tasks.py | 160 ++++ src/base_stages/stage4_solutions.py | 159 ++++ src/base_stages/stage5_validation.py | 175 ++++ .../task_constants.py} | 0 .../task_dataclasses.py} | 15 +- .../validate_tasks.py | 6 +- src/base_task_generation/generate_tasks.py | 109 --- src/cfg/run_cfg.yaml | 30 +- src/model.py | 6 +- src/run_base_pipeline.py | 246 ++++++ src/run_generation_pipeline.py | 775 ------------------ src/schemas/task_schemas.py | 10 +- src/utils/data_utils.py | 4 +- 27 files changed, 1841 insertions(+), 1050 deletions(-) create mode 100644 src/base_stages/__init__.py rename src/{base_task_generation => base_stages}/extract_subtopics.py (91%) rename src/{base_task_generation => base_stages}/find_combinations.py (91%) create mode 100644 src/base_stages/generate_areas.py rename src/{base_task_generation => base_stages}/generate_blueprints.py (91%) rename src/{ => base_stages}/generate_capabilities.py (62%) rename src/{base_task_generation => base_stages}/generate_diverse_tasks.py (63%) create mode 100644 src/base_stages/generate_tasks.py rename src/{base_task_generation/diverse_task_prompts.py => base_stages/prompts.py} (64%) create mode 100644 src/base_stages/solve_tasks.py create mode 100644 src/base_stages/stage0_setup.py create mode 100644 src/base_stages/stage1_areas.py create mode 100644 src/base_stages/stage2_capabilities.py create mode 100644 src/base_stages/stage3_tasks.py create mode 100644 src/base_stages/stage4_solutions.py create mode 100644 src/base_stages/stage5_validation.py rename src/{base_task_generation/diverse_task_constants.py => base_stages/task_constants.py} (100%) rename src/{base_task_generation/diverse_task_dataclasses.py => base_stages/task_dataclasses.py} (82%) rename src/{base_task_generation => base_stages}/validate_tasks.py (95%) delete mode 100644 src/base_task_generation/generate_tasks.py create mode 100644 src/run_base_pipeline.py delete mode 100644 src/run_generation_pipeline.py diff --git a/README.md b/README.md index 5493c04..4ffea79 100644 --- a/README.md +++ b/README.md @@ -50,53 +50,59 @@ gcloud auth application-default login 2. Modify `src/cfg/run_cfg.yaml`, if required. 
-### Generation Pipeline +### Base Pipeline -The generation pipeline consists of multiple stages that can be run sequentially or individually: +The base (non-agentic) pipeline consists of multiple stages that can be run sequentially or individually: - **Stage 0**: Experiment and domain setup - **Stage 1**: Area generation - **Stage 2**: Capability generation and filtering -- **Stage 3**: Task generation with solutions +- **Stage 3**: Task generation (questions with options) +- **Stage 4**: Solution generation (determine correct answers) - **Stage 5**: Task validation #### Run All Stages ```bash -python -m src.run_generation_pipeline stage=all +python -m src.run_base_pipeline stage=all ``` #### Run Individual Stages ```bash # Stage 0: Setup -python -m src.run_generation_pipeline stage=0 +python -m src.run_base_pipeline stage=0 # Stage 1: Generate areas -python -m src.run_generation_pipeline stage=1 +python -m src.run_base_pipeline stage=1 # Stage 2: Generate capabilities (requires areas_tag from Stage 1) -python -m src.run_generation_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS +python -m src.run_base_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS -# Stage 3 and 4: Generate tasks and solutions (requires capabilities_tag from Stage 2) +# Stage 3: Generate tasks (requires capabilities_tag from Stage 2) +python -m src.run_base_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS -python -m src.run_generation_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS +# Stage 4: Generate solutions (requires tasks_tag from Stage 3) +python -m src.run_base_pipeline stage=4 tasks_tag=_YYYYMMDD_HHMMSS -# Stage 5: Validate tasks (requires solution_tag from Stage 3) -python -m src.run_generation_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS +# Stage 5: Validate tasks (requires solution_tag from Stage 4) +python -m src.run_base_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS ``` #### Resume from Existing Runs ```bash # Resume Stage 2 from existing capabilities_tag -python -m src.run_generation_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS capabilities_tag=_YYYYMMDD_HHMMSS +python -m src.run_base_pipeline stage=2 areas_tag=_YYYYMMDD_HHMMSS capabilities_tag=_YYYYMMDD_HHMMSS # Resume Stage 3 from existing tasks_tag -python -m src.run_generation_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS tasks_tag=_YYYYMMDD_HHMMSS +python -m src.run_base_pipeline stage=3 capabilities_tag=_YYYYMMDD_HHMMSS tasks_tag=_YYYYMMDD_HHMMSS + +# Resume Stage 4 from existing solution_tag +python -m src.run_base_pipeline stage=4 tasks_tag=_YYYYMMDD_HHMMSS solution_tag=_YYYYMMDD_HHMMSS # Resume Stage 5 from existing validation_tag -python -m src.run_generation_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS validation_tag=_YYYYMMDD_HHMMSS +python -m src.run_base_pipeline stage=5 solution_tag=_YYYYMMDD_HHMMSS validation_tag=_YYYYMMDD_HHMMSS ``` ### Evaluation of subject LLM on generated capabilities diff --git a/src/base_stages/__init__.py b/src/base_stages/__init__.py new file mode 100644 index 0000000..5ee586d --- /dev/null +++ b/src/base_stages/__init__.py @@ -0,0 +1,45 @@ +"""Base (non-agentic) pipeline stages and utilities. 
+ +This module contains all base pipeline stages and the utilities they use: + +Stages: +- stage0_setup: Experiment and domain setup +- stage1_areas: Area generation +- stage2_capabilities: Capability generation and filtering +- stage3_tasks: Task generation +- stage4_solutions: Solution generation +- stage5_validation: Task validation + +Utilities: +- generate_areas: Area generation using LLM +- generate_capabilities: Capability generation using LLM +- generate_diverse_tasks: Orchestrates subtopic→combination→blueprint→task pipeline +- generate_tasks: Task (question + options) generation +- solve_tasks: Task solving to determine correct answers +- validate_tasks: Task validation + +Supporting modules: +- task_constants: Bloom's taxonomy, difficulty levels +- task_dataclasses: SubTopic, Combination, Blueprint, etc. +- task_prompts: All LLM prompts for task generation pipeline +- extract_subtopics: Sub-topic extraction +- find_combinations: Valid combination finding +- generate_blueprints: Blueprint generation +""" + +from src.base_stages.stage0_setup import run_stage0 +from src.base_stages.stage1_areas import run_stage1 +from src.base_stages.stage2_capabilities import run_stage2 +from src.base_stages.stage3_tasks import run_stage3 +from src.base_stages.stage4_solutions import run_stage4 +from src.base_stages.stage5_validation import run_stage5 + + +__all__ = [ + "run_stage0", + "run_stage1", + "run_stage2", + "run_stage3", + "run_stage4", + "run_stage5", +] diff --git a/src/base_task_generation/extract_subtopics.py b/src/base_stages/extract_subtopics.py similarity index 91% rename from src/base_task_generation/extract_subtopics.py rename to src/base_stages/extract_subtopics.py index 8e68374..5e0bfeb 100644 --- a/src/base_task_generation/extract_subtopics.py +++ b/src/base_stages/extract_subtopics.py @@ -5,8 +5,8 @@ from autogen_core.models import ChatCompletionClient -from src.base_task_generation.diverse_task_dataclasses import SubTopic -from src.base_task_generation.diverse_task_prompts import format_subtopic_prompt +from src.base_stages.prompts import format_subtopic_prompt +from src.base_stages.task_dataclasses import SubTopic from src.utils.model_client_utils import ModelCallMode, async_call_model diff --git a/src/base_task_generation/find_combinations.py b/src/base_stages/find_combinations.py similarity index 91% rename from src/base_task_generation/find_combinations.py rename to src/base_stages/find_combinations.py index b0febff..3fcc406 100644 --- a/src/base_task_generation/find_combinations.py +++ b/src/base_stages/find_combinations.py @@ -5,12 +5,12 @@ from autogen_core.models import ChatCompletionClient -from src.base_task_generation.diverse_task_constants import ( +from src.base_stages.prompts import format_combination_prompt +from src.base_stages.task_constants import ( BLOOMS_TAXONOMY, DIFFICULTY_LEVELS, ) -from src.base_task_generation.diverse_task_dataclasses import Combination, SubTopic -from src.base_task_generation.diverse_task_prompts import format_combination_prompt +from src.base_stages.task_dataclasses import Combination, SubTopic from src.utils.model_client_utils import ModelCallMode, async_call_model @@ -97,4 +97,4 @@ def find_valid_combinations( if len(combinations) > 5: logger.info(f" ... 
and {len(combinations) - 5} more") - return combinations[0:1] + return combinations diff --git a/src/base_stages/generate_areas.py b/src/base_stages/generate_areas.py new file mode 100644 index 0000000..318ce81 --- /dev/null +++ b/src/base_stages/generate_areas.py @@ -0,0 +1,68 @@ +"""Generate areas using the scientist LLM.""" + +import asyncio +import logging +from typing import List + +from autogen_core.models import ChatCompletionClient + +from src.base_stages.prompts import ( + AREAS_GENERATION_RESPONSE_JSON_FORMAT, + AREAS_GENERATION_USER_PROMPT, +) +from src.schemas.area_schemas import Area +from src.schemas.domain_schemas import Domain +from src.utils.model_client_utils import ModelCallMode, async_call_model + + +logger = logging.getLogger(__name__) + + +def generate_areas( + domain: Domain, + num_areas: int, + num_capabilities_per_area: int, + client: ChatCompletionClient, +) -> List[Area]: + """Generate areas for the specified domain. + + Args: + domain: Domain object + num_areas: Number of areas to generate + num_capabilities_per_area: Number of capabilities per area + client: ChatCompletionClient for API calls + + Returns + ------- + List of generated Area objects + """ + logger.info(f"Generating {num_areas} areas ...") + user_prompt = AREAS_GENERATION_USER_PROMPT.format( + num_areas=num_areas, + num_capabilities_per_area=num_capabilities_per_area, + domain=domain.name, + response_json_format=AREAS_GENERATION_RESPONSE_JSON_FORMAT, + ) + + response = asyncio.run( + async_call_model( + client, + system_prompt="", + user_prompt=user_prompt, + mode=ModelCallMode.JSON_PARSE, + ) + ) + + areas = [] + for idx, area_name in enumerate(response.get("areas", [])): + area = Area( + name=area_name, + area_id=f"area_{idx:03d}", + domain=domain, + description="", + ) + areas.append(area) + + logger.info(f"Generated {len(areas)} areas") + + return areas diff --git a/src/base_task_generation/generate_blueprints.py b/src/base_stages/generate_blueprints.py similarity index 91% rename from src/base_task_generation/generate_blueprints.py rename to src/base_stages/generate_blueprints.py index 2358bbd..8e73f88 100644 --- a/src/base_task_generation/generate_blueprints.py +++ b/src/base_stages/generate_blueprints.py @@ -5,12 +5,12 @@ from autogen_core.models import ChatCompletionClient -from src.base_task_generation.diverse_task_constants import ( +from src.base_stages.prompts import format_blueprint_prompt +from src.base_stages.task_constants import ( BLOOMS_TAXONOMY, DIFFICULTY_LEVELS, ) -from src.base_task_generation.diverse_task_dataclasses import Blueprint, Combination -from src.base_task_generation.diverse_task_prompts import format_blueprint_prompt +from src.base_stages.task_dataclasses import Blueprint, Combination from src.utils.model_client_utils import ModelCallMode, async_call_model diff --git a/src/generate_capabilities.py b/src/base_stages/generate_capabilities.py similarity index 62% rename from src/generate_capabilities.py rename to src/base_stages/generate_capabilities.py index d80be83..05dbcda 100644 --- a/src/generate_capabilities.py +++ b/src/base_stages/generate_capabilities.py @@ -7,71 +7,23 @@ import numpy as np from autogen_core.models import ChatCompletionClient +from src.base_stages.prompts import ( + CAPABILITY_GENERATION_SYSTEM_PROMPT, + CAPABILITY_GENERATION_USER_PROMPT, +) from src.schemas.area_schemas import Area from src.schemas.capability_schemas import Capability -from src.schemas.domain_schemas import Domain -from src.utils import prompts from 
src.utils.model_client_utils import ModelCallMode, async_call_model logger = logging.getLogger(__name__) -def generate_areas( - domain: Domain, - num_areas: int, - num_capabilities_per_area: int, - scientist_llm_client: ChatCompletionClient, -) -> List[Area]: - """Generate areas for the specified domain. - - Args: - domain: Domain object - num_areas: Number of areas to generate - num_capabilities_per_area: Number of capabilities per area - scientist_llm_client: LLM client for generation - - Returns - ------- - List of generated Area objects - """ - logger.info(f"Generating {num_areas} areas ...") - user_prompt = prompts.AREAS_GENERATION_USER_PROMPT.format( - num_areas=num_areas, - num_capabilities_per_area=num_capabilities_per_area, - domain=domain.name, - response_json_format=prompts.AREAS_GENERATION_RESPONSE_JSON_FORMAT, - ) - - response = asyncio.run( - async_call_model( - scientist_llm_client, - system_prompt="", - user_prompt=user_prompt, - mode=ModelCallMode.JSON_PARSE, - ) - ) - - areas = [] - for idx, area_name in enumerate(response.get("areas", [])): - area = Area( - name=area_name, - area_id=f"area_{idx:03d}", - domain=domain, - description="", - ) - areas.append(area) - - logger.info(f"Generated {len(areas)} areas") - - return areas - - def generate_capabilities( area: Area, num_capabilities: int, num_capabilities_per_run: int, - scientist_llm_client: ChatCompletionClient, + client: ChatCompletionClient, ) -> List[Capability]: """Generate capabilities for a given area. @@ -79,7 +31,7 @@ def generate_capabilities( area: Area object num_capabilities: Total number of capabilities to generate num_capabilities_per_run: Number of capabilities per LLM call - scientist_llm_client: LLM client for generation + client: ChatCompletionClient for API calls Returns ------- @@ -98,7 +50,7 @@ def generate_capabilities( run_capabilities = generate_capabilities_using_llm( area=area, num_capabilities=min(num_capabilities_per_run, num_capabilities_left), - scientist_llm_client=scientist_llm_client, + client=client, prev_capabilities=capabilities, ) capabilities.extend(run_capabilities) @@ -110,7 +62,7 @@ def generate_capabilities( def generate_capabilities_using_llm( area: Area, num_capabilities: int, - scientist_llm_client: ChatCompletionClient, + client: ChatCompletionClient, prev_capabilities: List[Capability], ) -> List[Capability]: """Generate capabilities using LLM. @@ -118,15 +70,15 @@ def generate_capabilities_using_llm( Args: area: Area object num_capabilities: Number of capabilities to generate - scientist_llm_client: LLM client for generation + client: ChatCompletionClient for API calls prev_capabilities: Previously generated capabilities Returns ------- List of generated Capability objects """ - sys_prompt = prompts.CAPABILITY_GENERATION_SYSTEM_PROMPT - user_prompt = prompts.HIERARCHICAL_CAPABILITY_GENERATION_USER_PROMPT.format( + sys_prompt = CAPABILITY_GENERATION_SYSTEM_PROMPT + user_prompt = CAPABILITY_GENERATION_USER_PROMPT.format( area=area.name, domain=area.domain.name, num_capabilities=num_capabilities, @@ -135,7 +87,7 @@ def generate_capabilities_using_llm( response = asyncio.run( async_call_model( - scientist_llm_client, + client, system_prompt=sys_prompt, user_prompt=user_prompt, mode=ModelCallMode.JSON_PARSE, @@ -164,8 +116,17 @@ def generate_capabilities_using_llm( if len(capabilities) != len(gen_capabilities_dict): logger.warning( - f"Only {len(capabilities)} capabilities were created out of {len(gen_capabilities_dict)} generated capabilities." 
+ f"Only {len(capabilities)} capabilities were created out of " + f"{len(gen_capabilities_dict)} generated capabilities." + ) + + # Truncate to requested number if LLM returned more + if len(capabilities) > num_capabilities: + logger.info( + f"LLM returned {len(capabilities)} capabilities, " + f"truncating to requested {num_capabilities}" ) + capabilities = capabilities[:num_capabilities] logger.info(f"Generated {len(capabilities)} capabilities.") diff --git a/src/base_task_generation/generate_diverse_tasks.py b/src/base_stages/generate_diverse_tasks.py similarity index 63% rename from src/base_task_generation/generate_diverse_tasks.py rename to src/base_stages/generate_diverse_tasks.py index ed5477b..e2e6dbe 100644 --- a/src/base_task_generation/generate_diverse_tasks.py +++ b/src/base_stages/generate_diverse_tasks.py @@ -5,12 +5,12 @@ from autogen_core.models import ChatCompletionClient -from src.base_task_generation.extract_subtopics import extract_subtopics -from src.base_task_generation.find_combinations import find_valid_combinations -from src.base_task_generation.generate_blueprints import generate_blueprints -from src.base_task_generation.generate_tasks import generate_tasks +from src.base_stages.extract_subtopics import extract_subtopics +from src.base_stages.find_combinations import find_valid_combinations +from src.base_stages.generate_blueprints import generate_blueprints +from src.base_stages.generate_tasks import generate_tasks from src.schemas.capability_schemas import Capability -from src.schemas.solution_schemas import TaskSolution +from src.schemas.task_schemas import Task logger = logging.getLogger(__name__) @@ -22,8 +22,12 @@ def generate_diverse_tasks_for_capability( client: ChatCompletionClient, min_subtopics: int = 3, max_subtopics: int = 8, -) -> List[TaskSolution]: - """Generate diverse tasks with solutions for a single capability. +) -> List[Task]: + """Generate diverse tasks for a single capability. + + This function generates Task objects (questions with 4 options). The + correct answer is NOT determined here — that happens in Stage 4 + (Solution Generation) where an LLM solves each task. 
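A rough sketch of the Stage 3 to Stage 4 hand-off described in this docstring: Stage 3 emits tasks without answers, and Stage 4 attaches a solution to each. The dataclass fields and the `solve_tasks` signature below are assumptions for illustration, not the schemas defined in `src/schemas`:

```python
# Rough sketch of the Stage 3 -> Stage 4 hand-off. Fields and signatures here
# are illustrative stand-ins, not the project's actual Task/TaskSolution schemas.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class Task:
    task_id: str
    task: str                          # question text plus four options, no answer
    generation_metadata: Dict = field(default_factory=dict)


@dataclass
class TaskSolution:
    task: Task
    correct_answer: str                # e.g. "B"; decided only in Stage 4


def solve_tasks(tasks: List[Task]) -> List[TaskSolution]:
    """Stage 4 stand-in: a real run would ask an LLM to solve each task."""
    return [TaskSolution(task=t, correct_answer="A") for t in tasks]


print(solve_tasks([Task(task_id="task_000", task="2 + 2 = ?\n\nA. 4\nB. 5\nC. 6\nD. 7")]))
```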
Args: capability: Capability object @@ -34,7 +38,7 @@ def generate_diverse_tasks_for_capability( Returns ------- - List of TaskSolution objects + List of Task objects (questions + options, no answers) """ logger.info(f"Generating diverse tasks for capability: {capability.name}") @@ -50,8 +54,8 @@ def generate_diverse_tasks_for_capability( blueprints = generate_blueprints(capability, combinations, client) logger.info(f"Generated {len(blueprints)} blueprints") - logger.info("Step 4: Generating tasks with solutions") - task_solutions = generate_tasks(capability, blueprints, client, tasks_per_blueprint) - logger.info(f"Generated {len(task_solutions)} task solutions") + logger.info("Step 4: Generating tasks") + tasks = generate_tasks(capability, blueprints, client, tasks_per_blueprint) + logger.info(f"Generated {len(tasks)} tasks") - return task_solutions + return tasks diff --git a/src/base_stages/generate_tasks.py b/src/base_stages/generate_tasks.py new file mode 100644 index 0000000..7610158 --- /dev/null +++ b/src/base_stages/generate_tasks.py @@ -0,0 +1,141 @@ +"""Generate multiple-choice questions for each blueprint.""" + +import asyncio +import logging +from typing import List + +from autogen_core.models import ChatCompletionClient + +from src.base_stages.prompts import ( + format_options_prompt, + format_question_prompt, +) +from src.base_stages.task_dataclasses import Blueprint +from src.schemas.task_schemas import Task +from src.utils.model_client_utils import ModelCallMode, async_call_model + + +logger = logging.getLogger(__name__) + + +def generate_tasks( + capability, + blueprints: list[Blueprint], + client: ChatCompletionClient, + tasks_per_blueprint: int = 3, +) -> List[Task]: + """Generate multiple-choice questions for each blueprint. + + This function generates Task objects using a two-step process: + 1. Generate the question text + 2. Generate 4 options for the question + + The correct answer is NOT determined here — that happens in Stage 4 + (Solution Generation) where an LLM solves each task. 
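Since `find_valid_combinations` now returns every valid combination rather than only the first, the number of generated tasks grows multiplicatively through the subtopic, combination, blueprint, and task steps above. A back-of-the-envelope sketch with assumed counts:

```python
# Back-of-the-envelope sketch of how the funnel above scales. All counts here
# are assumed for illustration; the real sizes come from extract_subtopics,
# DIFFICULTY_LEVELS, and BLOOMS_TAXONOMY.
num_subtopics = 5            # suggested range is 3-8 per capability
num_difficulties = 3         # assumed size of DIFFICULTY_LEVELS
num_reasoning_types = 6      # assumed size of BLOOMS_TAXONOMY

candidate_combinations = num_subtopics * num_difficulties * num_reasoning_types
valid_combinations = candidate_combinations // 3     # assume roughly a third survive
tasks_per_blueprint = 3

print(candidate_combinations)                        # 90 candidate combinations
print(valid_combinations * tasks_per_blueprint)      # 90 generated tasks
```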
+ + Args: + capability: Capability object + blueprints: List of Blueprint objects + client: ChatCompletionClient for API calls + tasks_per_blueprint: Number of tasks to generate per blueprint + + Returns + ------- + List of Task objects (questions + options, no answers) + """ + logger.info("Generating tasks from blueprints...") + + all_tasks = [] + + for blueprint in blueprints: + logger.info( + f"Generating {tasks_per_blueprint} tasks for blueprint " + f"{blueprint.combination_id}: {blueprint.subtopic} | " + f"{blueprint.difficulty} | {blueprint.reasoning}" + ) + + for _j in range(tasks_per_blueprint): + task_id = f"task_{len(all_tasks):03d}" + + try: + # Step 1: Generate the question + logger.debug(f" {task_id}: Generating question...") + question_system, question_user = format_question_prompt( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.area.domain.name, + capability_area=capability.area.name, + blueprint_description=blueprint.blueprint, + ) + + question_response = asyncio.run( + async_call_model( + client, + system_prompt=question_system, + user_prompt=question_user, + mode=ModelCallMode.JSON_PARSE, + ) + ) + + question_text = question_response["question"] + logger.debug(f" {task_id}: Question generated") + + # Step 2: Generate the options + logger.debug(f" {task_id}: Generating options...") + options_system, options_user = format_options_prompt( + capability_name=capability.name, + capability_description=capability.description, + capability_domain=capability.area.domain.name, + capability_area=capability.area.name, + question=question_text, + ) + + options_response = asyncio.run( + async_call_model( + client, + system_prompt=options_system, + user_prompt=options_user, + mode=ModelCallMode.JSON_PARSE, + ) + ) + + options = options_response["options"] + logger.debug(f" {task_id}: Options generated") + + # Combine question and options into task text + task_text = f"{question_text}\n\n" + for choice_key, choice_text in options.items(): + task_text += f"{choice_key}. {choice_text}\n" + + # Store generation metadata + generation_metadata = { + "method": "diverse_task_generation", + "blueprint_id": blueprint.combination_id, + "blueprint": blueprint.blueprint, + "subtopic": blueprint.subtopic, + "difficulty": blueprint.difficulty, + "reasoning": blueprint.reasoning, + } + + task = Task( + task_id=task_id, + task=task_text, + capability=capability, + generation_metadata=generation_metadata, + ) + all_tasks.append(task) + + except Exception as e: + logger.error(f" Failed to generate {task_id}: {e}") + continue + + tasks_for_blueprint = [ + t + for t in all_tasks + if t.generation_metadata.get("blueprint_id") == blueprint.combination_id + ] + logger.info(f" Generated {len(tasks_for_blueprint)} tasks for this blueprint") + + logger.info(f"Generated {len(all_tasks)} total tasks") + + return all_tasks diff --git a/src/base_task_generation/diverse_task_prompts.py b/src/base_stages/prompts.py similarity index 64% rename from src/base_task_generation/diverse_task_prompts.py rename to src/base_stages/prompts.py index 0d60a23..57c954d 100644 --- a/src/base_task_generation/diverse_task_prompts.py +++ b/src/base_stages/prompts.py @@ -1,12 +1,91 @@ """ -Prompts for the diverse task generation pipeline. +Prompts for the base pipeline stages. -Edit these prompts to customize the task generation behavior. -The main script can import these instead of using hardcoded prompts. 
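The two-step flow above first asks the model for a question, then for four options, and only afterwards assembles the final task text. A minimal, runnable version of that assembly step, using made-up sample data:

```python
# Minimal version of the question + options assembly used in generate_tasks
# above. The question and options here are made-up sample data.
question_text = "Which matrix is the adjacency matrix of a single undirected edge between nodes 1 and 2?"
options = {
    "A": "[[0, 1], [1, 0]]",
    "B": "[[1, 0], [0, 1]]",
    "C": "[[1, 1], [1, 1]]",
    "D": "[[0, 0], [0, 0]]",
}

task_text = f"{question_text}\n\n"
for choice_key, choice_text in options.items():
    task_text += f"{choice_key}. {choice_text}\n"

print(task_text)
```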
+This module contains all prompts used by the base (non-agentic) pipeline:
+- Stage 1: Area generation
+- Stage 2: Capability generation
+- Stage 3: Task generation (sub-topics, combinations, blueprints, questions, options)
+- Stage 4: Solution generation
+- Stage 5: Task validation
+
+Edit these prompts to customize generation behavior.
+"""
+
+# =============================================================================
+# AREA GENERATION (Stage 1)
+# =============================================================================
+
+AREAS_GENERATION_USER_PROMPT = """
+You are an expert in designing capabilities to assess the abilities of foundation models.
+For the domain of {domain}, identify {num_areas} high-level, broad, diverse, and non-overlapping areas for capability generation.
+Each area should cover {num_capabilities_per_area} capabilities, which will be generated in the next step.
+Aim for each area to cover a broad subdomain or skill cluster within the domain.
+
+Respond in the following JSON format:
+
+{response_json_format}
+"""
+
+AREAS_GENERATION_RESPONSE_JSON_FORMAT = """
+{{
+    "areas": [
+        <name of area 1>,
+        <name of area 2>,
+        ...
+    ]
+}}"""
+
+
+# =============================================================================
+# CAPABILITY GENERATION (Stage 2)
+# =============================================================================
+
+CAPABILITY_GENERATION_SYSTEM_PROMPT = """
+You are an expert in designing capabilities to assess the abilities of foundation models.
+Your goal is to create novel, diverse capabilities that can reveal the breadth and depth of a foundation model's skills within the specified domain.
+You will be particularly rewarded for a comprehensive design of capabilities.
+Valid capabilities will be added to a capability archive.
+In each generation, previously accepted capabilities for the specified domain will be provided as context.
+
+Respond precisely in the following JSON format:
+
+{
+    "thought": <your reasoning about the kind of capabilities to propose>,
+    "capabilities": [
+        {
+            "name": <name of the capability>,
+            "description": <description of the capability>
+        },
+        ...
+    ]
+}
+
+In "thought", briefly think and reason about what kind of capabilities you want to propose.
+In "capabilities", provide an array of new capability objects with the following fields:
+- "name": A concise, descriptive label (lowercase, underscores for spaces, e.g., "personalized_budget_planning").
+- "description": A clear and detailed explanation of what the capability entails, including the skills and knowledge required (e.g., "Ability to generate a realistic monthly budget tailored to an individual's income, fixed and variable expenses, and financial goals. Requires understanding spending categories, prioritization, and basic cash flow allocation.").
+
+Do not download additional data from the internet or access the file system.
+
+Be creative and design capabilities that can distinguish between different levels of expertise, but ensure that the capability remains relevant to the domain.
+Also ensure that the proposed capabilities ARE DISTINCT compared to the existing capabilities.
+Names of all existing capabilities will be provided.
+
+Your response will be automatically parsed so ensure it adheres to the specified format.
+"""
+
+CAPABILITY_GENERATION_USER_PROMPT = """
+The names of all existing capabilities are provided below.
+
+Existing capability names:
+{prev_capabilities}
+
+Generate {num_capabilities} new capabilities for the "{area}" area within the {domain} domain that do not overlap with the existing capabilities.
""" + # ============================================================================= -# SUB-TOPIC EXTRACTION +# SUB-TOPIC EXTRACTION (Stage 3 - Step 1) # ============================================================================= SUBTOPIC_SYSTEM_PROMPT = """ @@ -183,38 +262,31 @@ # ============================================================================= -# TASK GENERATION +# QUESTION GENERATION (Stage 3 - Step 1) # ============================================================================= -TASK_SYSTEM_PROMPT = """ -You are an expert educational scientist responsible for generating high-quality multiple-choice tasks. +QUESTION_SYSTEM_PROMPT = """ +You are an expert educational scientist responsible for generating high-quality assessment questions. A domain is a broad subject area (e.g., Mathematics), an area is a specialized field within that domain (e.g., Linear Algebra), a capability is a specific concept or topic within that area (e.g., representing graphs using matrices), and a sub-topic is a concrete skill of that capability that can be assessed (e.g., constructing an adjacency matrix for a given graph). -Given a task blueprint that describes what the question should assess, difficulty level and reasoning type based on bloom's taxonomy, your goal is to write a complete multiple-choice question that: +Given a task blueprint that describes what the question should assess, your goal is to write a clear, well-formed question that: 1. Accurately reflects the blueprint and capability description. -2. Includes exactly four answer options. +2. Is suitable for a multiple-choice format (will have options generated separately). -3. Has ONLY one correct answer. +3. Uses clear and unambiguous wording. -4. Uses clear and unambiguous wording. +4. Has a single, objectively correct answer. -5. Ensures that incorrect options (distractors) are plausible but clearly wrong when the concept is understood correctly. +IMPORTANT: Generate ONLY the question text. Do NOT include any answer options. Respond precisely in the following format, including the JSON start and end markers: RESPONSE JSON: {{{{ - "question": "", - "options": {{{{ - "A": "