# DataEvolve/config.py (GAIR-NLP/DataEvolve, main branch)
"""
Configuration for data processing pipeline.
"""
import os
from pathlib import Path


class Config:
    """Per-category paths and tuning constants for the data processing pipeline."""

    def __init__(self, level: str = "high", domain: str = "various", content: str = "academic"):
        """
        Build the configuration for one data category.

        All paths are derived from the project root. The root is read from the
        ``PIPELINE_PROJECT_ROOT`` environment variable when that is set to a
        non-empty value; otherwise it is the grandparent directory of this file.
        ``PIPELINE_SHARED_DIR`` overrides the shared directory in the same way.

        Args:
            level: Data level (e.g., "high", "medium", "low")
            domain: Data domain (e.g., "computer_science", "mathematics")
            content: Content type (e.g., "academic", "general")
        """
        # Labels that identify the data category
        self.LEVEL, self.DOMAIN, self.CONTENT = level, domain, content

        # Project root: a non-empty environment override wins, else two levels above this file
        root_override = os.getenv("PIPELINE_PROJECT_ROOT")
        self.PROJECT_ROOT = Path(root_override) if root_override else Path(__file__).parent.parent

        self.DATA_ROOT = self.PROJECT_ROOT / "data"

        # Category directory, named after the three labels
        self.data_category_name = f"level={level}--domain={domain}--content={content}"
        category_dir = self.DATA_ROOT / self.data_category_name
        self.data_category_path = category_dir

        # Data directories inside the category directory
        self.sample_data_dir = category_dir / "sample_data"
        self.raw_data_dir = category_dir / "raw_data"
        self.sample_cleaned_data_dir = category_dir / "sample_cleaned_data"

        # JSONL databases stored next to the data directories
        self.experience_database_path = category_dir / "experience_database.jsonl"
        self.prompt_database_path = category_dir / "prompt.jsonl"

        # Observer settings
        self.OBSERVER_ITERATIONS = 10  # m: number of iterations
        self.OBSERVER_BATCH_SIZE = 10  # b: batch size per iteration

        # Quality Judge settings
        self.QUALITY_JUDGE_ITERATIONS = 5  # n: number of iterations
        self.QUALITY_JUDGE_BATCH_SIZE = 10  # m: batch size per iteration

        # Upper bound on prompt design attempts
        self.MAX_PROMPT_DESIGN_ATTEMPTS = 30

        # Cleaning scripts live under <project root>/pipeline/clean
        pipeline_dir = self.PROJECT_ROOT / "pipeline"
        self.SCRIPT_DIR = pipeline_dir / "clean"
        self.SAMPLE_CLEAN_SCRIPT = self.SCRIPT_DIR / "sample_clean.sh"

        # Shared directory is kept as a plain string (not a Path); the
        # environment value is used verbatim when it is non-empty.
        self.SHARED_DIR = os.getenv("PIPELINE_SHARED_DIR") or str(pipeline_dir / "shared")

        # Category-specific prompt module inside the script directory
        self.PROMPT_DIR = self.SCRIPT_DIR / "prompt"
        self.category_prompt_name = f"prompt_level={level}--domain={domain}--content={content}.py"
        self.category_prompt_file = self.PROMPT_DIR / self.category_prompt_name

    def get_category_prompt_file(self) -> Path:
        """
        Return the prompt file path computed for this category.

        Returns:
            The value of ``category_prompt_file`` set during initialization
        """
        prompt_file = self.category_prompt_file
        return prompt_file

    def __repr__(self):
        # Show only the three category labels; derived paths are omitted.
        level, domain, content = self.LEVEL, self.DOMAIN, self.CONTENT
        return f"Config(level={level}, domain={domain}, content={content})"