-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathconfig.py
More file actions
85 lines (67 loc) · 3.2 KB
/
config.py
File metadata and controls
85 lines (67 loc) · 3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
Configuration for data processing pipeline.
"""
import os
from pathlib import Path
class Config:
    """Paths and tuning constants for one data category of the pipeline.

    A data category is identified by three labels (``level``, ``domain``,
    ``content``). The labels are joined into a directory name under
    ``<PROJECT_ROOT>/data`` and into the file name of a category-specific
    prompt module.

    Environment variables read at construction time:
        PIPELINE_PROJECT_ROOT: Project root to use when set to a non-empty
            value. Otherwise the root is the parent of the directory that
            contains this module.
        PIPELINE_SHARED_DIR: Shared directory to use (kept verbatim as a
            string) when set to a non-empty value. Otherwise
            ``<PROJECT_ROOT>/pipeline/shared`` converted to ``str``.
    """

    def __init__(self, level: str = "high", domain: str = "various", content: str = "academic"):
        """Build every path and setting for the given category labels.

        Args:
            level: Data level (e.g., "high", "medium", "low").
            domain: Data domain (e.g., "computer_science", "mathematics").
            content: Content type (e.g., "academic", "general").
        """
        # Category labels, stored as given.
        self.LEVEL = level
        self.DOMAIN = domain
        self.CONTENT = content

        # Project root: an empty or unset environment variable falls back to
        # a location derived from this module's own path.
        root_override = os.getenv("PIPELINE_PROJECT_ROOT")
        self.PROJECT_ROOT = (
            Path(root_override) if root_override else Path(__file__).parent.parent
        )
        self.DATA_ROOT = self.PROJECT_ROOT / "data"

        # One directory per category, named after the three labels.
        category_label = f"level={level}--domain={domain}--content={content}"
        self.data_category_name = category_label
        category_dir = self.DATA_ROOT / category_label
        self.data_category_path = category_dir

        # Data directories inside the category directory.
        self.sample_data_dir = category_dir / "sample_data"
        self.raw_data_dir = category_dir / "raw_data"
        self.sample_cleaned_data_dir = category_dir / "sample_cleaned_data"

        # JSONL database files inside the category directory.
        self.experience_database_path = category_dir / "experience_database.jsonl"
        self.prompt_database_path = category_dir / "prompt.jsonl"

        # Observer settings.
        # NOTE(review): the original notes label these "m" and "b", and reuse
        # "m" for the quality-judge batch size below; confirm the intended
        # symbols against whatever document defines them.
        self.OBSERVER_ITERATIONS = 10  # m: number of iterations
        self.OBSERVER_BATCH_SIZE = 10  # b: batch size per iteration

        # Quality Judge settings.
        self.QUALITY_JUDGE_ITERATIONS = 5  # n: number of iterations
        self.QUALITY_JUDGE_BATCH_SIZE = 10  # m: batch size per iteration

        # Upper bound on prompt design attempts.
        self.MAX_PROMPT_DESIGN_ATTEMPTS = 30

        # Cleaning scripts live under <PROJECT_ROOT>/pipeline/clean.
        scripts = self.PROJECT_ROOT / "pipeline" / "clean"
        self.SCRIPT_DIR = scripts
        self.SAMPLE_CLEAN_SCRIPT = scripts / "sample_clean.sh"

        # Shared directory is kept as a plain string (unlike the Path
        # attributes above), whether it comes from the environment or not.
        self.SHARED_DIR = os.getenv("PIPELINE_SHARED_DIR") or str(
            self.PROJECT_ROOT / "pipeline" / "shared"
        )

        # Category-specific prompt module under <SCRIPT_DIR>/prompt.
        self.PROMPT_DIR = scripts / "prompt"
        self.category_prompt_name = f"prompt_{category_label}.py"
        self.category_prompt_file = self.PROMPT_DIR / self.category_prompt_name

    def get_category_prompt_file(self) -> Path:
        """Return the path of the prompt file for this category.

        Returns:
            The ``category_prompt_file`` path computed in ``__init__``.
        """
        return self.category_prompt_file

    def __repr__(self):
        """Return a short description listing the three category labels."""
        return f"Config(level={self.LEVEL}, domain={self.DOMAIN}, content={self.CONTENT})"