Skip to content

Commit 2966eef

Browse files
strickvl and claude committed
Refactor helper_functions.py into domain-specific utility modules
- Move LLM output processing functions to llm_utils.py (remove_reasoning_from_output, clean_json_tags, clean_markdown_tags, safe_json_loads)
- Move HTML extraction to css_utils.py (extract_html_from_content)
- Create new config_utils.py for configuration management (load_pipeline_config, check_required_env_vars)
- Update all imports across the codebase
- Delete the now-empty helper_functions.py

This improves code organization by grouping related functionality together.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent 43cc512 commit 2966eef

File tree

7 files changed

+198
-208
lines changed

7 files changed

+198
-208
lines changed

deep_research/run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pipelines.parallel_research_pipeline import (
88
parallelized_deep_research_pipeline,
99
)
10-
from utils.helper_functions import check_required_env_vars
10+
from utils.config_utils import check_required_env_vars
1111

1212
logger = logging.getLogger(__name__)
1313

deep_research/steps/cross_viewpoint_step.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@
44
from typing import Annotated, List
55

66
from materializers.analysis_data_materializer import AnalysisDataMaterializer
7-
from utils.helper_functions import (
8-
safe_json_loads,
9-
)
10-
from utils.llm_utils import run_llm_completion
7+
from utils.llm_utils import run_llm_completion, safe_json_loads
118
from utils.pydantic_models import (
129
AnalysisData,
1310
Prompt,

deep_research/steps/pydantic_final_report_step.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,8 @@
1212
from typing import Annotated, Tuple
1313

1414
from materializers.final_report_materializer import FinalReportMaterializer
15-
from utils.css_utils import get_shared_css_tag
16-
from utils.helper_functions import (
17-
extract_html_from_content,
18-
remove_reasoning_from_output,
19-
)
20-
from utils.llm_utils import run_llm_completion
15+
from utils.css_utils import extract_html_from_content, get_shared_css_tag
16+
from utils.llm_utils import remove_reasoning_from_output, run_llm_completion
2117
from utils.prompts import (
2218
STATIC_HTML_TEMPLATE,
2319
SUB_QUESTION_TEMPLATE,
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""Configuration and environment utilities for the Deep Research Agent."""
2+
3+
import logging
4+
import os
5+
from typing import Any, Dict
6+
7+
import yaml
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
def _default_pipeline_config() -> Dict[str, Any]:
    """Return a fresh copy of the minimal fallback pipeline configuration.

    Built anew on every call so callers that mutate the returned dict cannot
    corrupt the defaults seen by later calls.
    """
    return {
        "pipeline": {
            "name": "deep_research_pipeline",
            "enable_cache": True,
        },
        "environment": {
            "docker": {
                "requirements": [
                    "openai>=1.0.0",
                    "tavily-python>=0.2.8",
                    "PyYAML>=6.0",
                    "click>=8.0.0",
                    "pydantic>=2.0.0",
                    "typing_extensions>=4.0.0",
                ]
            }
        },
        "resources": {"cpu": 1, "memory": "4Gi"},
        "timeout": 3600,
    }


def load_pipeline_config(config_path: str) -> Dict[str, Any]:
    """Load pipeline configuration from YAML file.

    This is used only for pipeline-level configuration, not for step parameters.
    Step parameters should be defined directly in the step functions.

    Args:
        config_path: Path to the configuration YAML file. A relative path is
            resolved against the package root (one directory above this
            module), not the current working directory.

    Returns:
        Pipeline configuration dictionary. If the file cannot be read or
        parsed, or does not contain a YAML mapping, a minimal default
        configuration is returned instead of raising.
    """
    # Get absolute path if relative
    if not os.path.isabs(config_path):
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        config_path = os.path.join(base_dir, config_path)

    # Load YAML configuration
    try:
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)
    except Exception as e:
        # Deliberate best-effort: a missing/broken config file falls back to
        # the built-in defaults rather than aborting the pipeline.
        logging.getLogger(__name__).error(
            f"Error loading pipeline configuration: {e}"
        )
        return _default_pipeline_config()

    # yaml.safe_load returns None for an empty file and a scalar for
    # non-mapping documents; the contract promises a dict, so fall back.
    if not isinstance(config, dict):
        logging.getLogger(__name__).error(
            f"Pipeline configuration at {config_path} is not a mapping; "
            "using default configuration"
        )
        return _default_pipeline_config()

    return config
57+
58+
59+
def check_required_env_vars(env_vars: list[str]) -> list[str]:
    """Check if required environment variables are set.

    Args:
        env_vars: List of environment variable names to check

    Returns:
        List of missing environment variables, in input order. A variable
        set to an empty string counts as missing.
    """
    # os.environ.get returns None for unset names; the falsy test also
    # rejects empty strings, since a blank credential is never usable.
    return [name for name in env_vars if not os.environ.get(name)]

deep_research/utils/css_utils.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""CSS utility functions for consistent styling across materializers."""
22

3+
import json
34
import os
45
from typing import Optional
56

@@ -219,3 +220,48 @@ def create_notice(content: str, notice_type: str = "info") -> str:
219220
{content}
220221
</div>
221222
"""
223+
224+
225+
def extract_html_from_content(content: str) -> str:
    """Attempt to extract HTML content from a response that might be wrapped in other formats.

    Tries, in order: a full <html>...</html> document, a
    <div class="research-report"> fragment, a fenced ```html code block,
    and a JSON object with an "html" field.

    Args:
        content: The content to extract HTML from

    Returns:
        The extracted HTML, or the original content unchanged if no
        wrapper format is recognized (empty string for empty input)
    """
    if not content:
        return ""

    # Full HTML document: slice from the opening tag through </html>.
    if "<html" in content and "</html>" in content:
        start = content.find("<html")
        end = content.find("</html>") + len("</html>")  # Include the closing tag
        return content[start:end]

    # Report fragment: slice from the report div through the LAST closing
    # </div>, which is assumed to close the report wrapper.
    report_marker = '<div class="research-report"'
    if report_marker in content:
        start = content.find(report_marker)
        last_div = content.rfind("</div>")  # -1 when absent; guard below handles it
        if last_div > start:
            return content[start : last_div + len("</div>")]  # Include the closing tag

    # Fenced code block: take everything between ```html and the next fence.
    fence = "```html"
    if fence in content:
        start = content.find(fence) + len(fence)
        end = content.find("```", start)
        if end > start:
            return content[start:end].strip()

    # JSON wrapper: a {"html": "..."} object. Narrow except so unrelated
    # failures (and KeyboardInterrupt/SystemExit) are not swallowed.
    try:
        parsed = json.loads(content)
        if isinstance(parsed, dict) and "html" in parsed:
            return parsed["html"]
    except (json.JSONDecodeError, TypeError):
        pass

    # If all extraction attempts fail, return the original content
    return content

deep_research/utils/helper_functions.py

Lines changed: 0 additions & 192 deletions
This file was deleted.

0 commit comments

Comments
 (0)