Skip to content

Commit 2966eef

Browse files
strickvl and claude committed
Refactor helper_functions.py into domain-specific utility modules
- Move LLM output processing functions to llm_utils.py (remove_reasoning_from_output, clean_json_tags, clean_markdown_tags, safe_json_loads)
- Move HTML extraction to css_utils.py (extract_html_from_content)
- Create new config_utils.py for configuration management (load_pipeline_config, check_required_env_vars)
- Update all imports across the codebase
- Delete the now-empty helper_functions.py

This improves code organization by grouping related functionality together.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent 43cc512 commit 2966eef

File tree

7 files changed

+198
-208
lines changed

7 files changed

+198
-208
lines changed

deep_research/run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pipelines.parallel_research_pipeline import (
88
parallelized_deep_research_pipeline,
99
)
10-
from utils.helper_functions import check_required_env_vars
10+
from utils.config_utils import check_required_env_vars
1111

1212
logger = logging.getLogger(__name__)
1313

deep_research/steps/cross_viewpoint_step.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@
44
from typing import Annotated, List
55

66
from materializers.analysis_data_materializer import AnalysisDataMaterializer
7-
from utils.helper_functions import (
8-
safe_json_loads,
9-
)
10-
from utils.llm_utils import run_llm_completion
7+
from utils.llm_utils import run_llm_completion, safe_json_loads
118
from utils.pydantic_models import (
129
AnalysisData,
1310
Prompt,

deep_research/steps/pydantic_final_report_step.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,8 @@
1212
from typing import Annotated, Tuple
1313

1414
from materializers.final_report_materializer import FinalReportMaterializer
15-
from utils.css_utils import get_shared_css_tag
16-
from utils.helper_functions import (
17-
extract_html_from_content,
18-
remove_reasoning_from_output,
19-
)
20-
from utils.llm_utils import run_llm_completion
15+
from utils.css_utils import extract_html_from_content, get_shared_css_tag
16+
from utils.llm_utils import remove_reasoning_from_output, run_llm_completion
2117
from utils.prompts import (
2218
STATIC_HTML_TEMPLATE,
2319
SUB_QUESTION_TEMPLATE,
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""Configuration and environment utilities for the Deep Research Agent."""
2+
3+
import logging
4+
import os
5+
from typing import Any, Dict
6+
7+
import yaml
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
def _default_pipeline_config() -> Dict[str, Any]:
    """Return a fresh copy of the minimal fallback pipeline configuration.

    Built anew on every call so callers that mutate the returned dict cannot
    corrupt the defaults seen by later calls.
    """
    return {
        "pipeline": {
            "name": "deep_research_pipeline",
            "enable_cache": True,
        },
        "environment": {
            "docker": {
                "requirements": [
                    "openai>=1.0.0",
                    "tavily-python>=0.2.8",
                    "PyYAML>=6.0",
                    "click>=8.0.0",
                    "pydantic>=2.0.0",
                    "typing_extensions>=4.0.0",
                ]
            }
        },
        "resources": {"cpu": 1, "memory": "4Gi"},
        "timeout": 3600,
    }


def load_pipeline_config(config_path: str) -> Dict[str, Any]:
    """Load pipeline configuration from YAML file.

    This is used only for pipeline-level configuration, not for step parameters.
    Step parameters should be defined directly in the step functions.

    Args:
        config_path: Path to the configuration YAML file. A relative path is
            resolved against the package root (one directory above this
            module), not the current working directory.

    Returns:
        Pipeline configuration dictionary. If the file cannot be read or
        parsed, or does not contain a YAML mapping, a minimal default
        configuration is returned instead of raising.
    """
    # Get absolute path if relative
    if not os.path.isabs(config_path):
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        config_path = os.path.join(base_dir, config_path)

    # Load YAML configuration
    try:
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)
    except Exception as e:
        # Deliberate best-effort: a missing/broken config file falls back to
        # the built-in defaults rather than aborting the pipeline.
        logging.getLogger(__name__).error(
            f"Error loading pipeline configuration: {e}"
        )
        return _default_pipeline_config()

    # yaml.safe_load returns None for an empty file and a scalar for
    # non-mapping documents; the contract promises a dict, so fall back.
    if not isinstance(config, dict):
        logging.getLogger(__name__).error(
            f"Pipeline configuration at {config_path} is not a mapping; "
            "using default configuration"
        )
        return _default_pipeline_config()

    return config
57+
58+
59+
def check_required_env_vars(env_vars: list[str]) -> list[str]:
    """Check if required environment variables are set.

    Args:
        env_vars: List of environment variable names to check

    Returns:
        List of missing environment variables, in input order. A variable
        set to an empty string counts as missing.
    """
    # os.environ.get returns None for unset names; the falsy test also
    # rejects empty strings, since a blank credential is never usable.
    return [name for name in env_vars if not os.environ.get(name)]

deep_research/utils/css_utils.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""CSS utility functions for consistent styling across materializers."""
22

3+
import json
34
import os
45
from typing import Optional
56

@@ -219,3 +220,48 @@ def create_notice(content: str, notice_type: str = "info") -> str:
219220
{content}
220221
</div>
221222
"""
223+
224+
225+
def extract_html_from_content(content: str) -> str:
    """Attempt to extract HTML content from a response that might be wrapped in other formats.

    Tries, in order: a full <html>...</html> document, a
    <div class="research-report"> fragment, a fenced ```html code block,
    and a JSON object with an "html" field.

    Args:
        content: The content to extract HTML from

    Returns:
        The extracted HTML, or the original content unchanged if no
        wrapper format is recognized (empty string for empty input)
    """
    if not content:
        return ""

    # Full HTML document: slice from the opening tag through </html>.
    if "<html" in content and "</html>" in content:
        start = content.find("<html")
        end = content.find("</html>") + len("</html>")  # Include the closing tag
        return content[start:end]

    # Report fragment: slice from the report div through the LAST closing
    # </div>, which is assumed to close the report wrapper.
    report_marker = '<div class="research-report"'
    if report_marker in content:
        start = content.find(report_marker)
        last_div = content.rfind("</div>")  # -1 when absent; guard below handles it
        if last_div > start:
            return content[start : last_div + len("</div>")]  # Include the closing tag

    # Fenced code block: take everything between ```html and the next fence.
    fence = "```html"
    if fence in content:
        start = content.find(fence) + len(fence)
        end = content.find("```", start)
        if end > start:
            return content[start:end].strip()

    # JSON wrapper: a {"html": "..."} object. Narrow except so unrelated
    # failures (and KeyboardInterrupt/SystemExit) are not swallowed.
    try:
        parsed = json.loads(content)
        if isinstance(parsed, dict) and "html" in parsed:
            return parsed["html"]
    except (json.JSONDecodeError, TypeError):
        pass

    # If all extraction attempts fail, return the original content
    return content

deep_research/utils/helper_functions.py

Lines changed: 0 additions & 192 deletions
This file was deleted.

0 commit comments

Comments
 (0)