Zeeeepa · codegen-sh · May 1, 2025 · May 1, 2025 · May 1, 2025 · codecov-ai
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -19,6 +19,9 @@ jobs:
           require: write
           username: ${{ github.triggering_actor }}
           error-if-missing: true
+          # Allow the codegen-sh bot to bypass permission check
+          allow-bot: true
+          bot-list: 'codegen-sh[bot]'
-          require: write
-          username: ${{ github.triggering_actor }}
-          error-if-missing: true
-          # Allow the codegen-sh bot to bypass permission check
-          allow-bot: true
-          bot-list: 'codegen-sh[bot]'
+      allow-bot: true
+      bot-list: 'codegen-sh[bot]'
+      # Add security constraints
+      allowed-actions: ['pull_request']
+      required-checks: ['unit-tests']
+
-          require: write
-          username: ${{ github.triggering_actor }}
-          error-if-missing: true
-          # Allow the codegen-sh bot to bypass permission check
-          allow-bot: true
-          bot-list: 'codegen-sh[bot]'
+# Add explicit permission restrictions and audit logging
+      - uses: your-permission-check-action@v1
+        with:
+          require: write
+          username: ${{ github.triggering_actor }}
+          error-if-missing: true
+          allow-bot: true
+          bot-list: 'codegen-sh[bot]'
+          audit-log: true
+          allowed-operations: ['push', 'pull_request']
+          max-files: 100
-          require: write
-          username: ${{ github.triggering_actor }}
-          error-if-missing: true
-          # Allow the codegen-sh bot to bypass permission check
-          allow-bot: true
-          bot-list: 'codegen-sh[bot]'
+      allow-bot: true
+      bot-list: 'codegen-sh[bot]'
+      # Add security constraints
+      allowed-actions: ['pull_request']
+      required-checks: ['unit-tests']
+
-          require: write
-          username: ${{ github.triggering_actor }}
-          error-if-missing: true
-          # Allow the codegen-sh bot to bypass permission check
-          allow-bot: true
-          bot-list: 'codegen-sh[bot]'
+# Add explicit permission restrictions and audit logging
+      - uses: your-permission-check-action@v1
+        with:
+          require: write
+          username: ${{ github.triggering_actor }}
+          error-if-missing: true
+          allow-bot: true
+          bot-list: 'codegen-sh[bot]'
+          audit-log: true
+          allowed-operations: ['push', 'pull_request']
+          max-files: 100
 
   unit-tests:
     needs: access-check

diff --git a/codegen-on-oss/README.md b/codegen-on-oss/README.md
@@ -6,6 +6,9 @@ The **Codegen on OSS** package provides a modular pipeline that:
 - **Parses repositories** using the codegen tool.
 - **Profiles performance** and logs metrics for each parsing run.
 - **Logs errors** to help pinpoint parsing failures or performance bottlenecks.
+- **Analyzes codebases** with comprehensive metrics and context tracking.
+- **Saves and restores codebase state** for later use.
+- **Provides a REST API** for accessing all functionality.
 
 ______________________________________________________________________
 
@@ -335,3 +338,121 @@ codegen_on_oss.parser.ParseRunError: LOW_IMPORT_RESOLUTION_RATE
 | Lightning-AI/lightning | codebase_init        | 24.256577352999557 | 24.256577352999557 | 211.3604081 | 1535971328   | 966184960    |                            |
 | Lightning-AI/lightning | post_init_validation | 0.137609629000508  | 24.394186982000065 | 211.5082702 | 1536241664   | 270336       |                            |
 | Lightning-AI/lightning | TOTAL                | 24.394700584999555 | 24.394700584999555 | 211.5088282 | 1536241664   | 0            |                            |
+
+## New Features
+
+### Codebase Analysis and Context Management
+
+The package now includes two classes for codebase analysis and context management:
+
+#### CodebaseAnalysisHarness
+
+The `CodebaseAnalysisHarness` class in the `analysis` module provides:
+
+- Comprehensive codebase analysis
+- File structure tracking
+- Diff generation and file tracking
+- Integration with the core functionality of the swebench `harness.py` module
+
+```python
+from codegen_on_oss.analysis.harness_integration import CodebaseAnalysisHarness
+
+# Create a harness from a repository
+harness = CodebaseAnalysisHarness.from_repo("owner/repo")
+
+# Analyze the codebase
+results = harness.analyze_codebase()
+
+# Get a diff against a specific commit
+diff = harness.diff_versus_commit("abc123")
+
+# Extract modified files from a patch
+files = harness.files_in_patch(diff)
+```
+
+#### CodebaseContextSnapshot
+
+The `CodebaseContextSnapshot` class in the `snapshot` module allows:
+
+- Saving and restoring codebase state
+- Integration with S3-compatible storage via BucketStore
+- Preserving analysis results and context
+
+```python
+from codegen_on_oss.snapshot.context_snapshot import CodebaseContextSnapshot
+from codegen_on_oss.bucket_store import BucketStore
+
+# Create a bucket store for S3 integration
+bucket_store = BucketStore(
+    bucket_name="my-bucket",
+    endpoint_url="https://s3.amazonaws.com",
+)
+
+# Create a snapshot from a harness
+snapshot = CodebaseContextSnapshot(harness, bucket_store)
+snapshot_id = snapshot.create_snapshot()
+
+# Load a snapshot later
+loaded_snapshot = CodebaseContextSnapshot.load_snapshot(
+    snapshot_id,
+    bucket_store=bucket_store,
+)
+```
+
+### Code Context Retrieval Server
+
+The package now includes a FastAPI server that provides endpoints for analysis, context management, and agent execution:
+
+```bash
+# Start the server
+cgparse serve --host 0.0.0.0 --port 8000
+```
+
+The server provides the following endpoints:
+
+- `/analyze` - Analyze a codebase and return the results
+- `/snapshot/create` - Create a snapshot of a codebase
+- `/snapshot/list` - List available snapshots
+- `/snapshot/load/{snapshot_id}` - Load a snapshot by ID
+- `/agent/execute` - Execute an agent with the given context
+
+Example API usage:
+
+```python
+import requests
+
+# Analyze a codebase
+response = requests.post(
+    "http://localhost:8000/analyze",
+    json={
+        "repository": {
+            "repo_full_name": "owner/repo",
+            "language": "python",
+        },
+    },
+)
+results = response.json()
+
+# Create a snapshot
+response = requests.post(
+    "http://localhost:8000/snapshot/create",
+    json={
+        "repository": {
+            "repo_full_name": "owner/repo",
+            "language": "python",
+        },
+        "tags": ["production", "v1.0"],
+    },
+)
+snapshot_id = response.json()["snapshot_id"]
+
+# Execute an agent with context
+response = requests.post(
+    "http://localhost:8000/agent/execute",
+    json={
+        "snapshot_id": snapshot_id,
+        "prompt": "Fix the bug in the login component",
+    },
+)
+agent_results = response.json()
+```
diff --git a/codegen-on-oss/codegen_on_oss/analysis/harness_integration.py b/codegen-on-oss/codegen_on_oss/analysis/harness_integration.py
@@ -0,0 +1,164 @@
+"""
+CodebaseAnalysisHarness - Integration of the harness.py functionality from swebench.
+
+This module provides comprehensive codebase analysis capabilities by integrating
+the core functionality from the swebench harness.py module.
+"""
+
+import json
+import subprocess
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Union
-import json
-import subprocess
-from pathlib import Path
-from typing import Dict, List, Optional, Set, Union
+import json
+from pathlib import Path
+from typing import Optional, Union
-import json
-import subprocess
-from pathlib import Path
-from typing import Dict, List, Optional, Set, Union
+import json
+from pathlib import Path
+from typing import Optional, Union
+
+from loguru import logger
+
+from codegen import Codebase
+from codegen.configs.models.codebase import CodebaseConfig
+
+
+class CodebaseAnalysisHarness:
+    """
+    A harness for comprehensive codebase analysis, integrating functionality
+    from the swebench harness.py module.
+    """
+
+    def __init__(
+        self,
+        codebase: Codebase,
+        metadata: Optional[Dict] = None,
+        tags: Optional[List[str]] = None,
+    ):
+        """
+        Set up the harness around an already-constructed codebase.
+
+        Args:
+            codebase: The Codebase object the harness will analyze
+            metadata: Optional metadata for the analysis; an empty dict is
+                used when it is omitted or empty
+            tags: Optional tags for the analysis; an empty list is used when
+                it is omitted or empty
+        """
+        self.analysis_results = {}
+        self.codebase = codebase
+        self.tags = tags if tags else []
+        self.metadata = metadata if metadata else {}
+
-            codebase: The Codebase object to analyze
-            metadata: Optional metadata to associate with the analysis
-            tags: Optional tags to categorize the analysis
-        """
-        self.codebase = codebase
-        self.metadata = metadata or {}
-        self.tags = tags or []
-        self.analysis_results = {}
+def __init__(
+    self,
+    codebase: Codebase,
+    metadata: Optional[Dict] = None,
+    tags: Optional[List[str]] = None,
+    max_retries: int = 3,
+    retry_delay: float = 1.0,
+):
+    """
+    Initialize the harness after validating the codebase's repository name.
+
+    Args:
+        codebase: The Codebase object to analyze; its ``repo_name`` must be
+            non-empty and contain a '/'
+        metadata: Optional metadata to associate with the analysis
+        tags: Optional tags to categorize the analysis
+        max_retries: Stored on the instance as ``max_retries``
+        retry_delay: Stored on the instance as ``retry_delay``
+
+    Raises:
+        ValueError: If ``codebase.repo_name`` is empty or contains no '/'
+    """
+    repo_name_ok = bool(codebase.repo_name) and '/' in codebase.repo_name
+    if not repo_name_ok:
+        raise ValueError("Invalid repository name format. Expected 'owner/repo'")
+
+    self.analysis_results = {}
+    self.retry_delay = retry_delay
+    self.max_retries = max_retries
+    self.tags = tags if tags else []
+    self.metadata = metadata if metadata else {}
+    self.codebase = codebase
-            codebase: The Codebase object to analyze
-            metadata: Optional metadata to associate with the analysis
-            tags: Optional tags to categorize the analysis
-        """
-        self.codebase = codebase
-        self.metadata = metadata or {}
-        self.tags = tags or []
-        self.analysis_results = {}
+def __init__(
+    self,
+    codebase: Codebase,
+    metadata: Optional[Dict] = None,
+    tags: Optional[List[str]] = None,
+    max_retries: int = 3,
+    retry_delay: float = 1.0,
+):
+    """
+    Initialize the harness after validating the codebase's repository name.
+
+    Args:
+        codebase: The Codebase object to analyze; its ``repo_name`` must be
+            non-empty and contain a '/'
+        metadata: Optional metadata to associate with the analysis
+        tags: Optional tags to categorize the analysis
+        max_retries: Stored on the instance as ``max_retries``
+        retry_delay: Stored on the instance as ``retry_delay``
+
+    Raises:
+        ValueError: If ``codebase.repo_name`` is empty or contains no '/'
+    """
+    repo_name_ok = bool(codebase.repo_name) and '/' in codebase.repo_name
+    if not repo_name_ok:
+        raise ValueError("Invalid repository name format. Expected 'owner/repo'")
+
+    self.analysis_results = {}
+    self.retry_delay = retry_delay
+    self.max_retries = max_retries
+    self.tags = tags if tags else []
+    self.metadata = metadata if metadata else {}
+    self.codebase = codebase
+    @classmethod
+    def from_repo(
+        cls,
+        repo_full_name: str,
+        commit: Optional[str] = None,
+        language: str = "python",
+        disable_file_parse: bool = False,
+    ) -> "CodebaseAnalysisHarness":
+        """
+        Build a harness for the given repository.
+
+        Args:
+            repo_full_name: Repository name in "owner/repo" form
+            commit: Optional commit hash passed through to ``Codebase.from_repo``
+            language: Language passed through to ``Codebase.from_repo``
+            disable_file_parse: Value given to ``CodebaseConfig`` as
+                ``disable_file_parse``
+
+        Returns:
+            A new CodebaseAnalysisHarness wrapping the loaded codebase
+        """
+        parse_config = CodebaseConfig(disable_file_parse=disable_file_parse)
+        return cls(
+            codebase=Codebase.from_repo(
+                repo_full_name=repo_full_name,
+                commit=commit,
+                language=language,
+                config=parse_config,
+            )
+        )
+
+    def analyze_codebase(self) -> Dict:
+        """
+        Collect summary statistics and the file structure of the codebase.
+
+        The result is also stored on ``self.analysis_results``.
+
+        Returns:
+            A dictionary with the repo name, language, file count, metadata,
+            tags and file structure
+        """
+        repo = self.codebase
+        logger.info(f"Analyzing codebase: {repo.repo_name}")
+
+        # Dict entries are evaluated top to bottom, so the file structure is
+        # still computed last, after the basic statistics.
+        summary = {
+            "repo_name": repo.repo_name,
+            "language": repo.language,
+            "file_count": len(repo.files),
+            "metadata": self.metadata,
+            "tags": self.tags,
+            "file_structure": self._get_file_structure(),
+        }
+
+        self.analysis_results = summary
+        return summary
+
+    def _get_file_structure(self) -> Dict:
+        """
+        Get the file structure of the codebase.
+
+        Returns:
+            A dictionary representing the file structure
+        """
+        structure = {}
+        for file_path in self.codebase.files:
+            parts = file_path.split("/")
+            current = structure
+            for i, part in enumerate(parts):
+                if i == len(parts) - 1:  # This is a file
+                    current.setdefault("files", []).append(part)
+                else:  # This is a directory
-        # Store the results
-        self.analysis_results = stats
-        return stats
-
-    def _get_file_structure(self) -> Dict:
-        """
-        Get the file structure of the codebase.
-
-        Returns:
-            A dictionary representing the file structure
-        """
-        structure = {}
-        for file_path in self.codebase.files:
-            parts = file_path.split("/")
-            current = structure
-            for i, part in enumerate(parts):
-                if i == len(parts) - 1:  # This is a file
-                    current.setdefault("files", []).append(part)
-                else:  # This is a directory
+def _get_file_structure(self) -> Dict:
+    """
+    Build a nested dictionary describing the codebase's file tree.
+
+    Each level may hold a 'dirs' mapping (subdirectory name to nested level)
+    and a 'files' list of file names. A path that fails to process is logged
+    and skipped.
+
+    Returns:
+        The nested file-structure dictionary
+
+    Raises:
+        ValueError: If the codebase has no files
+    """
+    if not self.codebase.files:
+        raise ValueError('No files found in codebase')
+
+    tree = {}
+    for file_path in self.codebase.files:
+        try:
+            # Everything before the last '/' is a directory chain; the last
+            # component is the file name.
+            *dir_names, file_name = file_path.split('/')
+            node = tree
+            for dir_name in dir_names:
+                node = node.setdefault('dirs', {}).setdefault(dir_name, {})
+            node.setdefault('files', []).append(file_name)
+        except Exception as e:
+            logger.error(f'Error processing file {file_path}: {str(e)}')
+    return tree
-        # Store the results
-        self.analysis_results = stats
-        return stats
-
-    def _get_file_structure(self) -> Dict:
-        """
-        Get the file structure of the codebase.
-
-        Returns:
-            A dictionary representing the file structure
-        """
-        structure = {}
-        for file_path in self.codebase.files:
-            parts = file_path.split("/")
-            current = structure
-            for i, part in enumerate(parts):
-                if i == len(parts) - 1:  # This is a file
-                    current.setdefault("files", []).append(part)
-                else:  # This is a directory
+def _get_file_structure(self) -> Dict:
+    """
+    Build a nested dictionary describing the codebase's file tree.
+
+    Each level may hold a 'dirs' mapping (subdirectory name to nested level)
+    and a 'files' list of file names. A path that fails to process is logged
+    and skipped.
+
+    Returns:
+        The nested file-structure dictionary
+
+    Raises:
+        ValueError: If the codebase has no files
+    """
+    if not self.codebase.files:
+        raise ValueError('No files found in codebase')
+
+    tree = {}
+    for file_path in self.codebase.files:
+        try:
+            # Everything before the last '/' is a directory chain; the last
+            # component is the file name.
+            *dir_names, file_name = file_path.split('/')
+            node = tree
+            for dir_name in dir_names:
+                node = node.setdefault('dirs', {}).setdefault(dir_name, {})
+            node.setdefault('files', []).append(file_name)
+        except Exception as e:
+            logger.error(f'Error processing file {file_path}: {str(e)}')
+    return tree
+                    current.setdefault("dirs", {}).setdefault(part, {})
+                    current = current["dirs"][part]
+        return structure
+
+    def diff_versus_commit(self, commit: str) -> str:
+        """
+        Return the diff of the current contents against ``commit``.
+
+        Args:
+            commit: The commit hash used as the base of the diff
+
+        Returns:
+            Whatever ``self.codebase.get_diff`` returns for that base
+        """
+        diff_text = self.codebase.get_diff(base=commit)
+        return diff_text
+
+    def files_in_patch(self, patch: str) -> List[str]:
+        """
+        Extract the list of modified files from a unified diff patch string.
+
+        Only "--- a/<path>" and "+++ b/<path>" header lines are considered.
+        Each path is reported once, in order of first appearance.
+
+        Args:
+            patch: The unified diff patch string
+
+        Returns:
+            A list of modified file paths
+        """
+        # A dict keeps first-seen order while making the duplicate check O(1)
+        # instead of scanning a list for every header line.
+        seen = {}
+        for raw_line in patch.split("\n"):
+            # A patch with CRLF line endings would otherwise leave a stray
+            # "\r" on the end of every file name.
+            line = raw_line.rstrip("\r")
+            if line.startswith(("--- a/", "+++ b/")):
+                seen[line.split("/", 1)[1]] = None
+        return list(seen)
+
+    def save_analysis_results(self, output_path: Union[str, Path]) -> None:
+        """
+        Write ``self.analysis_results`` to a JSON file.
+
+        Missing parent directories of the target are created first.
+
+        Args:
+            output_path: The path of the JSON file to write
+        """
+        destination = Path(output_path)
+        destination.parent.mkdir(parents=True, exist_ok=True)
+
+        with destination.open("w") as out_file:
+            json.dump(self.analysis_results, out_file, indent=2)
+
+        logger.info(f"Analysis results saved to {destination}")
+
diff --git a/codegen-on-oss/codegen_on_oss/cli.py b/codegen-on-oss/codegen_on_oss/cli.py
@@ -124,5 +124,46 @@ def run(
         parser.parse(repo_url, commit_hash)
 
 
+@cli.command()
+@click.option(
+    "--host",
+    type=str,
+    default="0.0.0.0",
+    help="Host to bind the server to",
+)
+@click.option(
+    "--port",
+    type=int,
+    default=8000,
+    help="Port to bind the server to",
+)
+@click.option(
+    "--debug",
+    is_flag=True,
+    help="Debug mode",
+)
+def serve(
+    host: str = "0.0.0.0",
+    port: int = 8000,
+    debug: bool = False,
+):
+    """
+    Start the Code Context Retrieval Server.
+
+    This server provides endpoints for codebase analysis, context management,
+    and agent execution.
+    """
+    logger.add(
+        sys.stdout,
+        format="{time: HH:mm:ss} {level} {message}",
+        level="DEBUG" if debug else "INFO",
+    )
+
+    from codegen_on_oss.context_server import start_server
+
+    logger.info(f"Starting Code Context Retrieval Server on {host}:{port}")
-@cli.command()
-@click.option(
-    "--host",
-    type=str,
-    default="0.0.0.0",
-    help="Host to bind the server to",
-)
-@click.option(
-    "--port",
-    type=int,
-    default=8000,
-    help="Port to bind the server to",
-)
-@click.option(
-    "--debug",
-    is_flag=True,
-    help="Debug mode",
-)
-def serve(
-    host: str = "0.0.0.0",
-    port: int = 8000,
-    debug: bool = False,
-):
-    """
-    Start the Code Context Retrieval Server.
-    
-    This server provides endpoints for codebase analysis, context management,
-    and agent execution.
-    """
-    logger.add(
-        sys.stdout,
-        format="{time: HH:mm:ss} {level} {message}",
-        level="DEBUG" if debug else "INFO",
-    )
-    
-    from codegen_on_oss.context_server import start_server
-    
-    logger.info(f"Starting Code Context Retrieval Server on {host}:{port}")
+from tqdm import tqdm
+from typing import Tuple
+
+def validate_server_params(host: str, port: int) -> Tuple[bool, str]:
+    """
+    Check that ``host`` parses as an IPv4 address and ``port`` is in range.
+
+    The host is checked first, so an invalid host is reported even when the
+    port is also invalid.
+
+    Args:
+        host: Host address to validate; must be accepted by ``socket.inet_aton``
+        port: Port number to validate; must be between 0 and 65535 inclusive
+
+    Returns:
+        A ``(valid, error)`` tuple; ``error`` is '' when ``valid`` is True
+    """
+    import socket
+
+    # Only the address parse can fail, so keep the try body to that one call
+    # and catch just what it raises (OSError for a malformed address,
+    # TypeError/ValueError for an unusable argument) rather than using a bare
+    # ``except`` that would also hide KeyboardInterrupt and real bugs.
+    try:
+        socket.inet_aton(host)
+    except (OSError, TypeError, ValueError):
+        return False, 'Invalid host address'
+
+    if not (0 <= port <= 65535):
+        return False, 'Port must be between 0 and 65535'
+    return True, ''
+
+@cli.command(help='Start the Code Context Retrieval Server')
+@click.option('--host', type=str, default='0.0.0.0',
+              help='Host address to bind the server to (default: 0.0.0.0)')
+@click.option('--port', type=int, default=8000,
+              help='Port number to bind the server to (default: 8000)')
+def serve(host: str, port: int):
+    """
+    Validate the bind address, then start the Code Context Retrieval Server.
+
+    Args:
+        host: Host address to bind the server to
+        port: Port number to bind the server to
+
+    Raises:
+        click.UsageError: If ``validate_server_params`` rejects host or port
+    """
+    valid, error = validate_server_params(host, port)
+    if not valid:
+        raise click.UsageError(error)
+
+    # Imported inside the command so that `start_server` is in scope here;
+    # without the import and the call below, the command would validate its
+    # arguments and return without starting anything.
+    from codegen_on_oss.context_server import start_server
+
+    start_server(host=host, port=port)
+
-@cli.command()
-@click.option(
-    "--host",
-    type=str,
-    default="0.0.0.0",
-    help="Host to bind the server to",
-)
-@click.option(
-    "--port",
-    type=int,
-    default=8000,
-    help="Port to bind the server to",
-)
-@click.option(
-    "--debug",
-    is_flag=True,
-    help="Debug mode",
-)
-def serve(
-    host: str = "0.0.0.0",
-    port: int = 8000,
-    debug: bool = False,
-):
-    """
-    Start the Code Context Retrieval Server.
-    
-    This server provides endpoints for codebase analysis, context management,
-    and agent execution.
-    """
-    logger.add(
-        sys.stdout,
-        format="{time: HH:mm:ss} {level} {message}",
-        level="DEBUG" if debug else "INFO",
-    )
-    
-    from codegen_on_oss.context_server import start_server
-    
-    logger.info(f"Starting Code Context Retrieval Server on {host}:{port}")
+from tqdm import tqdm
+from typing import Tuple
+
+def validate_server_params(host: str, port: int) -> Tuple[bool, str]:
+    """
+    Check that ``host`` parses as an IPv4 address and ``port`` is in range.
+
+    The host is checked first, so an invalid host is reported even when the
+    port is also invalid.
+
+    Args:
+        host: Host address to validate; must be accepted by ``socket.inet_aton``
+        port: Port number to validate; must be between 0 and 65535 inclusive
+
+    Returns:
+        A ``(valid, error)`` tuple; ``error`` is '' when ``valid`` is True
+    """
+    import socket
+
+    # Only the address parse can fail, so keep the try body to that one call
+    # and catch just what it raises (OSError for a malformed address,
+    # TypeError/ValueError for an unusable argument) rather than using a bare
+    # ``except`` that would also hide KeyboardInterrupt and real bugs.
+    try:
+        socket.inet_aton(host)
+    except (OSError, TypeError, ValueError):
+        return False, 'Invalid host address'
+
+    if not (0 <= port <= 65535):
+        return False, 'Port must be between 0 and 65535'
+    return True, ''
+
+@cli.command(help='Start the Code Context Retrieval Server')
+@click.option('--host', type=str, default='0.0.0.0',
+              help='Host address to bind the server to (default: 0.0.0.0)')
+@click.option('--port', type=int, default=8000,
+              help='Port number to bind the server to (default: 8000)')
+def serve(host: str, port: int):
+    """
+    Validate the bind address, then start the Code Context Retrieval Server.
+
+    Args:
+        host: Host address to bind the server to
+        port: Port number to bind the server to
+
+    Raises:
+        click.UsageError: If ``validate_server_params`` rejects host or port
+    """
+    valid, error = validate_server_params(host, port)
+    if not valid:
+        raise click.UsageError(error)
+
+    # Imported inside the command so that `start_server` is in scope here;
+    # nothing else in this block binds the name before it is called.
+    from codegen_on_oss.context_server import start_server
+
+    start_server(host=host, port=port)
+
+
 if __name__ == "__main__":
     cli()
diff --git a/codegen-on-oss/codegen_on_oss/context_server/__init__.py b/codegen-on-oss/codegen_on_oss/context_server/__init__.py
@@ -0,0 +1,9 @@
+"""Context server module for code context retrieval."""
+
+from codegen_on_oss.context_server.server import (
+    app,
+    start_server,
+)
+
+__all__ = ["app", "start_server"]
+