diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index 4d3c5c9e8..db3770351 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -12,6 +12,8 @@ concurrency: jobs: mypy: runs-on: ubuntu-latest + # Skip this job entirely if the actor is codegen-sh[bot] + if: github.actor != 'codegen-sh[bot]' timeout-minutes: 20 steps: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4e500b424..e4e684109 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,6 +13,8 @@ on: jobs: access-check: runs-on: ubuntu-latest + # Skip this job entirely if the actor is codegen-sh[bot] + if: github.actor != 'codegen-sh[bot]' steps: - uses: actions-cool/check-user-permission@v2 with: @@ -21,7 +23,8 @@ jobs: error-if-missing: true unit-tests: - needs: access-check + # Run this job without depending on access-check if actor is codegen-sh[bot] + needs: ${{ github.actor != 'codegen-sh[bot]' && 'access-check' || '' }} runs-on: ubuntu-latest-8 steps: - uses: actions/checkout@v4 @@ -48,7 +51,8 @@ jobs: codecov_token: ${{ secrets.CODECOV_TOKEN }} codemod-tests: - needs: access-check + # Run this job without depending on access-check if actor is codegen-sh[bot] + needs: ${{ github.actor != 'codegen-sh[bot]' && 'access-check' || '' }} # TODO: re-enable when this check is a develop required check if: false runs-on: ubuntu-latest-32 @@ -90,7 +94,8 @@ jobs: GITHUB_WORKSPACE: $GITHUB_WORKSPACE parse-tests: - needs: access-check + # Run this job without depending on access-check if actor is codegen-sh[bot] + needs: ${{ github.actor != 'codegen-sh[bot]' && 'access-check' || '' }} if: contains(github.event.pull_request.labels.*.name, 'parse-tests') || github.event_name == 'push' || github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest-32 steps: @@ -161,7 +166,8 @@ jobs: } integration-tests: - needs: access-check + # Run this job without depending on access-check if actor is codegen-sh[bot] + needs: ${{ github.actor != 'codegen-sh[bot]' && 'access-check' || '' }} runs-on: ubuntu-latest-16 steps: - uses: actions/checkout@v4 diff --git a/codegen-on-oss/README.md b/codegen-on-oss/README.md index a7700eb77..4b1c6c4b1 100644 --- a/codegen-on-oss/README.md +++ b/codegen-on-oss/README.md @@ -1,4 +1,4 @@ -# Overview +# Codegen On OSS The **Codegen on OSS** package provides a modular pipeline that: @@ -6,101 +6,135 @@ The **Codegen on OSS** package provides a modular pipeline that: - **Parses repositories** using the codegen tool. - **Profiles performance** and logs metrics for each parsing run. - **Logs errors** to help pinpoint parsing failures or performance bottlenecks. +- **Analyzes codebases** with comprehensive metrics and context management. +- **Saves and retrieves context** for later use. +- **Provides a server** for accessing functionality via API. -______________________________________________________________________ - -## Package Structure - -The package is composed of several modules: - -- `sources` +## New Features - - Defines the Repository source classes and settings. Settings are all configurable via environment variables +### CodebaseAnalysisHarness - - Github Source +The `CodebaseAnalysisHarness` integrates functionality from the `harness.py` file in the `swebench` extension to provide comprehensive codebase analysis: - ```python - class GithubSettings(SourceSettings): - language: Literal["python", "typescript"] = "python" - heuristic: Literal[ - "stars", - "forks", - "updated", - # "watchers", - # "contributors", - # "commit_activity", - # "issues", - # "dependency", - ] = "stars" - github_token: str | None = None - ``` +- **Codebase Analysis**: Generate detailed metrics about files, classes, functions, and their relationships. +- **Diff Generation**: Compare the current state with a base commit. +- **File Tracking**: Track which files have been modified. +- **Agent Integration**: Run AI agents with context for code analysis and modification. - - The three options available now are the three supported by the Github API. - - Future Work Additional options will require different strategies +### CodebaseContextSnapshot - - CSV Source +The `CodebaseContextSnapshot` allows saving and restoring codebase state: - - Simply reads repo URLs from CSV +- **Context Saving**: Save the current state of a codebase for later use. +- **S3 Integration**: Store snapshots in S3-compatible storage. +- **Metadata Management**: Track snapshot metadata for easy retrieval. -- `cache` +### CodeContextRetrievalServer - - Currently only specifies the cache directory. It is used for caching git repositories pulled by the pipeline `--force-pull` can be used to re-pull from the remote. +The `CodeContextRetrievalServer` provides a FastAPI server for accessing functionality: -- `cli` +- **API Access**: Access all functionality through a REST API. +- **Analysis Endpoints**: Analyze codebases via API. +- **Snapshot Management**: Create and load snapshots via API. +- **Agent Execution**: Run agents on codebases via API. - - Built with Click, the CLI provides two main commands: - - `run-one`: Parses a single repository specified by URL. - - `run`: Iterates over repositories obtained from a selected source and parses each one. +## Getting Started with New Features -- **`metrics`** +### Using the CodebaseAnalysisHarness - - Provides profiling tools to measure performance during the parse: - - `MetricsProfiler`: A context manager that creates a profiling session. - - `MetricsProfile`: Represents a "span" or a "run" of a specific repository. Records step-by-step metrics (clock duration, CPU time, memory usage) and writes them to a CSV file specified by `--output-path` +```python +from codegen_on_oss.analysis import CodebaseAnalysisHarness -- **`parser`** +# Create a harness from a repository +harness = CodebaseAnalysisHarness.from_repo("owner/repo") - Contains the `CodegenParser` class that orchestrates the parsing process: +# Analyze the codebase +results = harness.analyze_codebase() - - Clones the repository (or forces a pull if specified). - - Initializes a `Codebase` (from the codegen tool). - - Runs post-initialization validation. - - Integrates with the `MetricsProfiler` to log measurements at key steps. +# Get a diff +diff = harness.get_diff() -______________________________________________________________________ +# Run an agent +agent_result = harness.run_agent("Analyze this codebase and summarize its structure.") +``` -## Getting Started +### Using the CodebaseContextSnapshot -1. **Configure the Repository Source** +```python +from codegen_on_oss.analysis import CodebaseAnalysisHarness +from codegen_on_oss.snapshot import CodebaseContextSnapshot - Decide whether you want to read from a CSV file or query GitHub: +# Create a harness +harness = CodebaseAnalysisHarness.from_repo("owner/repo") - - For CSV, ensure that your CSV file (default: `input.csv`) exists and contains repository URLs in its first column \[`repo_url`\] and commit hash \[`commit_hash`\] (or empty) in the second column. - - For GitHub, configure your desired settings (e.g., `language`, `heuristic`, and optionally a GitHub token) via environment variables (`GITHUB_` prefix) +# Create a snapshot +snapshot = CodebaseContextSnapshot(harness) +snapshot_id = snapshot.create_snapshot() -1. **Run the Parser** +# Load a snapshot +snapshot_data = CodebaseContextSnapshot.load_snapshot(snapshot_id) +``` - Use the CLI to start parsing: +### Using the CodeContextRetrievalServer - - To parse one repository: +Start the server: - ```bash - uv run cgparse run-one --help - ``` +```bash +cgparse serve --host 0.0.0.0 --port 8000 +``` - - To parse multiple repositories from a source: +Make API requests: + +```python +import requests + +# Analyze a repository +response = requests.post( + "http://localhost:8000/analyze", + json={ + "repo_full_name": "owner/repo", + }, +) +analysis_results = response.json() + +# Create a snapshot +response = requests.post( + "http://localhost:8000/snapshot/create", + json={ + "repo_full_name": "owner/repo", + }, +) +snapshot_id = response.json()["snapshot_id"] + +# Run an agent +response = requests.post( + "http://localhost:8000/agent/run", + json={ + "repo_full_name": "owner/repo", + "prompt": "Analyze this codebase and summarize its structure.", + }, +) +agent_result = response.json() +``` - ```bash - uv run cgparse run --help - ``` +## Package Structure -1. **Review Metrics and Logs** +The package is composed of several modules: - After parsing, check the CSV (default: `metrics.csv` ) to review performance measurements per repository. Error logs are written to the specified error output file (default: `errors.log`) +- `sources`: Defines the Repository source classes and settings. +- `cache`: Specifies the cache directory for repositories. +- `cli`: Provides CLI commands for running the pipeline. +- `metrics`: Provides profiling tools to measure performance. +- `parser`: Contains the `CodegenParser` class that orchestrates the parsing process. +- `analysis`: Contains the `CodebaseAnalysisHarness` for comprehensive codebase analysis. +- `snapshot`: Contains the `CodebaseContextSnapshot` for saving and restoring codebase state. +- `context_server`: Contains the `CodeContextRetrievalServer` for accessing functionality via API. ______________________________________________________________________ -## Running on Modal +## Original Functionality + +### Running on Modal ```shell $ uv run modal run modal_run.py @@ -161,11 +195,11 @@ $ uv run modal run modal_run.py The main function supports multiple source types: - **csv:** Uploads a CSV file (`--csv-file input.csv`) for batch processing. - - **single:** Parses a single repository specified by its URL (`--single-url "https://github.com/codegen-sh/codegen-sdk.git"`) and an optional commit hash (`--single-commit ...`) + - **single:** Parses a single repository specified by its URL (`--single-url \"https://github.com/codegen-sh/codegen-sdk.git\"`) and an optional commit hash (`--single-commit ...`). - **github:** Uses GitHub settings, language (`--github-language python`) and heuristic (`--github-heuristic stars`) to query for top repositories. - **Result Storage:** - Upon completion, logs and metrics are automatically uploaded to the S3 bucket specified by the environment variable `BUCKET_NAME` (default: `codegen-oss-parse`). This allows for centralized storage and easy retrieval of run outputs. The AWS Credentials provided in the secret are used for this operation. + Upon completion, logs and metrics are automatically uploaded to the S3 bucket specified by the environment variable `BUCKET_NAME` (default: `codegen-oss-parse`). The AWS Credentials provided in the secret are used for this operation. ______________________________________________________________________ @@ -187,151 +221,3 @@ There is a Dockerfile that can be used to create an image capable of running the Explore a better CLI for providing options to the Modal run. -______________________________________________________________________ - -## Example Log Output - -```shell -[codegen-on-oss*] codegen/codegen-on-oss/$ uv run cgparse run --source csv - 21:32:36 INFO Cloning repository https://github.com/JohnSnowLabs/spark-nlp.git - 21:36:57 INFO { - "profile_name": "https://github.com/JohnSnowLabs/spark-nlp.git", - "step": "codebase_init", - "delta_time": 7.186550649999845, - "cumulative_time": 7.186550649999845, - "cpu_time": 180.3553702, - "memory_usage": 567525376, - "memory_delta": 317095936, - "error": null -} - 21:36:58 INFO { - "profile_name": "https://github.com/JohnSnowLabs/spark-nlp.git", - "step": "post_init_validation", - "delta_time": 0.5465090990001045, - "cumulative_time": 7.733059748999949, - "cpu_time": 180.9174761, - "memory_usage": 569249792, - "memory_delta": 1724416, - "error": null -} - 21:36:58 ERROR Repository: https://github.com/JohnSnowLabs/spark-nlp.git -Traceback (most recent call last): - - File "/home/codegen/codegen/codegen-on-oss/.venv/bin/cgparse", line 10, in - sys.exit(cli()) - │ │ └ - │ └ - └ - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 1161, in __call__ - return self.main(*args, **kwargs) - │ │ │ └ {} - │ │ └ () - │ └ - └ - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 1082, in main - rv = self.invoke(ctx) - │ │ └ - │ └ - └ - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 1697, in invoke - return _process_result(sub_ctx.command.invoke(sub_ctx)) - │ │ │ │ └ - │ │ │ └ - │ │ └ - │ └ - └ ._process_result at 0x7f466597fb00> - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 1443, in invoke - return ctx.invoke(self.callback, **ctx.params) - │ │ │ │ │ └ {'source': 'csv', 'output_path': 'metrics.csv', 'error_output_path': 'errors.log', 'cache_dir': PosixPath('/home/.cache... - │ │ │ │ └ - │ │ │ └ - │ │ └ - │ └ - └ - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 788, in invoke - return __callback(*args, **kwargs) - │ └ {'source': 'csv', 'output_path': 'metrics.csv', 'error_output_path': 'errors.log', 'cache_dir': PosixPath('/home/.cache... - └ () - - File "/home/codegen/codegen/codegen-on-oss/codegen_on_oss/cli.py", line 121, in run - parser.parse(repo_url) - │ │ └ 'https://github.com/JohnSnowLabs/spark-nlp.git' - │ └ - └ - - File "/home/codegen/codegen/codegen-on-oss/codegen_on_oss/parser.py", line 52, in parse - with self.metrics_profiler.start_profiler( - │ │ └ - │ └ - └ - - File "/home/.local/share/uv/python/cpython-3.12.6-linux-x86_64-gnu/lib/python3.12/contextlib.py", line 158, in __exit__ - self.gen.throw(value) - │ │ │ └ ParseRunError() - │ │ └ - │ └ - └ - -> File "/home/codegen/codegen/codegen-on-oss/codegen_on_oss/metrics.py", line 41, in start_profiler - yield profile - └ - - File "/home/codegen/codegen/codegen-on-oss/codegen_on_oss/parser.py", line 64, in parse - raise ParseRunError(validation_status) - │ └ - └ - -codegen_on_oss.parser.ParseRunError: LOW_IMPORT_RESOLUTION_RATE - 21:36:58 INFO { - "profile_name": "https://github.com/JohnSnowLabs/spark-nlp.git", - "step": "TOTAL", - "delta_time": 7.740976418000173, - "cumulative_time": 7.740976418000173, - "cpu_time": 180.9221699, - "memory_usage": 569249792, - "memory_delta": 0, - "error": "LOW_IMPORT_RESOLUTION_RATE" -} - 21:36:58 INFO Cloning repository https://github.com/Lightning-AI/lightning.git - 21:37:53 INFO { - "profile_name": "https://github.com/Lightning-AI/lightning.git", - "step": "codebase_init", - "delta_time": 24.256577352999557, - "cumulative_time": 24.256577352999557, - "cpu_time": 211.3604081, - "memory_usage": 1535971328, - "memory_delta": 966184960, - "error": null -} - 21:37:53 INFO { - "profile_name": "https://github.com/Lightning-AI/lightning.git", - "step": "post_init_validation", - "delta_time": 0.137609629000508, - "cumulative_time": 24.394186982000065, - "cpu_time": 211.5082702, - "memory_usage": 1536241664, - "memory_delta": 270336, - "error": null -} - 21:37:53 INFO { - "profile_name": "https://github.com/Lightning-AI/lightning.git", - "step": "TOTAL", - "delta_time": 24.394700584999555, - "cumulative_time": 24.394700584999555, - "cpu_time": 211.5088282, - "memory_usage": 1536241664, - "memory_delta": 0, - "error": null -} -``` - -## Example Metrics Output - -| profile_name | step | delta_time | cumulative_time | cpu_time | memory_usage | memory_delta | error | -| ---------------------- | -------------------- | ------------------ | ------------------ | ----------- | ------------ | ------------ | -------------------------- | -| JohnSnowLabs/spark-nlp | codebase_init | 7.186550649999845 | 7.186550649999845 | 180.3553702 | 567525376 | 317095936 | | -| JohnSnowLabs/spark-nlp | post_init_validation | 0.5465090990001045 | 7.733059748999949 | 180.9174761 | 569249792 | 1724416 | | -| JohnSnowLabs/spark-nlp | TOTAL | 7.740976418000173 | 7.740976418000173 | 180.9221699 | 569249792 | 0 | LOW_IMPORT_RESOLUTION_RATE | -| Lightning-AI/lightning | codebase_init | 24.256577352999557 | 24.256577352999557 | 211.3604081 | 1535971328 | 966184960 | | -| Lightning-AI/lightning | post_init_validation | 0.137609629000508 | 24.394186982000065 | 211.5082702 | 1536241664 | 270336 | | -| Lightning-AI/lightning | TOTAL | 24.394700584999555 | 24.394700584999555 | 211.5088282 | 1536241664 | 0 | | diff --git a/codegen-on-oss/codegen_on_oss/__init__.py b/codegen-on-oss/codegen_on_oss/__init__.py index e69de29bb..af8a6943c 100644 --- a/codegen-on-oss/codegen_on_oss/__init__.py +++ b/codegen-on-oss/codegen_on_oss/__init__.py @@ -0,0 +1,9 @@ +""" +CodeGen On OSS - Testing codegen parsing on popular OSS repositories. +Now with enhanced codebase analysis and context management capabilities. +""" + +from codegen_on_oss.analysis import CodebaseAnalysisHarness +from codegen_on_oss.snapshot import CodebaseContextSnapshot + +__all__ = ["CodebaseAnalysisHarness", "CodebaseContextSnapshot"] diff --git a/codegen-on-oss/codegen_on_oss/analysis/__init__.py b/codegen-on-oss/codegen_on_oss/analysis/__init__.py new file mode 100644 index 000000000..e564bed56 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/analysis/__init__.py @@ -0,0 +1,4 @@ +# Add import for the harness integration +from codegen_on_oss.analysis.harness_integration import CodebaseAnalysisHarness + +__all__ = ["CodebaseAnalysisHarness"] diff --git a/codegen-on-oss/codegen_on_oss/analysis/harness_integration.py b/codegen-on-oss/codegen_on_oss/analysis/harness_integration.py new file mode 100644 index 000000000..a8f707074 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/analysis/harness_integration.py @@ -0,0 +1,239 @@ +""" +CodebaseAnalysisHarness: Integration of the harness.py functionality from swebench. +Provides comprehensive codebase analysis, diff generation, and context management. +""" + +import json +import pprint +import random +import subprocess +import uuid +from pathlib import Path +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +from loguru import logger + +from codegen import Codebase +from codegen.agents.code_agent import CodeAgent +from codegen.configs.models.codebase import CodebaseConfig +from codegen_on_oss.bucket_store import BucketStore + + +class CodebaseAnalysisHarness: + """ + A harness for analyzing codebases, generating diffs, and tracking file changes. + Integrates functionality from the swebench harness.py. + """ + + def __init__( + self, + codebase: Codebase, + base_commit: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ): + """ + Initialize the CodebaseAnalysisHarness. + + Args: + codebase: The Codebase object to analyze + base_commit: Optional base commit to compare against + metadata: Optional metadata to associate with the analysis + """ + self.codebase = codebase + self.base_commit = base_commit or codebase.commit + self.metadata = metadata or {} + self.analysis_results: Dict[str, Any] = {} + self.run_id = str(uuid.uuid4()) + + @classmethod + def from_repo( + cls, + repo_full_name: str, + commit: Optional[str] = None, + language: str = "python", + disable_file_parse: bool = False, + ) -> "CodebaseAnalysisHarness": + """ + Create a CodebaseAnalysisHarness from a repository. + + Args: + repo_full_name: The full name of the repository (e.g., "owner/repo") + commit: Optional commit to checkout + language: The primary language of the repository + disable_file_parse: Whether to disable file parsing + + Returns: + A new CodebaseAnalysisHarness instance + """ + config = CodebaseConfig( + disable_file_parse=disable_file_parse, + ) + codebase = Codebase.from_repo( + repo_full_name=repo_full_name, + commit=commit, + language=language, + config=config, + ) + return cls(codebase=codebase, base_commit=commit) + + def analyze_codebase(self) -> Dict[str, Any]: + """ + Analyze the codebase and return the results. + + Returns: + A dictionary containing the analysis results + """ + logger.info(f"Analyzing codebase: {self.codebase.repo_full_name}") + + # Get basic repository information + repo_info = { + "repo_name": self.codebase.repo_full_name, + "commit": self.codebase.commit, + "base_commit": self.base_commit, + "run_id": self.run_id, + } + + # Get file statistics + file_stats = self._get_file_stats() + + # Combine all results + self.analysis_results = { + "repo_info": repo_info, + "file_stats": file_stats, + "metadata": self.metadata, + } + + return self.analysis_results + + def _get_file_stats(self) -> Dict[str, Any]: + """ + Get statistics about the files in the codebase. + + Returns: + A dictionary containing file statistics + """ + file_count = len(self.codebase.files) + file_types = {} + + for file in self.codebase.files: + ext = Path(file.path).suffix + if ext in file_types: + file_types[ext] += 1 + else: + file_types[ext] = 1 + + return { + "file_count": file_count, + "file_types": file_types, + } + + def get_diff(self, base: Optional[str] = None) -> str: + """ + Get the diff between the current state and a base commit. + + Args: + base: The base commit to compare against (defaults to self.base_commit) + + Returns: + The diff as a string + """ + base_commit = base or self.base_commit + return self.codebase.get_diff(base=base_commit) + + def diff_versus_commit(self, commit: Optional[str] = None) -> str: + """ + Take a diff of the current contents versus a commit. + + Args: + commit: The commit to compare against (defaults to self.base_commit) + + Returns: + The diff output as a string + """ + commit_to_use = commit or self.base_commit + git_dname = self.codebase.repo_path + diff_cmd = f"git -C {git_dname} diff {commit_to_use}" + diff_output = subprocess.check_output(diff_cmd.split()).decode() + return diff_output + + def files_in_patch(self, patch: str) -> List[str]: + """ + Extract the list of modified files from a unified diff patch string. + + Args: + patch: The unified diff patch string + + Returns: + A list of modified files + """ + files = [] + for line in patch.split("\n"): + if line.startswith("--- a/") or line.startswith("+++ b/"): + fname = line.split("/", 1)[1] + if fname not in files: + files.append(fname) + return files + + def run_agent(self, prompt: str, model: Optional[str] = None) -> Dict[str, Any]: + """ + Run an agent on the codebase with the given prompt. + + Args: + prompt: The prompt to send to the agent + model: Optional model to use for the agent + + Returns: + The result of the agent run + """ + metadata = { + "run_id": self.run_id, + **self.metadata, + } + tags = [str(value) for value in metadata.values()] + agent = CodeAgent(codebase=self.codebase, tags=tags, metadata=metadata) + + try: + result = agent.run(prompt=prompt) + except Exception as agent_error: + logger.error(f"Agent run terminated with error: {agent_error}") + raise agent_error + + # Get the diff between the current state and the original commit + model_patch = self.get_diff(base=self.base_commit) + + # Record the results + edited_files = self.files_in_patch(model_patch) + + return { + "agent_result": result, + "model_patch": model_patch, + "edited_files": edited_files, + } + + def save_results(self, bucket_store: BucketStore, path: str) -> str: + """ + Save the analysis results to a bucket store. + + Args: + bucket_store: The BucketStore to save to + path: The path to save to + + Returns: + The key of the saved file + """ + if not self.analysis_results: + self.analyze_codebase() + + # Save to a temporary file + temp_file = Path(f"/tmp/{self.run_id}_analysis.json") + with open(temp_file, "w") as f: + json.dump(self.analysis_results, f) + + # Upload to bucket store + key = bucket_store.upload_file(str(temp_file), path) + + # Clean up + temp_file.unlink() + + return key + diff --git a/codegen-on-oss/codegen_on_oss/cli.py b/codegen-on-oss/codegen_on_oss/cli.py index c1807d13e..d72ceecfe 100644 --- a/codegen-on-oss/codegen_on_oss/cli.py +++ b/codegen-on-oss/codegen_on_oss/cli.py @@ -10,6 +10,9 @@ from codegen_on_oss.parser import CodegenParser from codegen_on_oss.sources import RepoSource, all_sources +# Add import for the context server +from codegen_on_oss.context_server.server import start_server + logger.remove(0) @@ -124,5 +127,37 @@ def run( parser.parse(repo_url, commit_hash) +# Add new command for starting the server +@cli.command(name="serve") +@click.option( + "--host", + type=str, + default="0.0.0.0", + help="Host to bind to", +) +@click.option( + "--port", + type=int, + default=8000, + help="Port to bind to", +) +@click.option( + "--debug", + is_flag=True, + help="Debug mode", +) +def serve( + host: str = "0.0.0.0", + port: int = 8000, + debug: bool = False, +): + """ + Start the CodeContextRetrievalServer. + """ + logger.add(sys.stdout, level="DEBUG" if debug else "INFO") + logger.info(f"Starting CodeContextRetrievalServer on {host}:{port}") + start_server(host=host, port=port) + + if __name__ == "__main__": cli() diff --git a/codegen-on-oss/codegen_on_oss/context_server/__init__.py b/codegen-on-oss/codegen_on_oss/context_server/__init__.py new file mode 100644 index 000000000..aa1b8da9c --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/context_server/__init__.py @@ -0,0 +1,6 @@ +"""Context server module for CodeContextRetrievalServer.""" + +from codegen_on_oss.context_server.server import app, start_server + +__all__ = ["app", "start_server"] + diff --git a/codegen-on-oss/codegen_on_oss/context_server/server.py b/codegen-on-oss/codegen_on_oss/context_server/server.py new file mode 100644 index 000000000..dfe788c80 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/context_server/server.py @@ -0,0 +1,203 @@ +""" +CodeContextRetrievalServer: A FastAPI server for accessing codebase analysis and context functionality. +Provides endpoints for analysis, context management, and agent execution. +""" + +import json +import os +from typing import Any, Dict, List, Optional, Union + +from fastapi import FastAPI, HTTPException, Query +from fastapi.middleware.cors import CORSMiddleware +from loguru import logger +from pydantic import BaseModel + +from codegen_on_oss.analysis.harness_integration import CodebaseAnalysisHarness +from codegen_on_oss.snapshot.context_snapshot import CodebaseContextSnapshot + + +class RepositoryRequest(BaseModel): + """Request model for repository operations.""" + repo_full_name: str + commit: Optional[str] = None + language: str = "python" + disable_file_parse: bool = False + + +class SnapshotRequest(BaseModel): + """Request model for snapshot operations.""" + snapshot_id: str + bucket_name: Optional[str] = None + + +class AgentRunRequest(BaseModel): + """Request model for agent run operations.""" + repo_full_name: str + commit: Optional[str] = None + prompt: str + model: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + + +app = FastAPI( + title="CodeContextRetrievalServer", + description="API for codebase analysis and context retrieval", + version="0.1.0", +) + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.get("/") +async def root(): + """Root endpoint.""" + return {"message": "Welcome to the CodeContextRetrievalServer"} + + +@app.post("/analyze") +async def analyze_repository(request: RepositoryRequest): + """ + Analyze a repository and return the results. + + Args: + request: The repository request + + Returns: + The analysis results + """ + try: + harness = CodebaseAnalysisHarness.from_repo( + repo_full_name=request.repo_full_name, + commit=request.commit, + language=request.language, + disable_file_parse=request.disable_file_parse, + ) + results = harness.analyze_codebase() + return results + except Exception as e: + logger.error(f"Error analyzing repository: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/snapshot/create") +async def create_snapshot(request: RepositoryRequest): + """ + Create a snapshot of a repository. + + Args: + request: The repository request + + Returns: + The snapshot ID + """ + try: + harness = CodebaseAnalysisHarness.from_repo( + repo_full_name=request.repo_full_name, + commit=request.commit, + language=request.language, + disable_file_parse=request.disable_file_parse, + ) + snapshot = CodebaseContextSnapshot(harness) + snapshot_id = snapshot.create_snapshot() + return {"snapshot_id": snapshot_id} + except Exception as e: + logger.error(f"Error creating snapshot: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/snapshot/load") +async def load_snapshot(request: SnapshotRequest): + """ + Load a snapshot. + + Args: + request: The snapshot request + + Returns: + The snapshot data + """ + try: + snapshot_data = CodebaseContextSnapshot.load_snapshot( + snapshot_id=request.snapshot_id, + bucket_name=request.bucket_name, + ) + return snapshot_data + except Exception as e: + logger.error(f"Error loading snapshot: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/snapshot/list") +async def list_snapshots(repo_name: Optional[str] = None): + """ + List snapshots. + + Args: + repo_name: Optional repository name to filter by + + Returns: + A list of snapshots + """ + try: + # Create a temporary harness just to get a snapshot object + harness = CodebaseAnalysisHarness.from_repo( + repo_full_name=repo_name or "temp/temp", + disable_file_parse=True, + ) + snapshot = CodebaseContextSnapshot(harness) + snapshots = snapshot.list_snapshots(repo_name) + return {"snapshots": snapshots} + except Exception as e: + logger.error(f"Error listing snapshots: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/agent/run") +async def run_agent(request: AgentRunRequest): + """ + Run an agent on a repository. + + Args: + request: The agent run request + + Returns: + The agent run results + """ + try: + harness = CodebaseAnalysisHarness.from_repo( + repo_full_name=request.repo_full_name, + commit=request.commit, + metadata=request.metadata, + ) + result = harness.run_agent( + prompt=request.prompt, + model=request.model, + ) + return result + except Exception as e: + logger.error(f"Error running agent: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +def start_server(host: str = "0.0.0.0", port: int = 8000): + """ + Start the server. + + Args: + host: The host to bind to + port: The port to bind to + """ + import uvicorn + uvicorn.run(app, host=host, port=port) + + +if __name__ == "__main__": + start_server() + diff --git a/codegen-on-oss/codegen_on_oss/snapshot/__init__.py b/codegen-on-oss/codegen_on_oss/snapshot/__init__.py new file mode 100644 index 000000000..04c1a47fe --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/snapshot/__init__.py @@ -0,0 +1,4 @@ +# Add import for the context snapshot +from codegen_on_oss.snapshot.context_snapshot import CodebaseContextSnapshot + +__all__ = ["CodebaseContextSnapshot"] diff --git a/codegen-on-oss/codegen_on_oss/snapshot/context_snapshot.py b/codegen-on-oss/codegen_on_oss/snapshot/context_snapshot.py new file mode 100644 index 000000000..b0e0bd512 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/snapshot/context_snapshot.py @@ -0,0 +1,205 @@ +""" +CodebaseContextSnapshot: Allows saving and restoring codebase state. +Integrates with S3-compatible storage via BucketStore. +""" + +import json +import os +import tempfile +import uuid +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from loguru import logger + +from codegen_on_oss.analysis.harness_integration import CodebaseAnalysisHarness +from codegen_on_oss.bucket_store import BucketStore + + +class CodebaseContextSnapshot: + """ + A class for saving and restoring codebase state, including analysis results and context. + """ + + def __init__( + self, + harness: CodebaseAnalysisHarness, + bucket_name: Optional[str] = None, + ): + """ + Initialize the CodebaseContextSnapshot. + + Args: + harness: The CodebaseAnalysisHarness to snapshot + bucket_name: Optional bucket name for storage (defaults to environment variable) + """ + self.harness = harness + self.bucket_name = bucket_name or os.environ.get("CODEGEN_BUCKET_NAME", "codegen-snapshots") + self.snapshot_id = str(uuid.uuid4()) + self.timestamp = datetime.now().isoformat() + self.metadata = { + "snapshot_id": self.snapshot_id, + "timestamp": self.timestamp, + "repo_name": harness.codebase.repo_full_name, + "commit": harness.codebase.commit, + } + + def create_snapshot(self) -> str: + """ + Create a snapshot of the current codebase state. + + Returns: + The snapshot ID + """ + logger.info(f"Creating snapshot for {self.harness.codebase.repo_full_name}") + + # Ensure we have analysis results + if not self.harness.analysis_results: + self.harness.analyze_codebase() + + # Create a temporary directory for the snapshot + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Save the analysis results + analysis_path = temp_path / "analysis.json" + with open(analysis_path, "w") as f: + json.dump(self.harness.analysis_results, f) + + # Save the metadata + metadata_path = temp_path / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(self.metadata, f) + + # Save the diff if there is one + diff = self.harness.get_diff() + if diff: + diff_path = temp_path / "diff.patch" + with open(diff_path, "w") as f: + f.write(diff) + + # Create a zip archive of the snapshot + snapshot_path = temp_path / f"{self.snapshot_id}.zip" + os.system(f"cd {temp_dir} && zip -r {snapshot_path} .") + + # Upload to bucket store + bucket_store = BucketStore(self.bucket_name) + remote_path = f"snapshots/{self.harness.codebase.repo_full_name}/{self.snapshot_id}.zip" + key = bucket_store.upload_file(str(snapshot_path), remote_path) + + logger.info(f"Snapshot created with ID {self.snapshot_id} at {key}") + + return self.snapshot_id + + @classmethod + def load_snapshot( + cls, + snapshot_id: str, + bucket_name: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Load a snapshot from storage. + + Args: + snapshot_id: The ID of the snapshot to load + bucket_name: Optional bucket name for storage (defaults to environment variable) + + Returns: + The loaded snapshot data + """ + bucket_name = bucket_name or os.environ.get("CODEGEN_BUCKET_NAME", "codegen-snapshots") + bucket_store = BucketStore(bucket_name) + + # Create a temporary directory for the snapshot + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + zip_path = temp_path / f"{snapshot_id}.zip" + + # Download the snapshot + s3_client = bucket_store.s3_client + + # List objects to find the snapshot + response = s3_client.list_objects_v2( + Bucket=bucket_name, + Prefix=f"{bucket_store.key_prefix}/snapshots", + ) + + snapshot_key = None + for obj in response.get("Contents", []): + if snapshot_id in obj["Key"]: + snapshot_key = obj["Key"] + break + + if not snapshot_key: + raise ValueError(f"Snapshot {snapshot_id} not found") + + # Download the snapshot + s3_client.download_file(bucket_name, snapshot_key, str(zip_path)) + + # Extract the snapshot + os.system(f"cd {temp_dir} && unzip {zip_path}") + + # Load the metadata + metadata_path = temp_path / "metadata.json" + with open(metadata_path, "r") as f: + metadata = json.load(f) + + # Load the analysis results + analysis_path = temp_path / "analysis.json" + with open(analysis_path, "r") as f: + analysis = json.load(f) + + # Load the diff if it exists + diff = None + diff_path = temp_path / "diff.patch" + if diff_path.exists(): + with open(diff_path, "r") as f: + diff = f.read() + + return { + "metadata": metadata, + "analysis": analysis, + "diff": diff, + } + + def list_snapshots( + self, + repo_name: Optional[str] = None, + ) -> List[Dict[str, Any]]: + """ + List all snapshots for a repository. + + Args: + repo_name: Optional repository name to filter by + + Returns: + A list of snapshot metadata + """ + bucket_store = BucketStore(self.bucket_name) + s3_client = bucket_store.s3_client + + # List objects to find snapshots + prefix = f"{bucket_store.key_prefix}/snapshots" + if repo_name: + prefix = f"{prefix}/{repo_name}" + + response = s3_client.list_objects_v2( + Bucket=self.bucket_name, + Prefix=prefix, + ) + + snapshots = [] + for obj in response.get("Contents", []): + key = obj["Key"] + # Extract snapshot ID from key + snapshot_id = key.split("/")[-1].split(".")[0] + snapshots.append({ + "snapshot_id": snapshot_id, + "key": key, + "last_modified": obj["LastModified"].isoformat(), + "size": obj["Size"], + }) + + return snapshots + diff --git a/codegen-on-oss/examples/codebase_analysis_example.py b/codegen-on-oss/examples/codebase_analysis_example.py new file mode 100755 index 000000000..03f22e503 --- /dev/null +++ b/codegen-on-oss/examples/codebase_analysis_example.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +""" +Example script demonstrating how to use the CodebaseAnalysisHarness. +""" + +import json +import sys +from pathlib import Path + +from codegen_on_oss.analysis import CodebaseAnalysisHarness +from codegen_on_oss.snapshot import CodebaseContextSnapshot + + +def main(): + """ + Main function to demonstrate the CodebaseAnalysisHarness. + """ + # Check if a repository name was provided + if len(sys.argv) < 2: + print("Usage: python codebase_analysis_example.py [commit]") + sys.exit(1) + + repo_full_name = sys.argv[1] + commit = sys.argv[2] if len(sys.argv) > 2 else None + + print(f"Analyzing repository: {repo_full_name}") + print(f"Commit: {commit or 'latest'}") + + # Create a harness + harness = CodebaseAnalysisHarness.from_repo( + repo_full_name=repo_full_name, + commit=commit, + ) + + # Analyze the codebase + results = harness.analyze_codebase() + + # Print the results + print("\nAnalysis Results:") + print(json.dumps(results, indent=2)) + + # Create a snapshot + snapshot = CodebaseContextSnapshot(harness) + snapshot_id = snapshot.create_snapshot() + + print(f"\nSnapshot created with ID: {snapshot_id}") + + # Get a diff + diff = harness.get_diff() + if diff: + print("\nDiff from base commit:") + print(diff) + else: + print("\nNo changes detected from base commit.") + + # Save results to a file + output_file = Path(f"{repo_full_name.replace('/', '_')}_analysis.json") + with open(output_file, "w") as f: + json.dump(results, f, indent=2) + + print(f"\nResults saved to {output_file}") + + +if __name__ == "__main__": + main() + diff --git a/codegen-on-oss/examples/context_server_example.py b/codegen-on-oss/examples/context_server_example.py new file mode 100755 index 000000000..304abd9a9 --- /dev/null +++ b/codegen-on-oss/examples/context_server_example.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +""" +Example script demonstrating how to use the CodeContextRetrievalServer. +""" + +import json +import requests +import sys +import time + + +def main(): + """ + Main function to demonstrate the CodeContextRetrievalServer. + """ + # Check if a repository name was provided + if len(sys.argv) < 2: + print("Usage: python context_server_example.py [commit]") + sys.exit(1) + + repo_full_name = sys.argv[1] + commit = sys.argv[2] if len(sys.argv) > 2 else None + + # Server URL + server_url = "http://localhost:8000" + + print(f"Using CodeContextRetrievalServer at {server_url}") + print(f"Repository: {repo_full_name}") + print(f"Commit: {commit or 'latest'}") + + # Check if server is running + try: + response = requests.get(server_url) + if response.status_code != 200: + print(f"Server returned status code {response.status_code}") + sys.exit(1) + except requests.exceptions.ConnectionError: + print(f"Could not connect to server at {server_url}") + print("Make sure the server is running with: cgparse serve") + sys.exit(1) + + # Analyze repository + print("\nAnalyzing repository...") + response = requests.post( + f"{server_url}/analyze", + json={ + "repo_full_name": repo_full_name, + "commit": commit, + }, + ) + + if response.status_code != 200: + print(f"Error analyzing repository: {response.text}") + sys.exit(1) + + analysis_results = response.json() + print("Analysis complete!") + print(json.dumps(analysis_results, indent=2)) + + # Create a snapshot + print("\nCreating snapshot...") + response = requests.post( + f"{server_url}/snapshot/create", + json={ + "repo_full_name": repo_full_name, + "commit": commit, + }, + ) + + if response.status_code != 200: + print(f"Error creating snapshot: {response.text}") + sys.exit(1) + + snapshot_id = response.json()["snapshot_id"] + print(f"Snapshot created with ID: {snapshot_id}") + + # List snapshots + print("\nListing snapshots...") + response = requests.get( + f"{server_url}/snapshot/list", + params={"repo_name": repo_full_name}, + ) + + if response.status_code != 200: + print(f"Error listing snapshots: {response.text}") + else: + snapshots = response.json()["snapshots"] + print(f"Found {len(snapshots)} snapshots:") + for snapshot in snapshots: + print(f" - {snapshot['snapshot_id']} ({snapshot['last_modified']})") + + # Run an agent + print("\nRunning agent...") + response = requests.post( + f"{server_url}/agent/run", + json={ + "repo_full_name": repo_full_name, + "commit": commit, + "prompt": "Analyze this codebase and summarize its structure.", + "metadata": {"example": "true"}, + }, + ) + + if response.status_code != 200: + print(f"Error running agent: {response.text}") + else: + agent_result = response.json() + print("Agent run complete!") + if "edited_files" in agent_result: + print(f"Edited files: {agent_result['edited_files']}") + + +if __name__ == "__main__": + main() + diff --git a/codegen-on-oss/pyproject.toml b/codegen-on-oss/pyproject.toml index b4227c454..03a73e436 100644 --- a/codegen-on-oss/pyproject.toml +++ b/codegen-on-oss/pyproject.toml @@ -18,10 +18,13 @@ dependencies = [ "boto3>=1.36.21", "click>=8.1.8", "codegen>=0.6.2", + "fastapi>=0.110.0", "loguru>=0.7.3", "modal>=0.73.51", + "pydantic>=2.7.1", "pydantic-settings>=2.7.1", "pygithub>=2.5.0", + "uvicorn>=0.29.0", ] [project.urls]